In [1]:
import pandas as pd
import numpy as np

In [None]:

class Team:
    def __init__(self, name):
        self.name = name
        self.goals_for = 0
        self.goals_against = 0
        self.goal_difference = 0
        self.points = 0
        self.position = 0
        self.wins = 0
        self.draws = 0
        self.losses = 0
        


def update_table(team_1, team_1_score, team_2, team_2_score):
    team1 = next(team for team in team_objects if team.name == team_1)
    team2 = next(team for team in team_objects if team.name == team_2)

    scores = [team_1_score, team_2_score]
    new_teams = [team1, team2]

    for i, team in enumerate(new_teams):
        team.goals_for += scores[i]
        team.goals_against += scores[1 - i]
        team.goal_difference = team.goals_for - team.goals_against
        team.points += 3 if scores[i] > scores[1 - i] else 1 if scores[i] == scores[1 - i] else 0
        team.wins += 1 if scores[i] > scores[1 - i] else 0
        team.draws += 1 if scores[i] == scores[1 - i] else 0
        team.losses += 1 if scores[i] < scores[1 - i] else 0

    table = sorted(team_objects, key=lambda x: (-x.points, x.name.lower()))

    for i, team in enumerate(table):
        team.position = i + 1

    return table


def get_record(table_objects):
    table_dict = dict()
    for team in table_objects:
        table_dict[team.name] = (
            team.goals_for,
            team.goals_against,
            team.goal_difference,
            team.points,
            team.position,
            team.wins,
            team.draws,
            team.losses
        )

    return table_dict


def add_features(data):
    df = data.copy()

    col_list = []

    for i, row in df.iterrows():
        ht, hcs, at, acs = row["HT"], row["HCS"], row["AT"], row["ACS"]

        table = update_table(ht, hcs, at, acs)

        if row["week"] > 1:
            col_list.append(table_dict[ht] + table_dict[at])
        else:
            col_list.append((0,) * 10)

        if (i+1) % 10 == 0:
            table_dict = get_record(table)

    # Create a DataFrame from the list of tuples
    new_df = pd.DataFrame(col_list, columns=[
        "h_gf", "h_ga", "h_gd", "h_pts", "h_pos", "h_wins", "h_draws", "H_losses",
        "a_gf", "a_ga", "a_gd", "a_pts", "a_pos", "a_wins", "a_draws", "a_losses"])

    # Concatenate the new DataFrame with the existing DataFrame
    df = pd.concat([df, new_df], axis=1)

    return df


In [None]:

class Team:
    def __init__(self, name):
        self.name = name
        self.goals_for = 0
        self.goals_against = 0
        self.goal_difference = 0
        self.points = 0
        self.position = 0
        self.last_results = []  # List to store last results
        self.last_1_result = 0
        self.last_2_result = 0
        self.matches_played = 0
        self.wins = 0
        self.losses = 0
        self.draws = 0
        self.win_lose_ratio = 0
        self.pts_per_game = 0

def update_table(team_1, team_1_score, team_2, team_2_score):
    team1 = next(team for team in team_objects if team.name == team_1)
    team2 = next(team for team in team_objects if team.name == team_2)

    scores = [team_1_score, team_2_score]
    new_teams = [team1, team2]

    for i, team in enumerate(new_teams):
        team.goals_for += scores[i]
        team.goals_against += scores[1 - i]
        team.goal_difference = team.goals_for - team.goals_against
        team.points += 3 if scores[i] > scores[1 - i] else 1 if scores[i] == scores[1 - i] else 0
        team.last_results.append(1 if scores[i] > scores[1 - i] else 0 if scores[i] == scores[1 - i] else -1)
        team.last_1_result = team.last_results[-1]
        team.last_2_result = team.last_results[-2] if len(team.last_results) > 1 else 0
        team.matches_played += 1 
        team.wins += 1 if scores[i] > scores[1 - i] else 0
        team.losses += 1 if scores[i] < scores[1 - i] else 0
        team.draws += 1 if scores[i] == scores[1 - i] else 0
        team.win_lose_ratio = team.wins / team.matches_played if team.matches_played > 0 else 0.0
        team.pts_per_game = team.points / team.matches_played if team.matches_played > 0 else 0.0

    table = sorted(team_objects, key=lambda x: (-x.points, x.name.lower()))

    for i, team in enumerate(table):
        team.position = i + 1

    return table


def get_record(table_objects):
    table_dict = dict()
    for team in table_objects:
        table_dict[team.name] = (
            team.goals_for,
            team.goals_against,
            team.goal_difference,
            team.points,
            team.position,
            team.last_1_result,
            team.last_2_result,
            team.wins,
            team.losses,
            team.draws,
            team.win_lose_ratio,
            team.pts_per_game
        )

    return table_dict


def add_features(data):
    df = data.copy()

    col_list = []

    for i, row in df.iterrows():
        ht, hcs, at, acs = row["HT"], row["HCS"], row["AT"], row["ACS"]

        table = update_table(ht, hcs, at, acs)

        if row["week"] > 2:
            col_list.append(table_dict[ht] + table_dict[at])
        else:
            col_list.append((0,) * 24)

        if (i+1) % 20 == 0:
            table_dict = get_record(table)

    # Create a DataFrame from the list of tuples
    new_df = pd.DataFrame(col_list, columns=[
        "h_gf", "h_ga", "h_gd", "h_pts", "h_pos", "HL1R", "HL2R", "H_win", "H_loss", "H_draws", "H_wlr", "H_ppg", 
        "a_gf", "a_ga", "a_gd", "a_pts", "a_pos", "AL1R", "AL2R", "A_win", "A_loss", "A_draws", "A_wlr", "A_ppg"])

    # Concatenate the new DataFrame with the existing DataFrame
    df = pd.concat([df, new_df], axis=1)

    return df


In [None]:

class Team:
    def __init__(self, name):
        self.name = name
        self.goals_for = 0
        self.goals_against = 0
        self.goal_difference = 0
        self.points = 0
        self.position = 0
        self.last_results = []  # List to store last results
        self.last_1_result = 0
        self.last_2_result = 0
        self.matches_played = 0
        self.wins = 0
        self.losses = 0
        self.win_lose_ratio = 0
        self.pts_per_game = 0

def update_table(team_1, team_1_score, team_2, team_2_score):
    team1 = next(team for team in team_objects if team.name == team_1)
    team2 = next(team for team in team_objects if team.name == team_2)

    scores = [team_1_score, team_2_score]
    new_teams = [team1, team2]

    for i, team in enumerate(new_teams):
        team.goals_for += scores[i]
        team.goals_against += scores[1 - i]
        team.goal_difference = team.goals_for - team.goals_against
        team.points += 3 if scores[i] > scores[1 - i] else 1 if scores[i] == scores[1 - i] else 0
        team.last_results.append(1 if scores[i] > scores[1 - i] else 0 if scores[i] == scores[1 - i] else -1)
        team.last_1_result = team.last_results[-1]
        team.last_2_result = team.last_results[-2] if len(team.last_results) > 1 else 0
        team.matches_played += 1 
        team.wins += 1 if scores[i] > scores[1 - i] else 0
        team.losses += 1 if scores[i] < scores[1 - i] else 0
        team.win_lose_ratio = team.wins / team.matches_played if team.matches_played > 0 else 0.0
        team.pts_per_game = team.points / team.matches_played if team.matches_played > 0 else 0.0

    table = sorted(team_objects, key=lambda x: (-x.points, x.name.lower()))

    for i, team in enumerate(table):
        team.position = i + 1

    return table


def get_record(table_objects):
    table_dict = dict()
    for team in table_objects:
        table_dict[team.name] = (
            team.goals_for,
            team.goals_against,
            team.goal_difference,
            team.points,
            team.position,
            team.last_1_result,
            team.last_2_result,
            team.wins,
            team.losses,
            team.win_lose_ratio,
            team.pts_per_game
        )

    return table_dict


def add_features(data):
    df = data.copy()

    col_list = []

    for i, row in df.iterrows():
        ht, hcs, at, acs = row["HT"], row["HCS"], row["AT"], row["ACS"]

        table = update_table(ht, hcs, at, acs)

        if row["week"] > 2:
            col_list.append(table_dict[ht] + table_dict[at])
        else:
            col_list.append((0,) * 22)

        if (i+1) % 20 == 0:
            table_dict = get_record(table)

    # Create a DataFrame from the list of tuples
    new_df = pd.DataFrame(col_list, columns=[
        "h_gf", "h_ga", "h_gd", "h_pts", "h_pos", "HL1R", "HL2R", "H_win", "H_loss", "H_wlr", "H_ppg", 
        "a_gf", "a_ga", "a_gd", "a_pts", "a_pos", "AL1R", "AL2R", "A_win", "A_loss", "A_wlr", "A_ppg"])

    # Concatenate the new DataFrame with the existing DataFrame
    df = pd.concat([df, new_df], axis=1)

    return df


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load your dataset into a pandas DataFrame
dataset = processed_df.copy()

# Load your dataset into X and y arrays
X = dataset.drop("result", axis=1)
y = dataset["result"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a dictionary to store the models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Gradient Boost': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    # 'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

# Calculate the F1 scores for each model
model_scores = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred), 3)
    f1 = round(f1_score(y_test, y_pred, average='weighted'), 3)
    model_scores.append((model_name, f1))
    print(f'{model_name}:')
    print('Accuracy:', accuracy)
    print('F1 score:', f1)
    print('-' * 40)

# Sort the models based on F1 scores in descending order
model_scores.sort(key=lambda x: x[1], reverse=True)
model_names, f1_scores = zip(*model_scores)

# Plot the horizontal bar graph
plt.figure(figsize=(12, 7))
plt.barh(model_names, f1_scores)  # <-- Use plt.barh() for horizontal bars
plt.ylabel('Model')
plt.xlabel('F1 Score')
plt.title('F1 Scores of Different Models')

# Add the scores beside each bar
for i, score in enumerate(f1_scores):
    plt.text(score, i, f'{score:.3f}', ha='left', va='center')  # <-- Adjust text position

# Display the horizontal bar graph
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load your dataset into a pandas DataFrame
dataset = processed_df.copy()

# Load your dataset into X and y arrays
X = dataset.drop("result", axis=1)
y = dataset["result"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# Calculate the accuracy and F1 score of the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('F1 score:', f1)


In [None]:
# Random Forest:
# Accuracy: 0.502
# F1 score: 0.48
# ----------------------------------------
# SVC:
# Accuracy: 0.472
# F1 score: 0.402
# ----------------------------------------
# XGBoost:
# Accuracy: 0.493
# F1 score: 0.48
# ----------------------------------------
# Gradient Boost:
# Accuracy: 0.487
# F1 score: 0.431
# ----------------------------------------
# Logistic Regression:
# Accuracy: 0.476
# F1 score: 0.406
# ----------------------------------------


# Random Forest:
# Accuracy: 0.486
# F1 score: 0.469
# ----------------------------------------
# SVC:
# Accuracy: 0.47
# F1 score: 0.388
# ----------------------------------------
# XGBoost:
# Accuracy: 0.472
# F1 score: 0.456
# ----------------------------------------
# Gradient Boost:
# Accuracy: 0.481
# F1 score: 0.416
# ----------------------------------------
# Logistic Regression:
# Accuracy: 0.468
# F1 score: 0.391
# ----------------------------------------


# Random Forest:
# Accuracy: 0.486
# F1 score: 0.464
# ----------------------------------------
# SVC:
# Accuracy: 0.472
# F1 score: 0.388
# ----------------------------------------
# XGBoost:
# Accuracy: 0.47
# F1 score: 0.453
# ----------------------------------------
# Gradient Boost:
# Accuracy: 0.485
# F1 score: 0.424
# ----------------------------------------
# Logistic Regression:
# Accuracy: 0.464
# F1 score: 0.383

In [5]:

def txt_reader(_path):
    file = open(_path, 'r')  # Open the file in read mode
    contents = file.read()  # Read the contents of the file
    file.close()  # Close the file

    return contents

def table_creator(contents):
    
    # data_table = pd.DataFrame(columns=["league_id", "week", "hour", "minute", "HT", "AT", "HCS", "ACS"])

    record_list = []
    weeks = contents.split("WEEK")
    pos = weeks[0].index("League")
    league_no = int(weeks[0][(pos + 7): (pos + 11)])
    for val in weeks[1:]:
        scores = val.split("\n")[1:-1]
        for score in scores:
            temp_dict = {
                "league_id": league_no,
                "week": int((val[:3]).strip()),
                "hour": int(val.split("\n")[0][-8:-6]),
                "minute": int(val.split("\n")[0][-5:-3]),

                "HT": score[0:3],
                "AT": score[8:11],
                "HCS": int(score[4]),
                "ACS": int(score[6])
            }

            # data._append(temp_dict, ignore_index=True)
            record_list.append(temp_dict)
    data = pd.DataFrame(record_list)

    return data

In [6]:

df = table_creator(txt_reader("League_9.txt"))
df.head()

Unnamed: 0,league_id,week,hour,minute,HT,AT,HCS,ACS
0,6134,1,14,16,EVE,LIV,1,2
1,6134,1,14,16,CRY,MNC,0,1
2,6134,1,14,16,BRN,WHU,1,0
3,6134,1,14,16,BOU,NWC,0,1
4,6134,1,14,16,FOR,LEE,0,0


In [None]:

FIVE, FOUR, THREE, TWO, ONE, ZERO = 5, 4, 3, 2, 1, 0

def add_features(dataframe):
    def update_match_result(home_team, away_team, home_score, away_score, premier_league_table):
        home_index = -1
        away_index = -1
        for i, team in enumerate(premier_league_table):
            if team['team'] == home_team:
                home_index = i
            elif team['team'] == away_team:
                away_index = i

        premier_league_table[home_index]['played'] += 1
        premier_league_table[home_index]['goals_for'] += home_score
        premier_league_table[home_index]['goals_agst'] += away_score
        premier_league_table[away_index]['played'] += 1
        premier_league_table[away_index]['goals_for'] += away_score
        premier_league_table[away_index]['goals_agst'] += home_score

        premier_league_table[home_index]['goal_diff'] = \
            premier_league_table[home_index]['goals_for'] - premier_league_table[home_index]['goals_agst']

        premier_league_table[away_index]['goal_diff'] = \
            premier_league_table[away_index]['goals_for'] - premier_league_table[away_index]['goals_agst']

        if home_score > away_score:
            premier_league_table[home_index]['wins'] += 1
            premier_league_table[home_index]['points'] += 3
            premier_league_table[away_index]['losses'] += 1
        elif home_score < away_score:
            premier_league_table[away_index]['wins'] += 1
            premier_league_table[away_index]['points'] += 3
            premier_league_table[home_index]['losses'] += 1
        else:
            premier_league_table[home_index]['draws'] += 1
            premier_league_table[home_index]['points'] += 1
            premier_league_table[away_index]['draws'] += 1
            premier_league_table[away_index]['points'] += 1

        # Sort table by points, then alphabetically if multiple teams have the same points
        premier_league_table.sort(key=lambda x: (-x['points'], x['team']))

        # Assign positions to teams
        for i, team in enumerate(premier_league_table):
            team['position'] = i + 1

        return premier_league_table

    global table
    temp_df = dataframe.copy()

    # Initialize dictionaries and variables
    team_dict = {team: [] for team in teams}  # Dictionary to store scores for each team
    team_status = {team: [] for team in teams}  # Dictionary to store results (W/D/L) for each team

    h3, h2, h1= 0, 0, 0  # Variables to store home team scores from previous matches
    a3, a2, a1= 0, 0, 0  # Variables to store away team scores from previous matches
    hs_3, hs_2, hs_1 = 0, 0, 0  # Variables to store home team result (W/D/L) from previous matches
    as_3, as_2, as_1 = 0, 0, 0  # Variables to store away team result (W/D/L) from previous matches

    hsa, asa = 0, 0  # variables to store home & awat team scoring average
    hwr, hlr, hdr = 0, 0, 0  # store home and away win ratio
    awr, alr, adr = 0, 0, 0  # store home and away win, lose & draw ratio

    # Loop through each row in the dataframe
    for _, row in temp_df.iterrows():
        home_t = row["HT"]
        ht_score = row["HCS"]
        away_t = row["AT"]
        at_score = row["ACS"]

        # Update the league table with the match result
        table = update_match_result(home_t, away_t, ht_score, at_score, premier_league_table)
        temp = pd.DataFrame(table)

        # Add home team points and position to ht_points and ht_pos lists
        ht_points.append(temp.loc[temp['team'] == home_t, "points"].iloc[0])
        ht_pos.append(temp.loc[temp['team'] == home_t, "position"].iloc[0])

        # Add away team points and position to at_points and at_pos lists
        at_points.append(temp.loc[temp['team'] == away_t, "points"].iloc[0])
        at_pos.append(temp.loc[temp['team'] == away_t, "position"].iloc[0])

        # Add home & away wins, draws, losses, goals for, goals against, and goal diff to corresponding lists
        val_list = ["wins", "draws", "losses", "goals_for", "goals_agst", "goal_diff"]
        home_list = [h_wins, h_draws, h_loss, h_gf, h_ga, h_gd]
        away_list = [a_wins, a_draws, a_loss, a_gf, a_ga, a_gd]
        for home, away, val in zip(home_list, away_list, val_list):
            home.append(temp.loc[temp['team'] == home_t, val].iloc[0])
            away.append(temp.loc[temp['team'] == away_t, val].iloc[0])

        # Update home and away team dictionaries and status dictionaries with scores and results
        current_week = row["week"]
        for (a, b), c in zip(team_dict.items(), team_status.values()):
            
            if a == home_t:
                if current_week >= FIVE:
                    hsa = st.mean(b[-THREE:])
                    h3, hs_3 = b[-THREE], c[-THREE]
                    h2, hs_2 = b[-TWO], c[-TWO]
                    h1, hs_1 = b[-ONE], c[-ONE]
                    
                    hwr = c.count(ONE) / len(c)
                    hlr = c.count(-ONE) / len(c)
                    hdr = c.count(ZERO) / len(c)

                b.append(ht_score)

                # mapping "W" = 1, "D" = 0, "L" = -1
                c.append(ONE if ht_score > at_score else -ONE if ht_score < at_score else ZERO)

            elif a == away_t:
                if current_week >= FIVE:
                    asa = st.mean(b[-THREE:])
                    a3, as_3 = b[-THREE], c[-THREE]
                    a2, as_2 = b[-TWO], c[-TWO]
                    a1, as_1 = b[-ONE], c[-ONE]
                    
                    awr = c.count(ONE) / len(c)
                    alr = c.count(-ONE) / len(c)
                    adr = c.count(ZERO) / len(c)

                b.append(at_score)

                # mapping "W" = 1, "D" = 0, "L" = -1
                c.append(ONE if ht_score < at_score else -ONE if ht_score > at_score else ZERO)

        hs_avg.append(hsa), as_avg.append(asa)  # Adding scoring average

        # Adding win, lose and draw ratio
        hw_rat.append(hwr), hl_rat.append(hlr), hd_rat.append(hdr)
        aw_rat.append(awr), al_rat.append(alr), ad_rat.append(adr)

        # Add home and away team scores from previous matches to corresponding lists
        HL3S.append(h3), HL2S.append(h2), HL1S.append(h1)
        AL3S.append(a3), AL2S.append(a2), AL1S.append(a1)

        # Add home and away team result (W/D/L) from previous matches to corresponding lists
        hl3_stat.append(hs_3), hl2_stat.append(hs_2), hl1_stat.append(hs_1)
        al3_stat.append(as_3), al2_stat.append(as_2), al1_stat.append(as_1)

    # Create a new dictionary with the additional columns
    new_cols = {
        "HL1S": HL1S, "AL1S": AL1S, "HL2S": HL2S, 
        "AL2S": AL2S, "HL3S": HL3S, "AL3S": AL3S,

        "hl1_stat": hl1_stat, "al1_stat": al1_stat,
        "hl2_stat": hl2_stat, "al2_stat": al2_stat,
        "hl3_stat": hl3_stat, "al3_stat": al3_stat,

        "h_wins": h_wins, "a_wins": a_wins, "h_draws": h_draws, "a_draws": a_draws,
        "h_loss": h_loss, "a_loss": a_loss, "h_gf": h_gf, "a_gf": a_gf,
        "h_ga": h_ga, "a_ga": a_ga, "h_gd": h_gd, "a_gd": a_gd,

        "hs_avg": hs_avg, "as_avg": as_avg,

        # "H_ADV": H_ADV, #"A_ADV": A_ADV,

        "hw_rat": hw_rat, "aw_rat": aw_rat, 
        "hl_rat": hl_rat, "al_rat": al_rat, 
        "hd_rat": hd_rat, "ad_rat": ad_rat,

        "ht_points": ht_points, "at_points": at_points,
        "ht_pos": ht_pos, "at_pos": at_pos
    }
    # Add the new columns to the dataframe
    temp_df = temp_df.assign(**new_cols)

    return temp_df


# -----------------------------------------------------------------------------------------------------------

path = "league_data"
test_record = [f"{path}/L_18.txt"]
record = [
    f"{path}/L_6095.txt", f"{path}/L_6097.txt", f"{path}/L_6099.txt",
    f"{path}/L_6148.txt", f"{path}/L_6152.txt", f"{path}/L_6153.txt",
    f"{path}/L_6155.txt", f"{path}/L_6155.txt", f"{path}/L_6166.txt",
    f"{path}/L_6169.txt", f"{path}/L_6170.txt", f"{path}/L_6171.txt",
    f"{path}/L_6173.txt", f"{path}/L_6180.txt", f"{path}/L_6181.txt",
    f"{path}/L_6189.txt", f"{path}/L_6192.txt", f"{path}/L_6211.txt",
    f"{path}/L_6212.txt", f"{path}/L_6213.txt", f"{path}/L_6214.txt",
    f"{path}/L_6215.txt", f"{path}/L_6216.txt", f"{path}/L_6226.txt", 
    f"{path}/L_6227.txt", f"{path}/L_6230.txt"
]

f_paths = record

df_list = []
print(".txt files preprocessing --> \nPlease Wait ...")
for path in f_paths:
    teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL',
             'NWC', 'BOU', 'LEI', 'LIV', 'WOL', 'MNU', 'LEE', 'SOU', 'BRI', 'CRY']
    premier_league_table = []

    for team in teams:
        team_dict = {
            'team': team,
            'played': 0,
            'wins': 0,
            'draws': 0,
            'losses': 0,
            'goals_for': 0,
            'goals_agst': 0,
            'goal_diff': 0,
            'points': 0,
            'position': 0
        }
        premier_league_table.append(team_dict)

    league_id, week, hour, minute = [], [], [], []
    home_team, away_team, home_score, away_score = [], [], [], []

    ht_points, at_points, ht_pos, at_pos = [], [], [], []

    h_wins, h_draws, h_loss, h_gf, h_ga, h_gd = [], [], [], [], [], []
    a_wins, a_draws, a_loss, a_gf, a_ga, a_gd = [], [], [], [], [], []

    hs_avg, as_avg = [], []  # home & away Scoring averagae column
    # H_ADV = []
    hw_rat, hl_rat, hd_rat = [], [], []
    aw_rat, al_rat, ad_rat = [], [], []

    HL1S, HL2S, HL3S = [], [], []
    AL1S, AL2S, AL3S = [], [], []

    hl3_stat, hl2_stat, hl1_stat = [], [], []
    al3_stat, al2_stat, al1_stat = [], [], []

    text_content = txt_reader(path)
    update_df = table_creator(text_content)
    update_df = add_features(update_df)

    df_list.append(update_df)

df = pd.concat(df_list, ignore_index=True)
print(f"All {len(f_paths)} .txt Files Successfully Processed ...\n{df.shape[0]} Observations")

df.to_csv("league_record.csv", index=False)
print("\n... Pre-processing Completed ...\nCSV Saved Successfully \n")

# print(df.tail(10))

# print(pd.DataFrame(premier_league_table))


In [120]:
df = pd.read_csv("league_record.csv")

In [3]:
pd.set_option('display.max_columns', None)

In [121]:
df

Unnamed: 0,league_id,week,hour,minutes,HT,AT,HLS,ALS,HL2S,AL2S,HL3S,AL3S,HL4S,AL4S,HL5S,AL5S,hl_stat,al_stat,hl2_stat,al2_stat,hl3_stat,al3_stat,h_wins,a_wins,h_draws,a_draws,h_loss,a_loss,h_gf,a_gf,h_ga,a_ga,h_gd,a_gd,ht_points,at_points,ht_pos,at_pos
0,6095,1,12,52,EVE,MNC,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,2,2,1,-1,1,0,3,9,1
1,6095,1,12,52,BRN,TOT,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,-1,1,0,3,7,2
2,6095,1,12,52,BRI,FUL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,-1,3,0,1,12
3,6095,1,12,52,LEI,WHU,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,2,1,1,2,1,-1,3,0,2,19
4,6095,1,12,52,WOL,ARS,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,6,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4555,6171,38,14,22,TOT,CHE,0,0,0,1,2,2,3,1,2,1,1,1,0,2,1,2,19,17,7,13,12,8,64,45,44,30,20,15,64,64,7,6
4556,6171,38,14,22,FUL,MNC,1,0,1,1,2,4,0,3,1,1,2,0,0,1,0,2,7,25,9,8,22,5,37,75,61,33,-24,42,30,83,19,2
4557,6171,38,14,22,WHU,BOU,3,0,1,0,1,2,2,0,3,0,2,0,0,1,1,1,24,8,9,7,5,23,71,39,35,76,36,-37,81,31,3,18
4558,6171,38,14,22,ARS,WOL,2,0,2,2,0,1,3,0,3,1,2,0,2,1,0,2,23,11,5,12,10,15,71,46,49,50,22,-4,74,45,4,13


#### Home Win Model

In [122]:
# Home win prediction
df_home = df.copy()
df_home = df_home[df_home['week'] > 4]  # remove rows with incomplete values
df_home = df_home.drop(["league_id", "ALS", "HT", "AT", "hl_stat", "al_stat"], axis=1)  # remove unnecessary features
df_home = df_home.reset_index(drop=True)  # reset the index values
df_home

Unnamed: 0,week,hour,minutes,HLS,HL2S,AL2S,HL3S,AL3S,HL4S,AL4S,HL5S,AL5S,hl2_stat,al2_stat,hl3_stat,al3_stat,h_wins,a_wins,h_draws,a_draws,h_loss,a_loss,h_gf,a_gf,h_ga,a_ga,h_gd,a_gd,ht_points,at_points,ht_pos,at_pos
0,5,13,0,0,2,1,2,0,1,2,1,2,2,1,2,0,2,0,1,4,2,1,6,5,8,10,-2,-5,7,4,6,13
1,5,13,0,2,0,3,5,0,0,0,2,2,0,2,2,0,3,1,0,1,2,3,9,6,4,8,5,-2,9,4,4,15
2,5,13,0,1,1,1,1,0,0,1,3,2,0,0,1,0,1,2,2,1,2,2,6,5,9,4,-3,1,5,7,12,9
3,5,13,0,1,0,1,0,4,0,3,0,3,0,2,1,2,0,5,2,0,3,0,1,15,7,4,-6,11,2,15,19,1
4,5,13,0,1,2,2,4,1,2,2,2,1,2,2,2,0,3,3,0,0,2,2,11,8,8,6,3,2,9,9,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4075,38,14,22,0,0,1,2,2,3,1,2,1,0,2,1,2,19,17,7,13,12,8,64,45,44,30,20,15,64,64,7,6
4076,38,14,22,1,1,1,2,4,0,3,1,1,0,1,0,2,7,25,9,8,22,5,37,75,61,33,-24,42,30,83,19,2
4077,38,14,22,3,1,0,1,2,2,0,3,0,0,1,1,1,24,8,9,7,5,23,71,39,35,76,36,-37,81,31,3,18
4078,38,14,22,2,2,2,0,1,3,0,3,1,2,1,0,2,23,11,5,12,10,15,71,46,49,50,22,-4,74,45,4,13


### 1. One-Hot Encoding:

In [123]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df_temp1 = df_home.copy()

# create a one-hot encoder object
encoder = OneHotEncoder(sparse_output=False)

# fit the encoder to the categorical columns
encoder.fit(df_temp1[['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat']])

# transform the categorical columns into one-hot encoded columns
one_hot_encoded = pd.DataFrame(encoder.transform(df_temp1[['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat']]))

# concatenate the one-hot encoded columns with the original data frame
df_onehot = pd.concat([df_temp1, one_hot_encoded], axis=1)

# drop the original categorical columns
df_onehot.drop(['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat'], axis=1, inplace=True)
df_onehot = df_onehot.dropna()

# rename columns to string names
df_onehot.columns = df_onehot.columns.astype(str)

df_onehot.shape

(4080, 40)

In [134]:
df_ordinal.tail(3)

Unnamed: 0,week,hour,minutes,HLS,HL2S,AL2S,HL3S,AL3S,HL4S,AL4S,HL5S,AL5S,h_wins,a_wins,h_draws,a_draws,h_loss,a_loss,h_gf,a_gf,h_ga,a_ga,h_gd,a_gd,ht_points,at_points,ht_pos,at_pos,hl2_stat_enc,al2_stat_enc,hl3_stat_enc,al3_stat_enc
4077,38,14,22,3,1,0,1,2,2,0,3,0,24,8,9,7,5,23,71,39,35,76,36,-37,81,31,3,18,0.0,1.0,1.0,1.0
4078,38,14,22,2,2,2,0,1,3,0,3,1,23,11,5,12,10,15,71,46,49,50,22,-4,74,45,4,13,2.0,1.0,0.0,2.0
4079,38,14,22,2,0,2,0,1,0,0,0,4,12,15,13,7,13,16,47,55,53,60,-6,-5,49,52,11,9,0.0,2.0,0.0,0.0


### 2. Ordinal Encoding:

In [124]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df_temp2 = df_home.copy()

# create an ordinal encoder object with specified categories
encoder = OrdinalEncoder(categories=[sorted(df_temp2[col].unique()) for col in ['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat']])

# fit the encoder to the categorical columns
encoder.fit(df_temp2[['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat']])

# transform the categorical columns into ordinal encoded columns
ordinal_encoded = pd.DataFrame(encoder.transform(df_temp2[['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat']]), \
columns=['hl2_stat_enc', 'al2_stat_enc', 'hl3_stat_enc', 'al3_stat_enc'])

# concatenate the ordinal encoded columns with the original data frame
df_ordinal = pd.concat([df_temp2, ordinal_encoded], axis=1)

# drop the original categorical columns
df_ordinal.drop(['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat'], axis=1, inplace=True)

df_ordinal.shape

(4080, 32)

In [133]:
df_ordinal.head()

Unnamed: 0,week,hour,minutes,HLS,HL2S,AL2S,HL3S,AL3S,HL4S,AL4S,HL5S,AL5S,h_wins,a_wins,h_draws,a_draws,h_loss,a_loss,h_gf,a_gf,h_ga,a_ga,h_gd,a_gd,ht_points,at_points,ht_pos,at_pos,hl2_stat_enc,al2_stat_enc,hl3_stat_enc,al3_stat_enc
0,5,13,0,0,2,1,2,0,1,2,1,2,2,0,1,4,2,1,6,5,8,10,-2,-5,7,4,6,13,2.0,1.0,2.0,0.0
1,5,13,0,2,0,3,5,0,0,0,2,2,3,1,0,1,2,3,9,6,4,8,5,-2,9,4,4,15,0.0,2.0,2.0,0.0
2,5,13,0,1,1,1,1,0,0,1,3,2,1,2,2,1,2,2,6,5,9,4,-3,1,5,7,12,9,0.0,0.0,1.0,0.0
3,5,13,0,1,0,1,0,4,0,3,0,3,0,5,2,0,3,0,1,15,7,4,-6,11,2,15,19,1,0.0,2.0,1.0,2.0
4,5,13,0,1,2,2,4,1,2,2,2,1,3,3,0,0,2,2,11,8,8,6,3,2,9,9,3,5,2.0,2.0,2.0,0.0


### 3. Target Encoding:

In [126]:
import pandas as pd
import category_encoders as ce

df_temp4 = df_home.copy()

# create a target encoder object
encoder = ce.TargetEncoder(cols=['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat'])

# fit the encoder to the categorical columns and target variable
encoder.fit(df_temp4[['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat']], df_temp4['HLS'])

# transform the categorical columns into target encoded columns
target_encoded = encoder.transform(df_temp4[['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat']])

# concatenate the target encoded columns with the original data frame
df_encoded = pd.concat([df_temp4, target_encoded], axis=1)

# drop the original categorical columns
df_encoded.drop(['hl2_stat', 'al2_stat', 'hl3_stat', 'al3_stat'], axis=1, inplace=True)

df_encoded.shape

(4080, 28)

# Model Evaluation

In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

def train_random_forest(dataset, target_var, test_size=0.2, random_state=42, **kwargs):
    """
    Train a random forest model on the given dataset and target variable.

    Args:
        dataset (pandas.DataFrame): The dataset to use for training and testing.
        target_var (str): The name of the target variable to predict.
        test_size (float, optional): The proportion of the data to use for testing. Defaults to 0.2.
        random_state (int, optional): The random seed to use for reproducibility. Defaults to 42.
        **kwargs: Other keyword arguments to pass to the RandomForestRegressor or RandomForestClassifier constructor.

    Returns:
        (float, float): A tuple containing the root mean squared error and accuracy of the model on the test set.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(target_var, axis=1), dataset[target_var], test_size=test_size, random_state=random_state)
    
    # Train a random forest model on the training set
    if isinstance(y_train.iloc[0], str):
        is_classification = True
        model = RandomForestClassifier(random_state=random_state, **kwargs)
    else:
        is_classification = False
        model = RandomForestRegressor(random_state=random_state, **kwargs)
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate the RMSE and accuracy of the model on the test set
    if is_classification:
        accuracy = accuracy_score(y_test, y_pred)
    else:
        accuracy = accuracy_score(y_test.round(), y_pred.round())
    
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    return rmse, accuracy

In [135]:
datasets = [df_home]
for data in datasets:   
    rmse, accuracy = train_random_forest(data, 'HLS', n_estimators=100)
    print(f'RMSE: {rmse}, Accuracy: {accuracy}')

RMSE: 1.1215452946890803, Accuracy: 0.37254901960784315


In [130]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score


def cross_validate_random_forest(dataset, target_var, n_splits=5, random_state=42, **kwargs):
    """
    Perform k-fold cross validation on a random forest model on the given dataset and target variable.

    Args:
        dataset (pandas.DataFrame): The dataset to use for training and testing.
        target_var (str): The name of the target variable to predict.
        n_splits (int, optional): The number of folds to use for cross validation. Defaults to 5.
        random_state (int, optional): The random seed to use for reproducibility. Defaults to 42.
        **kwargs: Other keyword arguments to pass to the RandomForestRegressor or RandomForestClassifier constructor.

    Returns:
        (float, float): A tuple containing the mean root mean squared error and mean accuracy of the model across all folds for the appropriate task (regression or classification).
    """
    # Initialize the KFold cross validation object
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Initialize lists to store the RMSE and accuracy values for each fold
    rmse_scores = []
    accuracy_scores = []

    # Determine whether the target variable is categorical or numeric
    if isinstance(dataset[target_var].iloc[0], str):
        is_classification = True
    else:
        is_classification = False

    # Loop over each fold
    for train_idx, test_idx in kfold.split(dataset):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = dataset.drop(target_var, axis=1).iloc[train_idx], dataset.drop(target_var, axis=1).iloc[test_idx]
        y_train, y_test = dataset[target_var].iloc[train_idx], dataset[target_var].iloc[test_idx]

        # Train a random forest model on the training set
        if isinstance(y_train.iloc[0], str):
            is_classification = True
            model = RandomForestClassifier(random_state=random_state, **kwargs)
        else:
            is_classification = False
            model = RandomForestRegressor(random_state=random_state, **kwargs)
        model.fit(X_train, y_train)
        
        # Make predictions on the test set
        y_pred = model.predict(X_test)
        
        # Calculate the RMSE and accuracy of the model on the test set
        if is_classification:
            accuracy = accuracy_score(y_test, y_pred)
        else:
            accuracy = accuracy_score(y_test.round(), y_pred.round())
        
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        
        # Append the RMSE and accuracy values for the current fold to the lists
        rmse_scores.append(rmse)
        accuracy_scores.append(accuracy)

    # Calculate the mean RMSE and accuracy across all folds
    if is_classification:
        mean_accuracy = sum(accuracy_scores) / n_splits
        mean_rmse = None
    else:
        mean_rmse = sum(rmse_scores) / n_splits
        mean_accuracy = accuracy_score(y_test.round(), y_pred.round())

    return mean_rmse, mean_accuracy

In [131]:
rmse, accuracy = cross_validate_random_forest(df_home, "HLS", n_splits=100, random_state=42)
print(f'RMSE: {rmse}, Accuracy: {accuracy}')

RMSE: 1.033632532316481, Accuracy: 0.375
RMSE: 1.0352467521128554, Accuracy: 0.375
RMSE: 1.0344052955166947, Accuracy: 0.35


In [None]:
# RMSE: 1.037260024806251, Accuracy: 0.45588235294117646    
# RMSE: 1.0395867421206864, Accuracy: 0.4411764705882353
# RMSE: 1.0394407662064982, Accuracy: 0.4117647058823529

# RMSE: 1.0291323227675842, Accuracy: 0.29411764705882354
# RMSE: 1.0306463633341978, Accuracy: 0.3235294117647059
# RMSE: 1.032703152907061, Accuracy: 0.2647058823529412

# RMSE: 1.0311636872784198, Accuracy: 0.40540540540540543
# RMSE: 1.0324646077714403, Accuracy: 0.40540540540540543
# RMSE: 1.0303426207783912, Accuracy: 0.40540540540540543