In [203]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [204]:
pd.set_option('display.max_columns', None)

In [205]:

def txt_reader(_path):
    file = open(_path, 'r')  # Open the file in read mode
    contents = file.read()  # Read the contents of the file
    file.close()  # Close the file

    return contents


def table_creator(contents):
    record_list = []
    weeks = contents.split("WEEK")
    pos = weeks[0].index("League")
    league_no = int(weeks[0][(pos + 7): (pos + 11)])
    for val in weeks[1:]:
        scores = val.split("\n")[1:-1]
        for score in scores:
            temp_dict = {
                "league_id": league_no,
                "week": int((val[:3]).strip()),
                "hour": int(val.split("\n")[0][-8:-6]),
                "minute": int(val.split("\n")[0][-5:-3]),

                "HT": score[0:3],
                "AT": score[8:11],
                "HCS": int(score[4]),
                "ACS": int(score[6])
            }

            # data._append(temp_dict, ignore_index=True)
            record_list.append(temp_dict)
    data = pd.DataFrame(record_list)

    return data

In [206]:

class Team:
    def __init__(self, name):
        self.name = name
        self.goals_for = 0
        self.goals_against = 0
        self.goal_difference = 0
        self.points = 0
        self.position = 0
        self.last_scores = []  # List to store last scores
        self.last_1_scores = 0
        self.last_2_scores = 0
        self.last_results = []  # List to store last results
        self.last_1_result = 0
        self.last_2_result = 0
        self.matches_played = 0
        self.wins = 0
        self.losses = 0
        self.draws = 0
        self.win_lose_ratio = 0
        self.pts_per_game = 0

def update_table(team_1, team_1_score, team_2, team_2_score):
    team1 = next(team for team in team_objects if team.name == team_1)
    team2 = next(team for team in team_objects if team.name == team_2)

    scores = [team_1_score, team_2_score]
    new_teams = [team1, team2]

    for i, team in enumerate(new_teams):
        team.goals_for += scores[i]
        team.goals_against += scores[1 - i]
        team.goal_difference = team.goals_for - team.goals_against
        team.points += 3 if scores[i] > scores[1 - i] else 1 if scores[i] == scores[1 - i] else 0

        team.last_scores.append(scores[i])
        team.last_1_scores = team.last_scores[-1]
        team.last_2_scores = team.last_scores[-2] if len(team.last_scores) > 1 else 1000

        team.last_results.append(1 if scores[i] > scores[1 - i] else 0 if scores[i] == scores[1 - i] else -1)
        team.last_1_result = team.last_results[-1]
        team.last_2_result = team.last_results[-2] if len(team.last_results) > 1 else 1000
        
        team.matches_played += 1 
        team.wins += 1 if scores[i] > scores[1 - i] else 0
        team.losses += 1 if scores[i] < scores[1 - i] else 0
        team.draws += 1 if scores[i] == scores[1 - i] else 0
        team.win_lose_ratio = team.wins / team.matches_played if team.matches_played > 0 else 0.0
        team.pts_per_game = team.points / team.matches_played if team.matches_played > 0 else 0.0

    table = sorted(team_objects, key=lambda x: (-x.points, x.name.lower()))

    for i, team in enumerate(table):
        team.position = i + 1

    return table


def get_record(table_objects):
    table_dict = dict()
    for team in table_objects:
        table_dict[team.name] = (
            team.goals_for,
            team.goals_against,
            team.goal_difference,
            team.points,
            team.position,
            team.last_1_scores,
            # team.last_2_scores,
            team.last_1_result,
            # team.last_2_result,
            team.wins,
            team.losses,
            team.draws,
            team.win_lose_ratio,
            team.pts_per_game
        )

    return table_dict


def add_features(data):
    df = data.copy()

    col_list = []

    for i, row in df.iterrows():
        ht, hcs, at, acs = row["HT"], row["HCS"], row["AT"], row["ACS"]

        table = update_table(ht, hcs, at, acs)

        if row["week"] > 1:
            col_list.append(table_dict[ht] + table_dict[at])
        else:
            col_list.append((0,) * 24)

        if (i+1) % 10 == 0:
            table_dict = get_record(table)

    # Create a DataFrame from the list of tuples
    new_df = pd.DataFrame(col_list, columns=[
        "h_gf", "h_ga", "h_gd", "h_pts", "h_pos", "HL1S", "HL1R", "H_win", "H_loss", "H_draws", "H_wlr", "H_ppg", 
        "a_gf", "a_ga", "a_gd", "a_pts", "a_pos", "AL1S", "AL1R", "A_win", "A_loss", "A_draws", "A_wlr", "A_ppg"])

    # Concatenate the new DataFrame with the existing DataFrame
    df = pd.concat([df, new_df], axis=1)

    return df


In [207]:
path = "league_data"
records = [
    f"{path}/L_6095.txt", f"{path}/L_6097.txt", f"{path}/L_6099.txt",
    f"{path}/L_6148.txt", f"{path}/L_6152.txt", f"{path}/L_6153.txt",
    f"{path}/L_6155.txt", f"{path}/L_6155.txt", f"{path}/L_6166.txt",
    f"{path}/L_6169.txt", f"{path}/L_6170.txt", f"{path}/L_6171.txt",
    f"{path}/L_6173.txt", f"{path}/L_6180.txt", f"{path}/L_6181.txt",
    f"{path}/L_6189.txt", f"{path}/L_6192.txt", f"{path}/L_6211.txt",
    f"{path}/L_6212.txt", f"{path}/L_6213.txt", f"{path}/L_6214.txt",
    f"{path}/L_6215.txt", f"{path}/L_6216.txt", f"{path}/L_6226.txt", 
    f"{path}/L_6227.txt", f"{path}/L_6216.txt"
]

In [208]:

# teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
#          'NWC', 'BOU', 'BUR', 'LIV', 'WOL', 'MNU', 'LUT', 'SHU', 'BRI', 'CRY']

teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
         'NWC', 'BOU', 'LEI', 'LIV', 'WOL', 'MNU', 'LEE', 'SOU', 'BRI', 'CRY']


df_temp = []
for record in records:
    text = txt_reader(record)  # read each txt data
    new_df = table_creator(text)  # convert each txt data to a dataframe
    
    # create a new empty league table to record scores & points
    team_objects = [Team(team) for team in teams]

    # adds scores & points from each league to the dataframe
    new_df = add_features(new_df)

    df_temp.append(new_df)  # add each dataframe to a list

df_records = pd.concat(df_temp, axis=0)

In [209]:
df_records[["HT", "AT", "HCS", "ACS", "HL1S", "HL1R", "AL1S", "AL1R"]].iloc[20:50]

Unnamed: 0,HT,AT,HCS,ACS,HL1S,HL1R,AL1S,AL1R
20,CRY,ARS,1,1,0,-1,4,1
21,SOU,CHE,2,0,2,0,1,1
22,ASV,MNC,4,2,2,1,4,1
23,LEI,BOU,5,0,0,-1,2,0
24,BRN,LIV,0,4,0,0,3,1
25,EVE,WHU,2,1,1,-1,2,1
26,BRI,FOR,1,0,1,-1,0,-1
27,LEE,MNU,0,1,0,-1,3,1
28,TOT,FUL,0,0,1,-1,0,0
29,WOL,NWC,1,0,0,-1,1,1


In [210]:
# Define function to check home and away scores and return result
def get_result(row):
    if row['HCS'] + row['ACS'] >= 3:
        return 1  # home_win = 1
    else:
        return 0  # draw = 0
        

def feature_engineering(data):
    df_result = data.copy()
        
    # Apply function to each row of the dataframe to create a new column
    df_result['result'] = df_result.apply(get_result, axis=1)

    df_result = df_result[df_result['week'] > 2]  # remove rows with incomplete values

    # Create a LabelEncoder object
    label_encoder = LabelEncoder()

    # Fit the LabelEncoder on the unique team names
    label_encoder.fit(teams)

    # Convert the categorical features to numeric using label encoding
    df_result['HT_encoded'] = label_encoder.transform(df_result['HT'])
    df_result['AT_encoded'] = label_encoder.transform(df_result['AT'])

    # remove irrelevant features
    df_result = df_result.drop(["league_id", "hour", "minute", "HCS", "ACS", "HT", "AT"], axis=1)  

    df_result = df_result.reset_index(drop=True)  # reset the index values

    # Scaling the dataset using standardization method
    X = df_result.drop("result", axis=1)
    y = df_result["result"]

    # assume that X is your dataset with numerical features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # # concatenate the scaled numerical features with the Target variable feature
    data_preprocessed = pd.concat([pd.DataFrame(X_scaled, columns=X.columns), y], axis=1)

    return data_preprocessed


processed_df = feature_engineering(df_records)

In [211]:
processed_df.head(3)

Unnamed: 0,week,h_gf,h_ga,h_gd,h_pts,h_pos,HL1S,HL1R,H_win,H_loss,H_draws,H_wlr,H_ppg,a_gf,a_ga,a_gd,a_pts,a_pos,AL1S,AL1R,A_win,A_loss,A_draws,A_wlr,A_ppg,HT_encoded,AT_encoded,result
0,-1.684588,-1.460978,-1.378759,-0.146137,-1.395691,-0.274249,-1.113322,-1.062367,-1.220192,-1.269759,-1.511523,0.761404,0.287592,-1.404425,-1.628428,0.223771,-1.347295,-1.11462,2.084075,1.061427,-1.230379,-1.464101,-1.22215,0.720578,1.258961,-0.608156,-1.646091,0
1,-1.684588,-1.523666,-1.443136,-0.146137,-1.514067,1.111058,0.695983,0.113574,-1.419271,-1.269759,-1.220463,-2.032172,-1.729231,-1.404425,-1.499643,0.068841,-1.347295,-0.940901,-0.378612,1.061427,-1.230379,-1.464101,-1.22215,0.720578,1.258961,1.126467,-0.779182,0
2,-1.684588,-1.39829,-1.507514,0.085989,-1.395691,-0.620576,0.695983,1.289515,-1.220192,-1.269759,-1.511523,0.761404,0.287592,-1.279032,-1.628428,0.3787,-1.228708,-1.462057,2.084075,1.061427,-1.031096,-1.464101,-1.513109,3.491038,3.277408,-1.475467,0.607874,1


# Modelling

In [212]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load your dataset into a pandas DataFrame
dataset = processed_df.copy()

# Load your dataset into X and y arrays
X = dataset.drop("result", axis=1)
y = dataset["result"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# Calculate the accuracy and F1 score of the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('F1 score:', f1)


Accuracy: 0.5849358974358975
F1 score: 0.5838819601336614


In [213]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load your dataset into a pandas DataFrame
dataset = processed_df.copy()

# Load your dataset into X and y arrays
X = dataset.drop("result", axis=1)
y = dataset["result"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42)

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

# Calculate the accuracy and F1 score of the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('F1 score:', f1)

Accuracy: 0.5726495726495726
F1 score: 0.5716394016156947


In [214]:
num = 1000
actual_vals = processed_df["result"].head(num)

data = processed_df.drop("result", axis=1).head(num)
pred_vals = rf_model.predict(data).tolist()

count = 0
for x, y in zip(actual_vals, pred_vals):
    if x == y:
        count += 1
        
print("WIN % -", (count/num) * 100)

WIN % - 91.3


# Testing the model on real data

In [215]:
teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
         'NWC', 'BOU', 'BUR', 'LIV', 'WOL', 'MNU', 'LUT', 'SHU', 'BRI', 'CRY']

# teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
#          'NWC', 'BOU', 'LEI', 'LIV', 'WOL', 'MNU', 'LEE', 'SOU', 'BRI', 'CRY']

         
# create a new empty league table to record scores & points
team_objects = [Team(team) for team in teams]

test_record = "league_data/new_record.txt"
df_new = []

text = txt_reader(test_record)  # read each txt data

# adds scores & points from each league to the dataframe
new_df = add_features(table_creator(text))

df_new.append(new_df)  # add each dataframe to a list

records_df = pd.concat(df_new, axis=0)

process_df = feature_engineering(records_df)

In [216]:
process_df["result"].shape

(270,)

In [217]:
num = 268
actual_vals = process_df["result"].head(num)

data = process_df.drop("result", axis=1).head(num)
pred_vals = rf_model.predict(data).tolist()

count = 0
for x, y in zip(actual_vals, pred_vals):
    if x == y:
        # print(x, "-->", y, "WIN")
        count += 1
    # else:
    #     print(x, "-->", y)

        
print("Total Win Count:", count, "/", num, "\nWIN % -", (count/num) * 100)

Total Win Count: 132 / 268 
WIN % - 49.25373134328358


In [218]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load your dataset into a pandas DataFrame
dataset = processed_df.copy()

# Load your dataset into X and y arrays
X = dataset.drop("result", axis=1)
y = dataset["result"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a dictionary to store the models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Gradient Boost': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    # 'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

# Calculate the F1 scores for each model
model_scores = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred), 3)
    f1 = round(f1_score(y_test, y_pred, average='weighted'), 3)
    model_scores.append((model_name, f1))
    print(f'{model_name}:')
    print('Accuracy:', accuracy)
    print('F1 score:', f1)
    print('-' * 40)

# Sort the models based on F1 scores in descending order
model_scores.sort(key=lambda x: x[1], reverse=True)
model_names, f1_scores = zip(*model_scores)

# Plot the horizontal bar graph
plt.figure(figsize=(12, 7))
plt.barh(model_names, f1_scores)  # <-- Use plt.barh() for horizontal bars
plt.ylabel('Model')
plt.xlabel('F1 Score')
plt.title('F1 Scores of Different Models')

# Add the scores beside each bar
for i, score in enumerate(f1_scores):
    plt.text(score, i, f'{score:.3f}', ha='left', va='center')  # <-- Adjust text position

# Display the horizontal bar graph
plt.show()

Random Forest:
Accuracy: 0.585
F1 score: 0.584
----------------------------------------
SVC:
Accuracy: 0.532
F1 score: 0.516
----------------------------------------
XGBoost:
Accuracy: 0.573
F1 score: 0.572
----------------------------------------
