In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
# Passing None removes pandas' cap on displayed columns, so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [3]:

def txt_reader(_path):
    """Return the entire text content of the file at ``_path``.

    Args:
        _path: Path of the text file to read (opened in text mode, read-only).

    Returns:
        str: Everything the file contains.

    Raises:
        OSError: If the file cannot be opened or read (e.g. ``FileNotFoundError``).
    """
    # The context manager closes the handle even when read() raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(_path, 'r') as file:
        return file.read()


def table_creator(contents):
    """Turn the raw text of one league file into a DataFrame of matches.

    The text is cut at every "WEEK" marker. The piece before the first marker
    must contain the word "League" followed by a number; each later piece is a
    week whose first line is a header and whose remaining lines are fixed-width
    score lines (home code in columns 0-2, home goals in column 4, away goals
    in column 6, away code in columns 8-10).

    Args:
        contents: Full text of a league file.

    Returns:
        pandas.DataFrame: One row per match with columns
        ``week``, ``HT``, ``AT``, ``HCS`` and ``ACS``.

    Raises:
        ValueError: If "League" is missing from the header or a number cannot be parsed.
        IndexError: If a score line is too short to hold both goal digits.
    """
    # Club codes that are rewritten to the code used in the `teams` list.
    renamed_clubs = {"LEE": "BUR", "LEI": "LUT", "SOU": "SHU"}

    header, *week_blocks = contents.split("WEEK")

    # The league number is parsed (and therefore validated) but not stored in the rows.
    league_at = header.index("League")
    league_no = int(header[(league_at + 7): (league_at + 11)])

    rows = []
    for block in week_blocks:
        # Skip the week header line and the fragment after the final newline.
        for line in block.split("\n")[1:-1]:
            home_code, away_code = line[0:3], line[8:11]
            rows.append({
                "week": int(block[:3].strip()),
                "HT": renamed_clubs.get(home_code, home_code),
                "AT": renamed_clubs.get(away_code, away_code),
                "HCS": int(line[4]),
                "ACS": int(line[6]),
            })

    return pd.DataFrame(rows)

In [4]:

class Team:
    """Mutable container for one club's running league statistics.

    Every counter starts at zero and the two history lists start empty;
    ``update_table`` is the code that fills them in match by match.
    """

    def __init__(self, name):
        self.name = name

        # Goal tallies, league points and table position.
        self.goals_for = self.goals_against = self.goal_difference = 0
        self.points = self.position = 0

        # Goals scored in each match, plus cached copies of the latest two entries.
        self.last_scores = []
        self.last_1_scores = self.last_2_scores = 0

        # Outcome of each match, plus cached copies of the latest two entries.
        self.last_results = []
        self.last_1_result = self.last_2_result = 0

        # Match counters and the per-match rates derived from them.
        self.matches_played = self.wins = self.losses = self.draws = 0
        self.win_lose_ratio = self.pts_per_game = 0

def update_table(team_1, team_1_score, team_2, team_2_score):
    """Apply one match result to the module-level ``team_objects`` and re-rank them.

    Both teams' goal tallies, points, score/result histories and win/draw/loss
    counters are updated in place, then every team in ``team_objects`` gets a
    fresh ``position`` (1 = top).

    Args:
        team_1: Name of the first team.
        team_1_score: Goals scored by ``team_1``.
        team_2: Name of the second team.
        team_2_score: Goals scored by ``team_2``.

    Returns:
        list: ``team_objects`` sorted by points (descending), ties broken by
        lower-cased name (ascending).

    Raises:
        ValueError: If either name is not found in ``team_objects``. Nothing is
            modified in that case.
    """
    # Resolve both teams before touching any state so a bad name cannot leave
    # the table half-updated (and gives a clearer error than StopIteration).
    team1 = next((team for team in team_objects if team.name == team_1), None)
    team2 = next((team for team in team_objects if team.name == team_2), None)
    for wanted, found in ((team_1, team1), (team_2, team2)):
        if found is None:
            known = [team.name for team in team_objects]
            raise ValueError(f"Unknown team {wanted!r}; expected one of {known}")

    fixtures = (
        (team1, team_1_score, team_2_score),
        (team2, team_2_score, team_1_score),
    )

    for team, scored, conceded in fixtures:
        won = scored > conceded
        drawn = scored == conceded
        lost = scored < conceded

        team.goals_for += scored
        team.goals_against += conceded
        team.goal_difference = team.goals_for - team.goals_against
        team.points += 3 if won else 1 if drawn else 0

        team.last_scores.append(scored)
        team.last_1_scores = team.last_scores[-1]
        # 1000 is the placeholder used while a team has no second-to-last match yet.
        team.last_2_scores = team.last_scores[-2] if len(team.last_scores) > 1 else 1000

        # Results are encoded as 1 = win, 0 = draw, -1 = loss.
        team.last_results.append(1 if won else 0 if drawn else -1)
        team.last_1_result = team.last_results[-1]
        team.last_2_result = team.last_results[-2] if len(team.last_results) > 1 else 1000

        team.matches_played += 1
        team.wins += 1 if won else 0
        team.losses += 1 if lost else 0
        team.draws += 1 if drawn else 0
        # matches_played was just incremented, so the divisions need no zero guard.
        # Note: win_lose_ratio is wins per match played, not wins divided by losses.
        team.win_lose_ratio = team.wins / team.matches_played
        team.pts_per_game = team.points / team.matches_played

    table = sorted(team_objects, key=lambda x: (-x.points, x.name.lower()))

    for i, team in enumerate(table):
        team.position = i + 1

    return table


def get_record(table_objects):
    """Snapshot each team's current statistics as a tuple keyed by team name.

    Args:
        table_objects: Iterable of team objects exposing the attributes listed
            in ``fields`` below plus ``name``.

    Returns:
        dict: ``{team.name: (goals_for, goals_against, goal_difference, points,
        position, last_1_scores, last_1_result, wins, losses, draws,
        win_lose_ratio, pts_per_game)}``.
    """
    # The tuple order is what add_features maps onto its 12 per-team column names.
    # last_2_scores and last_2_result are not part of the snapshot.
    fields = (
        "goals_for", "goals_against", "goal_difference", "points", "position",
        "last_1_scores", "last_1_result",
        "wins", "losses", "draws", "win_lose_ratio", "pts_per_game",
    )

    return {
        team.name: tuple(getattr(team, field) for field in fields)
        for team in table_objects
    }


def add_features(data):
    """Append pre-match league-table features for the home and away team of every match.

    Matches are replayed in row order through ``update_table``. Each row is
    given the statistics both teams had at the end of the previous week, as
    returned by ``get_record``, so a row's own score never feeds its features.
    Week-1 rows, and any row processed before a snapshot exists, get zeros.

    Args:
        data: DataFrame with columns ``week``, ``HT``, ``AT``, ``HCS``, ``ACS``,
            ordered by week.

    Returns:
        pandas.DataFrame: A copy of ``data`` with 24 extra columns: 12 for the
        home team followed by the same 12 for the away team.
    """
    feature_columns = [
        "h_gf", "h_ga", "h_gd", "h_pts", "h_pos", "HL1S", "HL1R", "H_win", "H_loss", "H_draws", "H_wlr", "H_ppg",
        "a_gf", "a_ga", "a_gd", "a_pts", "a_pos", "AL1S", "AL1R", "A_win", "A_loss", "A_draws", "A_wlr", "A_ppg"]
    blank_features = (0,) * len(feature_columns)

    df = data.copy()

    col_list = []
    table = None          # ranked table returned for the most recently replayed match
    table_dict = None     # statistics frozen at the end of the previous week
    current_week = None

    for _, row in df.iterrows():
        ht, hcs, at, acs = row["HT"], row["HCS"], row["AT"], row["ACS"]
        week = row["week"]

        # Freeze the table when a new week starts, before this week's first
        # match is applied. Keying on the week value (not on "every 10th index
        # label") keeps this correct for weeks with any number of matches and
        # for frames whose index is not 0..n-1.
        if week != current_week:
            if table is not None:
                table_dict = get_record(table)
            current_week = week

        table = update_table(ht, hcs, at, acs)

        if week > 1 and table_dict is not None:
            col_list.append(table_dict[ht] + table_dict[at])
        else:
            col_list.append(blank_features)

    # Reuse df's index so the column-wise concat lines rows up one-to-one
    # instead of aligning on labels.
    new_df = pd.DataFrame(col_list, columns=feature_columns, index=df.index)

    # Concatenate the new DataFrame with the existing DataFrame
    df = pd.concat([df, new_df], axis=1)

    return df


In [5]:
# Directory holding the league text files, and the full path of every file to load.
path = "league_data"
records = [
    f"{path}/{stem}.txt"
    for stem in (
        "L_6095", "L_6097", "L_6099", "L_6148", "L_6152", "L_6153", "L_6155",
        "L_6156", "L_6166", "L_6169", "L_6170", "L_6171", "L_6173", "L_6180",
        "L_6181", "L_6189", "L_6192", "L_6211", "L_6212", "L_6213", "L_6214",
        "L_6215", "L_6216", "L_6226", "L_6227", "L_6230",
        "LN_1575", "LN_1576",
    )
]

In [6]:

# Club codes that make up the league table; feature_engineering later fits its
# LabelEncoder on this same list. BUR/LUT/SHU are the codes table_creator
# rewrites LEE/LEI/SOU to.
teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
         'NWC', 'BOU', 'BUR', 'LIV', 'WOL', 'MNU', 'LUT', 'SHU', 'BRI', 'CRY']

# Alternative list using the LEI/LEE/SOU codes (currently unused):
# teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
#          'NWC', 'BOU', 'LEI', 'LIV', 'WOL', 'MNU', 'LEE', 'SOU', 'BRI', 'CRY']


# Build one feature frame per league file, then stack them.
df_temp = []
for record in records:
    text = txt_reader(record)  # raw text of this league file
    new_df = table_creator(text)  # one row per match: week, HT, AT, HCS, ACS
    
    # update_table reads the module-level `team_objects`, so it has to be
    # rebound to a fresh, all-zero table before add_features replays this
    # league; otherwise statistics would carry over from the previous file.
    team_objects = [Team(team) for team in teams]

    # append the previous-week table statistics of both teams to every match
    new_df = add_features(new_df)

    df_temp.append(new_df)  # collect the per-league frames

# The index is not reset here, so each league's own row labels repeat in
# df_records; feature_engineering calls reset_index(drop=True) later on.
df_records = pd.concat(df_temp, axis=0)

In [7]:
# df_records[["HT", "AT", "HCS", "ACS", "HL1S", "HL1R", "AL1S", "AL1R"]].iloc[20:50]

In [8]:
# Label a match by its total goal count (this is not a win/draw/loss label)
def get_result(row, threshold=3):
    """Return 1 when the match's combined goals exceed ``threshold``, else 0.

    Args:
        row: Mapping or pandas row holding the two scores under 'HCS' and 'ACS'.
        threshold: Total-goal count that must be exceeded for a positive
            label. The default of 3 means four or more goals in the match.

    Returns:
        int: 1 if ``row['HCS'] + row['ACS'] > threshold``, otherwise 0.
    """
    total_goals = row['HCS'] + row['ACS']
    if total_goals > threshold:
        return 1  # more than `threshold` goals in total
    else:
        return 0  # `threshold` goals or fewer
        

def feature_engineering(data, scaler=None, fit_scaler=True):
    """Turn a match/feature frame into a standardized, model-ready frame.

    Steps: add the ``result`` label via ``get_result``, keep only weeks after
    week 3, label-encode the team codes using the module-level ``teams`` list,
    drop the raw score/team columns, and standardize every remaining feature
    column. The ``result`` column is left unscaled.

    Args:
        data: DataFrame containing at least ``week``, ``HT``, ``AT``, ``HCS``
            and ``ACS``; any further columns are treated as numeric features.
        scaler: Optional scaler object exposing ``fit_transform``/``transform``.
            When omitted a new ``StandardScaler`` is created and fit on this
            data, which is the original behaviour.
        fit_scaler: When True (default) the scaler is fit on this data. Pass
            False together with an already-fitted ``scaler`` to apply the
            training-time scaling to new data instead of re-fitting.

    Returns:
        pandas.DataFrame: Scaled feature columns followed by ``result``.

    Raises:
        ValueError: If ``fit_scaler`` is False but no ``scaler`` was supplied.
    """
    if scaler is None:
        if not fit_scaler:
            raise ValueError("fit_scaler=False requires an already-fitted scaler")
        scaler = StandardScaler()

    df_result = data.copy()

    # Apply function to each row of the dataframe to create a new column
    df_result['result'] = df_result.apply(get_result, axis=1)

    # Rows from weeks 1-3 are dropped. The explicit copy makes the filtered
    # frame independent, so the column assignments below do not raise
    # SettingWithCopyWarning.
    df_result = df_result[df_result['week'] > 3].copy()

    # Encode team codes as integers, based on the module-level `teams` list
    label_encoder = LabelEncoder()
    label_encoder.fit(teams)
    df_result['HT_encoded'] = label_encoder.transform(df_result['HT'])
    df_result['AT_encoded'] = label_encoder.transform(df_result['AT'])

    # The raw scores define the label and the raw codes are now encoded
    df_result = df_result.drop(["HCS", "ACS", "HT", "AT"], axis=1)

    # A clean 0..n-1 index is needed for the column-wise concat at the end
    df_result = df_result.reset_index(drop=True)

    X = df_result.drop("result", axis=1)
    y = df_result["result"]

    # Fit on this data, or reuse statistics learned elsewhere (e.g. on the training set)
    X_scaled = scaler.fit_transform(X) if fit_scaler else scaler.transform(X)

    # Re-attach the unscaled label next to the scaled features
    data_preprocessed = pd.concat([pd.DataFrame(X_scaled, columns=X.columns), y], axis=1)

    return data_preprocessed


# NOTE(review): feature_engineering fits its StandardScaler on every row of
# df_records, and the train/test split only happens afterwards in the modelling
# cell, so the held-out rows contribute to the scaling statistics.
processed_df = feature_engineering(df_records)

In [9]:
processed_df.head(3)

Unnamed: 0,week,h_gf,h_ga,h_gd,h_pts,h_pos,HL1S,HL1R,H_win,H_loss,H_draws,H_wlr,H_ppg,a_gf,a_ga,a_gd,a_pts,a_pos,AL1S,AL1R,A_win,A_loss,A_draws,A_wlr,A_ppg,HT_encoded,AT_encoded,result
0,-1.683251,-1.208772,-1.449428,0.236635,-1.281173,-0.789747,0.697362,-1.063136,-1.073186,-1.321738,-1.578042,1.753668,1.336123,-1.530901,-1.574698,-0.008488,-1.289979,-1.118814,-0.377747,1.062088,-1.081913,-1.313126,-1.580799,1.735934,1.320169,0.605928,-1.124241,0
1,-1.683251,-1.463318,-1.449428,-0.067579,-1.401666,-0.270699,-1.108702,-1.063136,-1.275215,-1.321738,-1.28194,-0.185329,-0.05864,-1.276676,-1.640376,0.372374,-1.289979,-0.944961,2.903372,1.062088,-1.081913,-1.313126,-1.580799,1.735934,1.320169,-0.609401,0.434799,0
2,-1.683251,-1.272408,-1.514994,0.236635,-1.220926,-1.481811,-0.20567,1.288192,-1.073186,-1.529605,-1.28194,1.753668,2.033504,-1.721569,-1.706055,-0.08466,-1.531059,1.141273,-1.198027,-0.112417,-1.48585,-1.313126,-0.987947,-2.150599,-1.485995,0.779547,0.088346,0


# Modelling

In [10]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so all dependencies are visible in one place.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Work on a copy so processed_df itself stays untouched
dataset = processed_df.copy()

# Features are every column except the label; the label is the 0/1 'result' column
X = dataset.drop("result", axis=1)
y = dataset["result"]

# Random 80/20 split of the rows; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with default hyper-parameters and a fixed seed
rf_model = RandomForestClassifier(random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Predict the held-out 20 %
y_pred = rf_model.predict(X_test)

# Accuracy plus F1 averaged over both classes, weighted by class frequency
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('F1 score:', f1)


Accuracy: 0.7030612244897959
F1 score: 0.6074031417366847


In [11]:
# Agreement between rf_model and the true labels on the first `num` rows of processed_df.
# NOTE(review): rf_model was trained on a random 80 % of processed_df, so most of
# these rows were seen during training; this figure is not a held-out score.
num = 1000
actual_vals = processed_df["result"].head(num)

data = processed_df.drop("result", axis=1).head(num)
pred_vals = rf_model.predict(data).tolist()

# Count matching pairs inside a generator expression: its loop variables stay
# local, so the notebook-level target `y` from the modelling cell is not overwritten.
count = sum(actual == predicted for actual, predicted in zip(actual_vals, pred_vals))

# Divide by the number of rows actually scored; head(num) returns fewer than
# `num` rows when processed_df is shorter than that.
print("WIN % -", (count / len(pred_vals)) * 100)

WIN % - 93.89999999999999


# Testing the model on real data

In [12]:

# team = {"BUR": "LEE", "LUT": "LEI", "SHU": "SOU"}.get(team, team)
# print(team)


In [13]:
# teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL',
#          'NWC', 'BOU', 'BUR', 'LIV', 'WOL', 'MNU', 'LUT', 'SHU', 'BRI', 'CRY']

# # teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL',
# #          'NWC', 'BOU', 'LEI', 'LIV', 'WOL', 'MNU', 'LEE', 'SOU', 'BRI', 'CRY']

# path = "league_data"
# stake_records = [f"{path}/LN_1570.txt"]

# rec_list = []
# for test_record in stake_records:
#     # create a new empty league table to record scores & points
#     team_objects = [Team(team) for team in teams]

#     text = txt_reader(test_record)  # read each txt data

#     # adds scores & points from each league to the dataframe
#     new_df = add_features(table_creator(text))

#     rec_list.append(new_df)

# records_df = pd.concat(rec_list, axis=0)

# all_process_df = feature_engineering(records_df)
# all_process_df.shape[0]

In [14]:

# num = 360
# data = all_process_df.drop("result", axis=1).iloc[50:num]
# pred_results = rf_model.predict(data).tolist()

# actual_results = all_process_df["result"].iloc[50:num].tolist()

# total_played, total_won = 0, 0
# for x, y in zip(actual_results, pred_results):
#     if x == y and x == 1:
#         print(x, y)
#     if y == 1:
#         total_played += 1
#         if x == y:
#             total_won += 1
# print(f"{total_won} / {total_played}")


In [40]:
# Same club list as the data-loading cell, repeated so this cell sets `teams` itself.
teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL',
         'NWC', 'BOU', 'BUR', 'LIV', 'WOL', 'MNU', 'LUT', 'SHU', 'BRI', 'CRY']

stake_teams = txt_reader("league_data/stake_teams.txt")
# The fifth line of the file (index 4) must contain "WEEK <n>"; the text after
# "WEEK" is parsed as the week number of the upcoming fixtures.
new_week = int(stake_teams.split("\n")[4].split("WEEK")[1].strip())
# Drop the first 5 and last 9 lines, keep every third of the next 31 lines,
# then discard the first of those.
# NOTE(review): presumably this leaves exactly the 10 fixture lines - confirm
# against the layout of stake_teams.txt.
stake_teams = stake_teams.split("\n")[5:-9][0:31:3][1:]
stake_vals = []
for club in stake_teams:
    club = club.strip()
    h_team = club[:4].strip()  # home code: characters 0-3, whitespace removed
    a_team = club[6:10].strip()  # away code: characters 6-9, whitespace removed
    # Scores are unknown for upcoming fixtures, so 0-0 is used as a placeholder
    stake_vals.append((new_week, h_team, a_team, 0, 0))
stake_teams_df = pd.DataFrame(stake_vals, columns=["week", "HT", "AT", "HCS", "ACS"])


# Fresh all-zero table: update_table (called by add_features) reads this global
team_objects = [Team(team) for team in teams]

# Played matches of the current league, followed by the upcoming fixtures
stake_df = table_creator(txt_reader("league_data/LN_1570.txt"))
merge_df = pd.concat([stake_df, stake_teams_df], axis=0, ignore_index=True)
merge_df = add_features(merge_df)

# NOTE(review): feature_engineering creates and fits a new StandardScaler on
# merge_df alone, so these rows are scaled with different statistics than the
# data rf_model was trained on - confirm this is intended.
process_df = feature_engineering(merge_df)

# The fixtures were appended last, so the final 10 rows are the ones to predict.
# NOTE(review): feature_engineering drops weeks <= 3, so this assumes
# new_week > 3 and exactly 10 fixtures; otherwise these rows and the
# home/away names below no longer line up.
pred_data = process_df.drop("result", axis=1).tail(10)
pred_vals = rf_model.predict(pred_data).tolist()

home = merge_df["HT"].tail(10)
away = merge_df["AT"].tail(10)

# Prints "<home> - <away> = <prediction>"; the label follows get_result, i.e.
# 1 means more than 3 total goals are predicted and 0 means 3 or fewer.
for x, y, z in zip(home, away, pred_vals):
    print(x, "-", y, "=", z)


WOL - NWC = 0
BUR - FOR = 0
BRN - TOT = 0
BRI - SHU = 0
MNC - MNU = 0
EVE - CHE = 0
CRY - BOU = 0
WHU - ASV = 0
LUT - LIV = 0
FUL - ARS = 0


In [None]:
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.svm import SVC
# from xgboost import XGBClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, f1_score

# # Load your dataset into a pandas DataFrame
# dataset = processed_df.copy()

# # Load your dataset into X and y arrays
# X = dataset.drop("result", axis=1)
# y = dataset["result"]

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define a dictionary to store the models
# models = {
#     'Random Forest': RandomForestClassifier(random_state=42),
#     'SVC': SVC(random_state=42),
#     'XGBoost': XGBClassifier(random_state=42),
#     'Gradient Boost': GradientBoostingClassifier(random_state=42),
#     'Logistic Regression': LogisticRegression(random_state=42),
#     # 'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
# }

# # Calculate the F1 scores for each model
# model_scores = []
# for model_name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     accuracy = round(accuracy_score(y_test, y_pred), 3)
#     f1 = round(f1_score(y_test, y_pred, average='weighted'), 3)
#     model_scores.append((model_name, f1))
#     print(f'{model_name}:')
#     print('Accuracy:', accuracy)
#     print('F1 score:', f1)
#     print('-' * 40)

# # Sort the models based on F1 scores in descending order
# model_scores.sort(key=lambda x: x[1], reverse=True)
# model_names, f1_scores = zip(*model_scores)

# # Plot the horizontal bar graph
# plt.figure(figsize=(12, 7))
# plt.barh(model_names, f1_scores)  # <-- Use plt.barh() for horizontal bars
# plt.ylabel('Model')
# plt.xlabel('F1 Score')
# plt.title('F1 Scores of Different Models')

# # Add the scores beside each bar
# for i, score in enumerate(f1_scores):
#     plt.text(score, i, f'{score:.3f}', ha='left', va='center')  # <-- Adjust text position

# # Display the horizontal bar graph
# plt.show()