In [654]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [655]:
pd.set_option('display.max_columns', None)

In [656]:
# FIVE, FOUR, THREE, TWO, ONE, ZERO = 5, 4, 3, 2, 1, 0

# data_table = pd.DataFrame(columns=["league_id", "week", "hour", "minute", "HT", "AT", "HCS", "ACS"])

In [657]:

def txt_reader(_path):
    """Return the whole text content of the file at ``_path``.

    Parameters
    ----------
    _path : str or path-like
        Location of the text file to read.

    Returns
    -------
    str
        Everything the file contains, read in text mode.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open/close pair did not guarantee.
    with open(_path, 'r') as file:
        return file.read()


def table_creator(contents):
    """Turn the raw text of one league file into a match-level DataFrame.

    The text is split on the literal ``"WEEK"``. The part before the first
    ``"WEEK"`` must contain ``"League"``; the four characters starting seven
    positions after it are read as the league number. Every later part is one
    week: its first line is the week header and the lines after it (except the
    final split element) are score lines.

    All fields are read by fixed character offsets:

    * week    -- first three characters of the week part
    * hour    -- header characters ``[-8:-6]``
    * minute  -- header characters ``[-5:-3]``
    * HT / AT -- score-line characters ``[0:3]`` / ``[8:11]``
    * HCS/ACS -- score-line characters ``[4]`` / ``[6]``

    NOTE: because the scores are single characters, only one-digit scores are
    read correctly.

    Parameters
    ----------
    contents : str
        Full text of a league file.

    Returns
    -------
    pandas.DataFrame
        One row per match with columns ``league_id, week, hour, minute, HT,
        AT, HCS, ACS``. Empty when no score line is present.

    Raises
    ------
    ValueError
        If ``"League"`` is missing from the preamble or a numeric field cannot
        be converted to ``int``.
    IndexError
        If a score line is shorter than the fixed offsets require.
    """
    preamble, *week_blocks = contents.split("WEEK")

    pos = preamble.index("League")
    league_no = int(preamble[(pos + 7): (pos + 11)])

    record_list = []
    for block in week_blocks:
        lines = block.split("\n")
        scores = lines[1:-1]
        if not scores:
            # A week without score lines contributes nothing, so its header
            # is never parsed.
            continue

        # Header fields are identical for every match of the week: parse them
        # once here instead of re-splitting the block for each score line.
        header = lines[0]
        week = int(block[:3].strip())
        hour = int(header[-8:-6])
        minute = int(header[-5:-3])

        for score in scores:
            record_list.append({
                "league_id": league_no,
                "week": week,
                "hour": hour,
                "minute": minute,

                "HT": score[0:3],
                "AT": score[8:11],
                "HCS": int(score[4]),
                "ACS": int(score[6])
            })

    return pd.DataFrame(record_list)

In [658]:
# record = txt_reader("sample_4.txt")
# df = table_creator(record)
# df.head()

In [659]:

class Team:
    """Running league-table entry for a single team.

    Every counter starts at zero and ``last_results`` starts empty; the
    values are filled in as match results are replayed.
    """

    def __init__(self, name):
        self.name = name
        # Goal tallies and their difference
        self.goals_for = self.goals_against = self.goal_difference = 0
        # Accumulated points and current rank in the table
        self.points = self.position = 0
        # Full result history, kept per instance
        self.last_results = list()
        # Most recent and second most recent result
        self.last_1_result = self.last_2_result = 0
        
def update_table(team_1, team_1_score, team_2, team_2_score):
    """Apply one match result to the global ``team_objects`` and re-rank them.

    Both teams are looked up by name in the module-level ``team_objects``
    (first match wins; ``StopIteration`` is raised when a name is absent).
    Goals, goal difference, points (3 / 1 / 0) and the result history
    (1 = win, 0 = draw, -1 = loss) are updated in place, then every team gets
    a 1-based ``position`` from a sort by points (descending) and lower-cased
    name (ascending).

    Returns the sorted list of team objects.
    """
    home = next(filter(lambda entry: entry.name == team_1, team_objects))
    away = next(filter(lambda entry: entry.name == team_2, team_objects))

    fixtures = (
        (home, team_1_score, team_2_score),
        (away, team_2_score, team_1_score),
    )
    for side, scored, conceded in fixtures:
        side.goals_for += scored
        side.goals_against += conceded
        side.goal_difference = side.goals_for - side.goals_against

        if scored > conceded:
            earned, outcome = 3, 1
        elif scored == conceded:
            earned, outcome = 1, 0
        else:
            earned, outcome = 0, -1

        side.points += earned
        side.last_results.append(outcome)
        side.last_1_result = side.last_results[-1]
        # Only one match played so far -> no "second to last" result yet
        side.last_2_result = side.last_results[-2] if len(side.last_results) > 1 else 0

    table = list(team_objects)
    table.sort(key=lambda entry: (-entry.points, entry.name.lower()))

    for rank, side in enumerate(table, start=1):
        side.position = rank

    return table


def get_record(table_objects):
    """Snapshot the table as ``{team name: (points, position, last_1, last_2)}``.

    ``table_objects`` is any iterable of objects exposing ``name``, ``points``,
    ``position``, ``last_1_result`` and ``last_2_result``.
    """
    return {
        entry.name: (
            entry.points,
            entry.position,
            entry.last_1_result,
            entry.last_2_result,
        )
        for entry in table_objects
    }


def add_features(data, snapshot_every=20, warmup_weeks=2):
    """Replay the matches in ``data`` and attach lagged league-table features.

    Each row is fed to ``update_table`` (which mutates the module-level
    ``team_objects``). The features written for a row do NOT come from the
    live table but from the most recent snapshot taken with ``get_record``;
    a new snapshot is taken after every ``snapshot_every`` rows. Rows whose
    ``week`` is not greater than ``warmup_weeks`` get eight zeros.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``HT``, ``HCS``, ``AT``, ``ACS`` and ``week``.
        It is not modified.
    snapshot_every : int, default 20
        Number of rows between two table snapshots.
    warmup_weeks : int, default 2
        Weeks up to and including this one receive placeholder zeros.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra columns ``h_pts, h_pos, HL1R, HL2R,
        a_pts, a_pos, AL1R, AL2R`` (home values first, then away values).
    """
    df = data.copy()

    # Snapshot the table as it stands before any row is replayed, so a lookup
    # is always defined. Previously a row past the warm-up weeks that arrived
    # before the first periodic snapshot raised UnboundLocalError.
    table_dict = get_record(team_objects)
    placeholder = (0,) * 8

    col_list = []

    # Count rows by position: the index label is not a reliable counter when
    # the frame does not carry a default 0..n-1 index.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        ht, hcs, at, acs = row["HT"], row["HCS"], row["AT"], row["ACS"]

        table = update_table(ht, hcs, at, acs)

        if row["week"] > warmup_weeks:
            col_list.append(table_dict[ht] + table_dict[at])
        else:
            col_list.append(placeholder)

        if position % snapshot_every == 0:
            table_dict = get_record(table)

    # Reuse the input index so the column-wise concat lines rows up one-to-one
    # instead of aligning on labels (which yields NaN rows for any index other
    # than 0..n-1).
    new_df = pd.DataFrame(
        col_list,
        columns=[
            "h_pts", "h_pos", "HL1R", "HL2R",
            "a_pts", "a_pos", "AL1R", "AL2R"],
        index=df.index,
    )

    df = pd.concat([df, new_df], axis=1)

    return df


# Reading the TXT data

In [660]:
# Folder holding one text file per league; the file names follow L_<league id>.txt
path = "league_data"
records = [
    f"{path}/L_{league_id}.txt"
    for league_id in (
        6095, 6097, 6099,
        6148, 6152, 6153,
        6155, 6156, 6166,
        6169, 6170, 6171,
        6173, 6180, 6181,
        6189, 6192, 6211,
        6212, 6213, 6214,
        6215, 6216, 6226,
        6227, 6230,
    )
]

# Data Preprocessing

In [661]:

# teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
#          'NWC', 'BOU', 'BUR', 'LIV', 'WOL', 'MNU', 'LUT', 'SHU', 'BRI', 'CRY']

# Team codes used for the training leagues. This module-level list is read
# both here (to build the empty table) and by feature_engineering (to fit
# its LabelEncoder).
teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
         'NWC', 'BOU', 'LEI', 'LIV', 'WOL', 'MNU', 'LEE', 'SOU', 'BRI', 'CRY']


# One feature-augmented DataFrame per league file
df_temp = []
for record in records:
    text = txt_reader(record)  # raw text of one league file
    new_df = table_creator(text)  # one row per match
    
    # Start every league from an empty table. update_table reads this
    # module-level name, so it must be rebound BEFORE add_features runs.
    team_objects = [Team(team) for team in teams]

    # replay the matches and attach the lagged table features
    new_df = add_features(new_df)

    df_temp.append(new_df)  # collect the per-league frame

# Stack all leagues; each league keeps its own row labels (no ignore_index)
df_records = pd.concat(df_temp, axis=0)

In [662]:

df_records.tail(3)

Unnamed: 0,league_id,week,hour,minute,HT,AT,HCS,ACS,h_pts,h_pos,HL1R,HL2R,a_pts,a_pos,AL1R,AL2R
377,6155,38,18,6,FUL,LEE,1,1,38,14,-1,1,45,9,1,1
378,6155,38,18,6,EVE,CHE,1,3,35,18,1,-1,75,2,1,-1
379,6155,38,18,6,MNU,BRI,2,1,48,8,0,-1,44,10,1,1


In [663]:
# df_records.head(50)

In [664]:
# df_records[df_records["week"] == 36].head(10)

In [665]:
# df_records[df_records["week"] == 37].head(10)

In [666]:
# df_records[df_records["week"] == 38].head(10)

In [667]:
# Define function to check home and away scores and return result
def get_result(row):
    """Encode a match outcome from the ``HCS`` / ``ACS`` scores of ``row``.

    Returns 1 for a home win, 2 for an away win and 0 otherwise (draw).
    """
    home_goals, away_goals = row['HCS'], row['ACS']

    if home_goals > away_goals:
        return 1  # home win
    if home_goals < away_goals:
        return 2  # away win
    return 0  # draw
        

def feature_engineering(data, scaler=None, label_encoder=None, fit=True):
    """Build the standardized model frame (features + ``result``) from match rows.

    Steps:

    1. add ``result`` via ``get_result`` (0 draw, 1 home win, 2 away win);
    2. keep only rows with ``week > 2``;
    3. label-encode ``HT`` / ``AT`` into ``HT_encoded`` / ``AT_encoded``;
    4. drop ``league_id, hour, minute, HCS, ACS, HT, AT``;
    5. standardize every remaining feature column and re-attach ``result``.

    Called with ``data`` only, a new ``LabelEncoder`` is fitted on the
    module-level ``teams`` list and a new ``StandardScaler`` is fitted on
    ``data`` itself, exactly as before. To put held-out data on the same scale
    and team codes as the training data, create the two objects once, pass
    them on the training call (they are fitted in place), then pass the same
    objects again with ``fit=False``.

    Parameters
    ----------
    data : pandas.DataFrame
        Output of ``add_features`` (needs ``week``, ``HT``, ``AT``, ``HCS``,
        ``ACS`` plus the columns listed in step 4). It is not modified.
    scaler : StandardScaler, optional
        Scaler to use. Created internally when omitted.
    label_encoder : LabelEncoder, optional
        Team-code encoder to use. Created internally when omitted.
    fit : bool, default True
        When True the encoder is fitted on ``teams`` and the scaler on the
        features of ``data``. When False both objects are only applied and
        must already be fitted.

    Returns
    -------
    pandas.DataFrame
        Scaled feature columns followed by the unscaled ``result`` column,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``fit=False`` is requested without both ``scaler`` and
        ``label_encoder``.
    """
    if not fit and (scaler is None or label_encoder is None):
        raise ValueError(
            "fit=False requires an already fitted scaler and label_encoder"
        )

    df_result = data.copy()

    # Target column, derived row by row from the two scores
    df_result['result'] = df_result.apply(get_result, axis=1)

    # Drop the warm-up weeks, whose table features are placeholders.
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice (SettingWithCopyWarning).
    df_result = df_result[df_result['week'] > 2].copy()

    if label_encoder is None:
        label_encoder = LabelEncoder()
    if fit:
        # Codes follow the sorted order of the CURRENT module-level `teams`
        label_encoder.fit(teams)

    df_result['HT_encoded'] = label_encoder.transform(df_result['HT'])
    df_result['AT_encoded'] = label_encoder.transform(df_result['AT'])

    # remove identifiers, kick-off time, the raw scores and the raw team codes
    df_result = df_result.drop(["league_id", "hour", "minute", "HCS", "ACS", "HT", "AT"], axis=1)

    # 0..n-1 index so the scaled frame and the target line up in the concat
    df_result = df_result.reset_index(drop=True)

    X = df_result.drop("result", axis=1)
    y = df_result["result"]

    if scaler is None:
        scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X) if fit else scaler.transform(X)

    # scaled features + untouched target
    data_preprocessed = pd.concat([pd.DataFrame(X_scaled, columns=X.columns), y], axis=1)

    return data_preprocessed


# Model-ready frame for all training leagues (scaled features + `result`)
processed_df = feature_engineering(df_records)

In [668]:
processed_df.head(3)

Unnamed: 0,week,h_pts,h_pos,HL1R,HL2R,a_pts,a_pos,AL1R,AL2R,HT_encoded,AT_encoded,result
0,-1.684588,-1.339106,-0.262961,-1.156756,1.14461,-1.283261,-1.125762,1.155484,0.017983,-0.60739,-1.646625,0
1,-1.684588,-1.456221,1.122275,0.021508,-1.180595,-1.283261,-0.95207,1.155484,0.017983,1.127617,-0.779908,1
2,-1.684588,-1.339106,-0.60927,1.199772,-1.180595,-1.165949,-1.473148,1.155484,1.179962,-1.474894,0.606839,1


# Modelling

In [669]:
# Random Forest baseline with a random 80/20 hold-out split.
# RandomForestClassifier, train_test_split, accuracy_score and f1_score come
# from the notebook's import cell.

# Work on a copy so processed_df itself is never altered by this cell
dataset = processed_df.copy()

# Features and target (0 = draw, 1 = home win, 2 = away win)
X = dataset.drop("result", axis=1)
y = dataset["result"]

# 20 % of the rows are held out; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Default hyper-parameters, seeded for reproducibility
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Score on the held-out rows only
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# weighted: per-class F1 averaged by class support
f1 = f1_score(y_test, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('F1 score:', f1)


Accuracy: 0.4742063492063492
F1 score: 0.4581324473093734


In [676]:
# Hit rate of rf_model on the first `num` rows of processed_df.
# NOTE(review): these rows are drawn from the same frame the model was
# trained on (only a random 20 % was held out), so this percentage is largely
# an in-sample figure and is not comparable to the hold-out accuracy.
num = 1000
actual_vals = processed_df["result"].head(num)

data = processed_df.drop("result", axis=1).head(num)
pred_vals = rf_model.predict(data).tolist()

# Names local to the generator: the previous `for x, y in ...` loop overwrote
# the notebook-level target `y` with an integer.
count = sum(
    1 for actual, predicted in zip(actual_vals, pred_vals) if actual == predicted
)

# head(num) may return fewer than `num` rows, so divide by what was scored
evaluated = len(pred_vals)

print("WIN % -", (count / evaluated) * 100)

WIN % - 89.2


# Testing the model on real data

In [671]:
# NOTE(review): `teams` is rebound here to a list that differs from the one
# used for training (BUR / LUT / SHU instead of LEI / LEE / SOU).
# feature_engineering fits its LabelEncoder on this module-level list, so the
# integer team codes produced below differ from the codes rf_model was
# trained on.
teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
         'NWC', 'BOU', 'BUR', 'LIV', 'WOL', 'MNU', 'LUT', 'SHU', 'BRI', 'CRY']

# teams = ['FOR', 'MNC', 'ASV', 'TOT', 'EVE', 'CHE', 'BRN', 'WHU', 'ARS', 'FUL', 
#          'NWC', 'BOU', 'LEI', 'LIV', 'WOL', 'MNU', 'LEE', 'SOU', 'BRI', 'CRY']

         
# Fresh empty table; update_table reads this module-level name, so it must be
# rebound before add_features is called
team_objects = [Team(team) for team in teams]

test_record = "league_data/new_record.txt"
df_new = []

text = txt_reader(test_record)  # raw text of the new league file

# parse the matches and attach the lagged table features
new_df = add_features(table_creator(text))

df_new.append(new_df)  # single-element list, kept for symmetry with training

records_df = pd.concat(df_new, axis=0)

# NOTE(review): feature_engineering fits a NEW StandardScaler on this data, so
# these features are standardized with this file's own mean/std rather than
# with the statistics of the training data.
process_df = feature_engineering(records_df)

In [672]:
process_df["result"].shape

(270,)

In [677]:
# Hit rate of rf_model on the first `num` rows of the new league file
num = 20
actual_vals = process_df["result"].head(num)

data = process_df.drop("result", axis=1).head(num)
pred_vals = rf_model.predict(data).tolist()

# Names local to the generator: the previous `for x, y in ...` loop overwrote
# the notebook-level target `y` with an integer.
count = sum(
    1 for actual, predicted in zip(actual_vals, pred_vals) if actual == predicted
)

# head(num) may return fewer than `num` rows, so report what was scored
evaluated = len(pred_vals)

print("Total Win Count:", count, "/", evaluated, "\nWIN % -", (count / evaluated) * 100)

Total Win Count: 7 / 20 
WIN % - 35.0


# Over 2.5

In [674]:
# Load the league record CSV (path is relative to the working directory)
# and show its (rows, columns) shape
league_record = pd.read_csv("league_record.csv")
league_record.shape

(9880, 44)

In [675]:
league_record.columns.tolist()

['league_id',
 'week',
 'hour',
 'minutes',
 'HT',
 'AT',
 'HCS',
 'ACS',
 'HL1S',
 'AL1S',
 'HL2S',
 'AL2S',
 'HL3S',
 'AL3S',
 'hl1_stat',
 'al1_stat',
 'hl2_stat',
 'al2_stat',
 'hl3_stat',
 'al3_stat',
 'h_wins',
 'a_wins',
 'h_draws',
 'a_draws',
 'h_loss',
 'a_loss',
 'h_gf',
 'a_gf',
 'h_ga',
 'a_ga',
 'h_gd',
 'a_gd',
 'hs_avg',
 'as_avg',
 'hw_rat',
 'aw_rat',
 'hl_rat',
 'al_rat',
 'hd_rat',
 'ad_rat',
 'ht_points',
 'at_points',
 'ht_pos',
 'at_pos']