In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

### Load data into pandas dataframes

In [10]:
seasonsData = pd.read_csv('trainingData/seasons.csv')
matchesData = pd.read_csv('trainingData/matches.csv')

# fill missing values with 0 and sort by date

seasonsData = seasonsData.fillna(0)
matchesData = matchesData.fillna(0)

def date_formatter(date):
    months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    date = date.split(' ')
    return f'{date[3]}-{months[date[2]]}-{date[1]}'

matchesData['Date'] = matchesData['Date'].apply(lambda x: pd.to_datetime(date_formatter(x)))
matchesData = matchesData.sort_values(by='Date')

# drop the 2024/25 season as it is incomplete
seasonsData = seasonsData[seasonsData['Season ID'] != '2024/25']

Unnamed: 0,Club Name,Season ID,Matches Played,Matches Won,Matches Lost,Aerial Battles/Duels Won,Big Chances Created,Blocked shots,Clean sheets,Clearances,...,Passes per match,Penalties scored,Red cards,Saves,Shooting accuracy %,Shots,Shots on target,Tackle success %,Tackles,Yellow cards
1,Arsenal,2023/24,38,28,5,2306,87,218,18,481,...,554.00,10,2,55,35%,657,228,57%,610,62
2,Arsenal,2022/23,38,26,6,2336,73,184,14,567,...,538.16,3,0,95,34%,593,204,60%,568,52
3,Arsenal,2021/22,38,22,13,2148,39,171,13,636,...,481.32,5,4,100,34%,589,198,58%,540,60
4,Arsenal,2020/21,38,18,13,2171,45,125,12,615,...,529.63,6,5,96,33%,459,152,55%,456,47
5,Arsenal,2019/20,38,14,10,2447,48,99,10,703,...,490.76,3,5,147,37%,406,151,58%,584,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,Wolverhampton-Wanderers,2018/19,38,16,13,2998,65,118,9,824,...,437.32,4,1,100,32%,477,152,57%,720,72
648,Wolverhampton-Wanderers,2011/12,38,5,23,1990,33,128,4,1178,...,391.05,3,4,9,32%,473,152,75%,619,64
649,Wolverhampton-Wanderers,2010/11,38,11,20,2407,38,132,5,1205,...,390.08,3,2,0,30%,459,139,71%,677,62
650,Wolverhampton-Wanderers,2009/10,38,9,18,2397,0,123,8,1618,...,343.08,2,4,7,27%,436,119,78%,705,63


In [13]:
# convert percentages to decimals

for column in matchesData.columns:
    if '%' in column:
        matchesData[column] = matchesData[column].apply(lambda x: float(x)/100)

for column in seasonsData.columns:
    if '%' in column:
        seasonsData[column] = seasonsData[column].apply(lambda x: float(x[:-1])/100)

In [18]:
# remove/convert all non-numeric columns
# convert club to id
clubNames = seasonsData['Club Name'].unique().tolist()

matchesData['Home Team'] = matchesData['Home Team'].apply(lambda x: clubNames.index(x))
matchesData['Away Team'] = matchesData['Away Team'].apply(lambda x: clubNames.index(x))

seasonsData['Club Name'] = seasonsData['Club Name'].apply(lambda x: clubNames.index(x))

# remove season ID
seasonsData = seasonsData.drop(columns=['Season ID'])

# remove date
matchesData = matchesData.drop(columns=['Date'])

ValueError: 'Sheffield-United' is not in list

In [17]:
seasonsData

Unnamed: 0,Club Name,Season ID,Matches Played,Matches Won,Matches Lost,Aerial Battles/Duels Won,Big Chances Created,Blocked shots,Clean sheets,Clearances,...,Passes per match,Penalties scored,Red cards,Saves,Shooting accuracy %,Shots,Shots on target,Tackle success %,Tackles,Yellow cards
1,Arsenal,2023/24,38,28,5,2306,87,218,18,481,...,554.00,10,2,55,0.35,657,228,0.57,610,62
2,Arsenal,2022/23,38,26,6,2336,73,184,14,567,...,538.16,3,0,95,0.34,593,204,0.60,568,52
3,Arsenal,2021/22,38,22,13,2148,39,171,13,636,...,481.32,5,4,100,0.34,589,198,0.58,540,60
4,Arsenal,2020/21,38,18,13,2171,45,125,12,615,...,529.63,6,5,96,0.33,459,152,0.55,456,47
5,Arsenal,2019/20,38,14,10,2447,48,99,10,703,...,490.76,3,5,147,0.37,406,151,0.58,584,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,Wolverhampton-Wanderers,2018/19,38,16,13,2998,65,118,9,824,...,437.32,4,1,100,0.32,477,152,0.57,720,72
648,Wolverhampton-Wanderers,2011/12,38,5,23,1990,33,128,4,1178,...,391.05,3,4,9,0.32,473,152,0.75,619,64
649,Wolverhampton-Wanderers,2010/11,38,11,20,2407,38,132,5,1205,...,390.08,3,2,0,0.30,459,139,0.71,677,62
650,Wolverhampton-Wanderers,2009/10,38,9,18,2397,0,123,8,1618,...,343.08,2,4,7,0.27,436,119,0.78,705,63
