<a href="https://colab.research.google.com/github/Brandon-Sch/Sports-Score-Simulator/blob/main/PremierLeagueScorePredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Parsing

In [6]:
#import packages
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [7]:
#read data in from the csv file using pandas read
#specify encoding because different encoding than UTF-8
df = pd.read_csv("PremierLeagueData.csv", encoding='latin1')
#print the first 5 lines of the dataset
print(df.head())

    Season              DateTime     HomeTeam        AwayTeam  FTHG  FTAG FTR  \
0  1993-94  1993-08-14T00:00:00Z      Arsenal        Coventry     0     3   A   
1  1993-94  1993-08-14T00:00:00Z  Aston Villa             QPR     4     1   H   
2  1993-94  1993-08-14T00:00:00Z      Chelsea       Blackburn     1     2   A   
3  1993-94  1993-08-14T00:00:00Z    Liverpool  Sheffield Weds     2     0   H   
4  1993-94  1993-08-14T00:00:00Z     Man City           Leeds     1     1   D   

   HTHG  HTAG  HTR  ... HST  AST  HC  AC  HF  AF  HY  AY  HR  AR  
0   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  
1   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  
2   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  
3   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  
4   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  

[5 rows x 23 columns]


In [8]:
print(df.columns)

Index(['Season', 'DateTime', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
       'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC',
       'HF', 'AF', 'HY', 'AY', 'HR', 'AR'],
      dtype='object')


In [9]:
#drop the columns that aren't necessary
df = df.drop(columns = ['HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR'], axis = 1)
print(df.head())

    Season              DateTime     HomeTeam        AwayTeam  FTHG  FTAG FTR
0  1993-94  1993-08-14T00:00:00Z      Arsenal        Coventry     0     3   A
1  1993-94  1993-08-14T00:00:00Z  Aston Villa             QPR     4     1   H
2  1993-94  1993-08-14T00:00:00Z      Chelsea       Blackburn     1     2   A
3  1993-94  1993-08-14T00:00:00Z    Liverpool  Sheffield Weds     2     0   H
4  1993-94  1993-08-14T00:00:00Z     Man City           Leeds     1     1   D


In [10]:
# sorting date time
df['DateTime'] = pd.to_datetime(df['DateTime'])
df = df.sort_values('DateTime')

In [11]:
#Label Encoder because a model cannot read names and needs to read them in as numbers
encoder = LabelEncoder()
#goes trough the home teams and assigns them all a number
df['HomeTeam'] = encoder.fit_transform(df['HomeTeam'])
#goes through the away teams and assigns them all the same number they have for the home teams
df['AwayTeam'] = encoder.transform(df['AwayTeam'])
print(df.head())

    Season                  DateTime  HomeTeam  AwayTeam  FTHG  FTAG FTR
0  1993-94 1993-08-14 00:00:00+00:00         0        15     0     3   A
9  1993-94 1993-08-14 00:00:00+00:00        46        48     0     2   A
7  1993-94 1993-08-14 00:00:00+00:00        36        42     3     1   H
6  1993-94 1993-08-14 00:00:00+00:00        32        22     0     3   A
5  1993-94 1993-08-14 00:00:00+00:00        29        43     0     1   A


In [15]:
#Keep track of rolling averages for wins for home/away team

#gets the points.
#if the home team wins the match, and the team we are looking at is home we return 3 points
#if the away team wins and the team we are looking at is away then return 3 points
#if a draw return 1 regardless of team
#otherwise return 0
def get_points(result, is_home):
    if result == 'H' and is_home: return 3
    if result == 'A' and not is_home: return 3
    if result == 'D': return 1
    return 0

# For each match (row) in the DataFrame, calculate how many league points
# the home team earned. The get_points() function returns:
# 3 for a win, 1 for a draw, 0 for a loss.
# axis=1 means "apply this function to each row" instead of each column.
df['HomePoints'] = df.apply(lambda x: get_points(x['FTR'], True), axis=1)

# Do the same for the away team, but pass is_home=False.
# This way, the function returns points from the away team’s perspective.
df['AwayPoints'] = df.apply(lambda x: get_points(x['FTR'], False), axis=1)

print(df.head())

# For each team, look at their past matches as the home team,
# shift() moves results down one row so we don't include the current match
# (avoids "peeking into the future"),
# rolling(5).mean() calculates their average points from the previous 5 matches.
# This gives a "form" rating — how well the team has been doing recently at home.
df['HomeForm'] = (
    df.groupby('HomeTeam')['HomePoints']
      .transform(lambda x: x.shift().rolling(5).mean())
)

# Do the same for away teams — average points over their last 5 away matches.
df['AwayForm'] = (
    df.groupby('AwayTeam')['AwayPoints']
      .transform(lambda x: x.shift().rolling(5).mean())
)


    Season                  DateTime  HomeTeam  AwayTeam  FTHG  FTAG FTR  \
0  1993-94 1993-08-14 00:00:00+00:00         0        15     0     3   A   
9  1993-94 1993-08-14 00:00:00+00:00        46        48     0     2   A   
7  1993-94 1993-08-14 00:00:00+00:00        36        42     3     1   H   
6  1993-94 1993-08-14 00:00:00+00:00        32        22     0     3   A   
5  1993-94 1993-08-14 00:00:00+00:00        29        43     0     1   A   

   HomePoints  AwayPoints  HomeForm  AwayForm  HomeGoalsAvg  AwayGoalsAvg  
0           0           3       0.0       0.0           0.0           0.0  
9           0           3       0.0       0.0           0.0           0.0  
7           3           0       0.0       0.0           0.0           0.0  
6           0           3       0.0       0.0           0.0           0.0  
5           0           3       0.0       0.0           0.0           0.0  


In [16]:
# For each team, calculate how many goals they typically score at home.
# Step-by-step:
# 1. groupby('HomeTeam'): process each team separately.
# 2. ['FTHG']: use the 'Full Time Home Goals' column — how many goals the home team scored in each match.
# 3. transform(...): ensures the result stays aligned with the original DataFrame (so each row gets a value).
# 4. Inside transform:
#     - x.shift(): moves the data down by one row, so the current match does NOT include itself (avoids using future info).
#     - .rolling(5).mean(): computes the average goals over the team’s previous 5 home matches.
# The result is a new column showing each home team’s average goals scored in their last 5 home games before each match.
df['HomeGoalsAvg'] = (
    df.groupby('HomeTeam')['FTHG']
      .transform(lambda x: x.shift().rolling(5).mean())
)

# Do the same for away teams:
# - groupby('AwayTeam'): calculate stats for each team’s away matches.
# - ['FTAG']: use the 'Full Time Away Goals' column — goals scored by the away team in each match.
# - shift() and rolling(5).mean(): same idea — average goals from the last 5 away games, excluding the current one.
# This gives a measure of each team’s attacking strength when playing away.
df['AwayGoalsAvg'] = (
    df.groupby('AwayTeam')['FTAG']
      .transform(lambda x: x.shift().rolling(5).mean())
)


In [14]:
df = df.fillna(0)