In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Importing and cleaning data

In [2]:
# Read the export with no header row, so every line of the file (team-name rows,
# repeated header rows and match rows) arrives as data under integer column labels.
data = (
    pd.read_csv("datasets/PL  - Match Data.csv", header=None)
    # A row with nothing in column 1 holds a team name in column 0;
    # forward-fill that name onto the rows that follow it.
    .assign(Team=lambda raw: raw[0].where(raw[1].isna()).ffill())
    # Discard rows missing column 1 or column 2 (this removes the team-name rows).
    .dropna(subset=[1, 2])
    # Discard the repeated header rows, recognised by "Date" in column 0.
    .loc[lambda rows: rows[0] != "Date"]
    # Replace the integer labels with descriptive column names.
    .set_axis(
        [
            "Date", "Opponent", "Venue", "Result", "Score",
            "Goals Scored", "Goals Conceded", "xG", "xGA", "Team",
        ],
        axis=1,
    )
    # Renumber the surviving rows from 0.
    .reset_index(drop=True)
)

# Preview the first 12 cleaned rows.
data.head(12)

Unnamed: 0,Date,Opponent,Venue,Result,Score,Goals Scored,Goals Conceded,xG,xGA,Team
0,17 Aug 24,Wolverhampton,Home,Win,2-0,2,0,1.2,0.5,Arsenal
1,24 Aug 24,Aston Villa,Away,Win,2-0,2,0,0.9,1.2,Arsenal
2,31 Aug 24,Brighton,Home,Draw,1-1,1,1,2.1,1.7,Arsenal
3,15 Sep 24,Tottenham,Away,Win,1-0,1,0,0.7,0.7,Arsenal
4,22 Sep 24,Manchester City,Away,Draw,2-2,2,2,0.7,2.1,Arsenal
5,28 Sep 24,Leicester City,Home,Win,4-2,4,2,4.4,0.3,Arsenal
6,5 Oct 24,Southampton,Home,Win,3-1,3,1,2.8,0.6,Arsenal
7,19 Oct 24,Bournemouth,Away,Loss,0-2,0,2,0.7,1.8,Arsenal
8,27 Oct 24,Liverpool,Home,Draw,2-2,2,2,0.9,0.8,Arsenal
9,2 Nov 24,Newcastle,Away,Loss,0-1,0,1,1.1,0.5,Arsenal


# Processing data
For Venue: Home = 1, Away = 0

For Result: Win = 1, Draw = 0, Loss = -1

In [3]:
# Convert the goal and xG columns to numbers and derive the difference columns.
# These steps are safe to re-run: converting numeric data again is a no-op.
data['Goals Scored'] = pd.to_numeric(data['Goals Scored'])
data['Goals Conceded'] = pd.to_numeric(data['Goals Conceded'])
data['Goal Difference'] = data['Goals Scored'] - data['Goals Conceded']

data['xG'] = pd.to_numeric(data['xG'])
data['xGA'] = pd.to_numeric(data['xGA'])
data['xG Difference'] = data['xG'] - data['xGA']

# Integer codes for the categorical columns:
#   Venue:  Home = 1, Away = 0
#   Result: Win = 1, Draw = 0, Loss = -1
label_codes = {
    'Venue': {'Home': 1, 'Away': 0},
    'Result': {'Win': 1, 'Draw': 0, 'Loss': -1},
}

# Series.map() turns every value that is not a key of the dict into NaN, so
# mapping an already-encoded column would wipe it out. Only encode a column
# while it is still non-numeric, which makes this cell safe to run more than once.
for column, codes in label_codes.items():
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].map(codes)

# Preview the first 12 processed rows.
data.head(12)

Unnamed: 0,Date,Opponent,Venue,Result,Score,Goals Scored,Goals Conceded,xG,xGA,Team,Goal Difference,xG Difference
0,17 Aug 24,Wolverhampton,1,1,2-0,2,0,1.2,0.5,Arsenal,2,0.7
1,24 Aug 24,Aston Villa,0,1,2-0,2,0,0.9,1.2,Arsenal,2,-0.3
2,31 Aug 24,Brighton,1,0,1-1,1,1,2.1,1.7,Arsenal,0,0.4
3,15 Sep 24,Tottenham,0,1,1-0,1,0,0.7,0.7,Arsenal,1,0.0
4,22 Sep 24,Manchester City,0,0,2-2,2,2,0.7,2.1,Arsenal,0,-1.4
5,28 Sep 24,Leicester City,1,1,4-2,4,2,4.4,0.3,Arsenal,2,4.1
6,5 Oct 24,Southampton,1,1,3-1,3,1,2.8,0.6,Arsenal,2,2.2
7,19 Oct 24,Bournemouth,0,-1,0-2,0,2,0.7,1.8,Arsenal,-2,-1.1
8,27 Oct 24,Liverpool,1,0,2-2,2,2,0.9,0.8,Arsenal,0,0.1
9,2 Nov 24,Newcastle,0,-1,0-1,0,1,1.1,0.5,Arsenal,-1,0.6


In [4]:
# Columns used as model inputs; the list order is the column order of X.
# NOTE(review): 'Goals Scored', 'Goals Conceded' and 'Goal Difference' appear to
# determine 'Result' directly, so they may leak the target — confirm this is intended.
features = [
    'Venue',
    'Goals Scored',
    'Goals Conceded',
    'xG',
    'xGA',
    'Goal Difference',
    'xG Difference',
]

# Feature matrix (all rows, selected columns) and the target column.
X = data.loc[:, features]
y = data.loc[:, 'Result']

In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed for the shuffle performed by train_test_split, so the same rows land
# in the train and test sets on every run (Restart Kernel -> Run All reproduces
# the same split and therefore the same downstream results).
RANDOM_STATE = 42

# Hold out 30% of the rows for testing; the remaining 70% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)