In [1]:
import pandas as pd

In [2]:
all_matches = pd.read_csv("all_matches.csv",index_col=0)

In [3]:
all_matches.head(5)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot,sot%,g/sh,g/sot,dist,fk,pk,pkatt,season,team name
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,2.0,20.0,0.1,0.5,14.6,1.0,0.0,0.0,2022,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,7.0,36.8,0.21,0.57,13.0,0.0,0.0,0.0,2022,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,6.0,42.9,0.21,0.5,14.8,0.0,0.0,0.0,2022,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,8.0,36.4,0.09,0.25,15.5,1.0,0.0,0.0,2022,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,8.0,36.4,0.09,0.25,16.3,1.0,0.0,0.0,2022,Arsenal


In [4]:
all_matches.shape

(2112, 30)

### There are 38 matches each season, 20 teams playing and since we have datas for 3 seasons

In [5]:
38*20*3

2280

In [6]:
all_matches["team name"].value_counts()

Arsenal                     106
Chelsea                     106
Southampton                 106
Leicester City              106
Everton                     106
Leeds United                106
Crystal Palace              106
Wolverhampton Wanderers     106
Tottenham Hotspur           106
Aston Villa                 106
Manchester United           105
Liverpool                   105
Manchester City             105
West Ham United             105
Newcastle United            105
Brighton and Hove Albion    104
Burnley                      76
Brentford                    68
Fulham                       67
Watford                      38
Norwich City                 38
West Bromwich Albion         38
Sheffield United             38
Nottingham Forest            30
Bournemouth                  30
Name: team name, dtype: int64

In [7]:
all_matches["round"].value_counts()

Matchweek 1     60
Matchweek 18    60
Matchweek 30    60
Matchweek 29    60
Matchweek 27    60
Matchweek 26    60
Matchweek 7     60
Matchweek 24    60
Matchweek 12    60
Matchweek 23    60
Matchweek 2     60
Matchweek 21    60
Matchweek 20    60
Matchweek 19    60
Matchweek 22    60
Matchweek 17    60
Matchweek 9     60
Matchweek 3     60
Matchweek 4     60
Matchweek 16    60
Matchweek 6     60
Matchweek 8     60
Matchweek 5     60
Matchweek 10    60
Matchweek 11    60
Matchweek 13    60
Matchweek 14    60
Matchweek 15    60
Matchweek 25    58
Matchweek 28    54
Matchweek 31    40
Matchweek 32    40
Matchweek 34    40
Matchweek 35    40
Matchweek 36    40
Matchweek 33    40
Matchweek 37    40
Matchweek 38    40
Name: round, dtype: int64

#### There are less number of matches because 
- the season 2022-23  was ongoing while we scraped the data
- 3 teams are relegated each season and 3 teams are promoted 
- So we expect (~ 9 teams) to be having around 38 matches less each

## Cleaning our data for Machine Learning

In [8]:
all_matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
sot%            float64
g/sh            float64
g/sot           float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team name        object
dtype: object

- Machine Learning algorithms can't work with data that is an object
- Can only use columns that are numeric as predictors 
- Convert relevant feature columns to numeric datatype

In [9]:
# Convert date object to datetime
all_matches["date"] = pd.to_datetime(all_matches["date"])
all_matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
sot%                   float64
g/sh                   float64
g/sot                  float64
dist                   float64
fk                     float64
pk                     float64
pkatt                  float64
season                   int64
team name               object
dtype: object

## Creating relevant features

In [10]:
n_venue_types = all_matches['venue'].nunique()
n_venue_types            # home/away only

2

In [11]:
all_matches['venue_id'] = all_matches ['venue'].replace({'Home':0,'Away':1})
all_matches['opponent_id'] = all_matches["opponent"].astype("category").cat.codes

In [12]:
all_matches["hour"] = all_matches["time"].str.replace(":.+","", regex=True).astype("int")

In [13]:
all_matches["day_id"] = all_matches["date"].dt.dayofweek

In [14]:
all_matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,season,team name,venue_id,opponent_id,hour,day_id
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,14.6,1.0,0.0,0.0,2022,Arsenal,1,7,20,4
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,13.0,0.0,0.0,0.0,2022,Arsenal,0,11,15,5
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,14.8,0.0,0.0,0.0,2022,Arsenal,1,2,17,5
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,15.5,1.0,0.0,0.0,2022,Arsenal,0,9,17,5
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,16.3,1.0,0.0,0.0,2022,Arsenal,0,1,19,2


In [15]:
# Set up a target to actually predict
all_matches["target"] = (all_matches["result"] == "W").astype("int")

In [16]:
all_matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team name,venue_id,opponent_id,hour,day_id,target
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,1.0,0.0,0.0,2022,Arsenal,1,7,20,4,1
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,0.0,0.0,0.0,2022,Arsenal,0,11,15,5,1
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,0.0,0.0,0.0,2022,Arsenal,1,2,17,5,1
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,1.0,0.0,0.0,2022,Arsenal,0,9,17,5,1
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,1.0,0.0,0.0,2022,Arsenal,0,1,19,2,1


In [17]:
# Non-linearity can be captured using Random forest classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.svm import SVC
import xgboost as xgb

In [18]:
# min_samples_split = [6, 8, 10]            # hyperparameter
# random state = > since we have a lot of random parameters
rf_model = RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=1)   

## TRAIN-TEST SPLIT
- Since, the match data is a time-series data, we need to be careful while splitting the data into Train-test. 
- We must ensure that the test data comes after the training data because in the real world, we can not see the future data to train a model
- There are some (>2000) matches in our dataframe, we will split it to roughly 1600 training data and 500 test data

In [19]:
train_data = all_matches[all_matches["date"] < '2022-09-01']
test_data = all_matches[all_matches["date"] >= '2022-09-01']
print(f"Shape of our training data is {train_data.shape}")
print(f"Shape of our testing data is {test_data.shape}")

Shape of our training data is (1618, 35)
Shape of our testing data is (494, 35)


In [20]:
features = ["venue_id", "opponent_id", "hour", "day_id"]

In [21]:
rf_model.fit(train_data[features], train_data["target"])

In [22]:
pred = rf_model.predict(test_data[features])

### Prediction and Accuracy

In [23]:
accuracy = accuracy_score(test_data["target"], pred)
accuracy

0.6032388663967612

- A peep through into what went wrong

In [24]:
predicted_df = pd.DataFrame({'Predicted_Label': pred})

predicted_df.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)
# create a crosstab
ct = pd.crosstab(test_data['target'], predicted_df['Predicted_Label'], rownames=['Actual'], colnames=['Predicted'])
ct

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,234,68
1,128,64


## Observation 
- Our model is more often than not predicting if it would not win a match correctly
- However, It is pretty bad at predicting if a team will win or not
- Recall, precision = TP/(TP+FP)
- Since our model is to predict if a team will win or not given some conditions,we should go back and model better features in order to achieve better accuracy

In [25]:
precision_score(test_data["target"], pred)

0.48484848484848486

### Improving upon the model's accuracy/precision using LEVEL-2 features

### Intuition:
- We can use the data about the **performance of the past few matches** of a team to know about the team's form
- Use rolling average of various features like goals for, goals against, won/lost etc.
- LEVEL-2 features 

In [26]:
grouped_df = all_matches.groupby("team name") 

In [27]:
man_city_group = grouped_df.get_group("Manchester City").sort_values("date")
man_city_group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team name,venue_id,opponent_id,hour,day_id,target
0,2020-09-21,20:15,Premier League,Matchweek 2,Mon,Away,W,3.0,1.0,Wolves,...,2.0,1.0,1.0,2020,Manchester City,1,24,20,0,1
2,2020-09-27,16:30,Premier League,Matchweek 3,Sun,Home,L,2.0,5.0,Leicester City,...,1.0,0.0,0.0,2020,Manchester City,0,11,16,6,0
4,2020-10-03,17:30,Premier League,Matchweek 4,Sat,Away,D,1.0,1.0,Leeds United,...,1.0,0.0,0.0,2020,Manchester City,1,10,17,5,0
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,...,0.0,0.0,0.0,2020,Manchester City,0,0,17,5,1
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,...,1.0,0.0,0.0,2020,Manchester City,1,23,12,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,2023-02-25,17:30,Premier League,Matchweek 25,Sat,Away,W,4.0,1.0,Bournemouth,...,1.0,0.0,0.0,2022,Manchester City,1,2,17,5,1
39,2023-03-04,12:30,Premier League,Matchweek 26,Sat,Home,W,2.0,0.0,Newcastle Utd,...,2.0,0.0,0.0,2022,Manchester City,0,15,12,5,1
40,2023-03-11,17:30,Premier League,Matchweek 27,Sat,Away,W,1.0,0.0,Crystal Palace,...,1.0,1.0,1.0,2022,Manchester City,1,7,17,5,1
43,2023-04-01,12:30,Premier League,Matchweek 29,Sat,Home,W,4.0,1.0,Liverpool,...,1.0,0.0,0.0,2022,Manchester City,0,12,12,5,1


In [28]:
def compute_rolling_averages(df, columns, window_size, new_column_suffix='_rolling_avg'):
    """
    Compute rolling averages for specific columns in a pandas DataFrame and assign them to new columns in the same DataFrame.

    Parameters:
    df (pandas DataFrame): The input DataFrame.
    columns (list of str): The names of the columns to compute rolling averages for.
    window_size (int): The size of the rolling window.
    new_column_suffix (str, optional): The suffix to append to the names of the new columns. Defaults to '_rolling_avg'.

    Returns:
    pandas DataFrame: The updated DataFrame with the rolling average columns.
    """
    df = df.sort_values("date")
    new_column_names = []
    for column in columns:
        new_column_name = column + new_column_suffix
        new_column_names.append(new_column_name)
        df.loc[:, new_column_name] = df[column].rolling(window=window_size, closed='left').mean()
        
    
    df = df.dropna(subset=new_column_names)
    
    return df


In [29]:
cols_list = ["gf", "ga", "sh", "sot", "sot%" , "g/sh", "g/sot", "dist", "fk", "pk", "pkatt", "poss", "xg", "xga"]

In [30]:
import warnings
warnings.filterwarnings('ignore')
compute_rolling_averages(man_city_group, cols_list, 3)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot%_rolling_avg,g/sh_rolling_avg,g/sot_rolling_avg,dist_rolling_avg,fk_rolling_avg,pk_rolling_avg,pkatt_rolling_avg,poss_rolling_avg,xg_rolling_avg,xga_rolling_avg
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,...,32.366667,0.106667,0.550000,19.700000,1.333333,0.333333,0.333333,62.000000,1.333333,1.966667
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,...,24.700000,0.083333,0.533333,18.566667,0.666667,0.000000,0.000000,59.666667,1.133333,2.066667
9,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield Utd,...,30.933333,0.063333,0.446667,18.933333,0.666667,0.000000,0.000000,58.666667,1.166667,1.200000
11,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1.0,1.0,Liverpool,...,46.166667,0.070000,0.156667,19.033333,1.000000,0.000000,0.000000,64.000000,1.300000,0.566667
12,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0.0,2.0,Tottenham,...,44.433333,0.100000,0.256667,20.000000,1.000000,0.000000,0.333333,62.666667,1.333333,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,2023-02-25,17:30,Premier League,Matchweek 25,Sat,Away,W,4.0,1.0,Bournemouth,...,41.833333,0.166667,0.353333,15.166667,1.000000,0.333333,0.333333,59.000000,2.500000,0.933333
39,2023-03-04,12:30,Premier League,Matchweek 26,Sat,Home,W,2.0,0.0,Newcastle Utd,...,35.566667,0.173333,0.456667,15.133333,1.000000,0.000000,0.000000,58.666667,2.200000,1.166667
40,2023-03-11,17:30,Premier League,Matchweek 27,Sat,Away,W,1.0,0.0,Crystal Palace,...,26.133333,0.123333,0.480000,16.366667,1.666667,0.000000,0.000000,65.000000,2.033333,0.766667
43,2023-04-01,12:30,Premier League,Matchweek 29,Sat,Home,W,4.0,1.0,Liverpool,...,25.133333,0.110000,0.423333,16.700000,1.333333,0.333333,0.333333,63.000000,1.933333,0.533333


In [31]:
all_matches_rolling = all_matches.groupby("team name").apply(lambda x: compute_rolling_averages(x, cols_list, 3))
all_matches_rolling.index = range(all_matches_rolling.shape[0])
all_matches_rolling.shape

(1868, 49)

In [32]:
all_matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot%_rolling_avg,g/sh_rolling_avg,g/sot_rolling_avg,dist_rolling_avg,fk_rolling_avg,pk_rolling_avg,pkatt_rolling_avg,poss_rolling_avg,xg_rolling_avg,xga_rolling_avg
0,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,52.133333,0.256667,0.533333,14.633333,0.666667,0.000000,0.000000,50.000000,1.433333,1.600000
1,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,67.066667,0.290000,0.466667,15.366667,0.000000,0.000000,0.000000,53.333333,1.000000,1.633333
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,61.866667,0.193333,0.243333,16.566667,0.666667,0.000000,0.000000,46.666667,0.933333,1.400000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,47.966667,0.110000,0.133333,16.566667,1.000000,0.000000,0.000000,54.000000,0.733333,0.733333
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,25.766667,0.000000,0.000000,19.333333,1.000000,0.333333,0.333333,48.000000,0.866667,0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,2023-03-04,15:00,Premier League,Matchweek 26,Sat,Home,W,1.0,0.0,Tottenham,...,25.566667,0.043333,0.166667,14.133333,0.000000,0.000000,0.000000,53.333333,0.966667,1.400000
1864,2023-03-12,16:30,Premier League,Matchweek 27,Sun,Away,L,1.0,2.0,Newcastle Utd,...,37.500000,0.086667,0.233333,16.700000,0.000000,0.000000,0.000000,46.000000,0.700000,1.433333
1865,2023-03-18,15:00,Premier League,Matchweek 28,Sat,Home,L,2.0,4.0,Leeds United,...,48.200000,0.090000,0.150000,17.166667,0.333333,0.000000,0.000000,48.666667,0.800000,1.866667
1866,2023-04-01,15:00,Premier League,Matchweek 29,Sat,Away,D,1.0,1.0,Nott'ham Forest,...,51.466667,0.120000,0.233333,18.866667,0.666667,0.000000,0.000000,56.000000,1.433333,1.533333


In [33]:
all_matches_rolling[all_matches_rolling["team name"] == 'Manchester City']

# We observe that it is indeed in chronological order as desired for the Rolling Average

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot%_rolling_avg,g/sh_rolling_avg,g/sot_rolling_avg,dist_rolling_avg,fk_rolling_avg,pk_rolling_avg,pkatt_rolling_avg,poss_rolling_avg,xg_rolling_avg,xga_rolling_avg
1048,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,...,32.366667,0.106667,0.550000,19.700000,1.333333,0.333333,0.333333,62.000000,1.333333,1.966667
1049,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,...,24.700000,0.083333,0.533333,18.566667,0.666667,0.000000,0.000000,59.666667,1.133333,2.066667
1050,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield Utd,...,30.933333,0.063333,0.446667,18.933333,0.666667,0.000000,0.000000,58.666667,1.166667,1.200000
1051,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1.0,1.0,Liverpool,...,46.166667,0.070000,0.156667,19.033333,1.000000,0.000000,0.000000,64.000000,1.300000,0.566667
1052,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0.0,2.0,Tottenham,...,44.433333,0.100000,0.256667,20.000000,1.000000,0.000000,0.333333,62.666667,1.333333,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,2023-02-25,17:30,Premier League,Matchweek 25,Sat,Away,W,4.0,1.0,Bournemouth,...,41.833333,0.166667,0.353333,15.166667,1.000000,0.333333,0.333333,59.000000,2.500000,0.933333
1146,2023-03-04,12:30,Premier League,Matchweek 26,Sat,Home,W,2.0,0.0,Newcastle Utd,...,35.566667,0.173333,0.456667,15.133333,1.000000,0.000000,0.000000,58.666667,2.200000,1.166667
1147,2023-03-11,17:30,Premier League,Matchweek 27,Sat,Away,W,1.0,0.0,Crystal Palace,...,26.133333,0.123333,0.480000,16.366667,1.666667,0.000000,0.000000,65.000000,2.033333,0.766667
1148,2023-04-01,12:30,Premier League,Matchweek 29,Sat,Home,W,4.0,1.0,Liverpool,...,25.133333,0.110000,0.423333,16.700000,1.333333,0.333333,0.333333,63.000000,1.933333,0.533333


In [34]:
def predict(train_data, test_data, features, model):
    """
     Fits the model on the training data using the specified features and target column, makes predictions on the test data.
     It then creates a pandas DataFrame that contains both the actual target values and the predicted target values, 
     with the index set to the index of the test data. Computes the precision score as well.
     
    Args:
        train_data: a pandas DataFrame containing the training data
        test_data: a pandas DataFrame containing the testing data
        features: a list of strings containing the names of the features to use 
        model: a machine learning model object with a fit() and predict() method.
    
    Returns:
        error: precision score
        combined: a pandas df that contains both the actual target values and the predicted target values
        accuracy: accuracy score
                                                                                                                        """
    model.fit(train_data[features], train_data["target"])
    preds = model.predict(test_data[features])
    combined = pd.DataFrame(dict(actual=test_data["target"], predicted=preds), index=test_data.index)
    precision = precision_score(test_data["target"], preds)
    accuracy = accuracy_score(test_data["target"], preds)

    return combined, precision, accuracy
    
    

In [35]:
######### Train- test data #########

train_data = all_matches_rolling[all_matches_rolling["date"] < '2022-09-01']
test_data = all_matches_rolling[all_matches_rolling["date"] >= '2022-09-01']
print(f"Shape of our training data is {train_data.shape}")
print(f"Shape of our testing data is {test_data.shape}")


####### Preparing the new features #########
new_columns = []
for column in cols_list:
    new_column_name = column + '_rolling_avg'
    new_columns.append(new_column_name)

    

Shape of our training data is (1422, 49)
Shape of our testing data is (446, 49)


## Random Forest classifier (sample run)

In [36]:
# Deining a random forest model
rf_model = RandomForestClassifier(n_estimators=50, min_samples_split=2, random_state=6)  
new_features = features + new_columns

combined, precision, accuracy = predict(train_data, test_data, new_features, rf_model)

In [37]:
precision

0.6

In [38]:
accuracy

0.6457399103139013

In [39]:
combined

Unnamed: 0,actual,predicted
75,0,1
76,1,1
77,1,1
78,1,0
79,1,0
...,...,...
1863,1,0
1864,0,0
1865,0,0
1866,0,0


##### To get an insight on whether we are predicting some specific instance wrong, like wrong for a specific team only, 
- we need a combined dataframe

In [40]:
# Merge based on index in df1 and df2(wherever they match)
combined = combined.merge(all_matches_rolling[["date", "team name", "opponent", "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,predicted,date,team name,opponent,result
75,0,1,2022-09-04,Arsenal,Manchester Utd,L
76,1,1,2022-09-18,Arsenal,Brentford,W
77,1,1,2022-10-01,Arsenal,Tottenham,W
78,1,0,2022-10-09,Arsenal,Liverpool,W
79,1,0,2022-10-16,Arsenal,Leeds United,W
...,...,...,...,...,...,...
1863,1,0,2023-03-04,Wolverhampton Wanderers,Tottenham,W
1864,0,0,2023-03-12,Wolverhampton Wanderers,Newcastle Utd,L
1865,0,0,2023-03-18,Wolverhampton Wanderers,Leeds United,L
1866,0,0,2023-04-01,Wolverhampton Wanderers,Nott'ham Forest,D


In [41]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton", 
      "Manchester United": "Manchester Utd", 
      "Newcastle United": "Newcastle Utd",
      "Sheffield United": "Sheffield Utd",
      "Tottenham Hotspur": "Tottenham", 
      "West Ham United": "West Ham", 
      "Wolverhampton Wanderers": "Wolves",
      "Nottingham Forest": "Nott'ham Forest",
     "West Bromwich Albion": "West Brom"
} 

mapping = MissingDict(**map_values)

In [42]:
mapping["Arsenal"]

'Arsenal'

In [43]:
combined["new_team"] = combined["team name"].map(mapping)
combined

Unnamed: 0,actual,predicted,date,team name,opponent,result,new_team
75,0,1,2022-09-04,Arsenal,Manchester Utd,L,Arsenal
76,1,1,2022-09-18,Arsenal,Brentford,W,Arsenal
77,1,1,2022-10-01,Arsenal,Tottenham,W,Arsenal
78,1,0,2022-10-09,Arsenal,Liverpool,W,Arsenal
79,1,0,2022-10-16,Arsenal,Leeds United,W,Arsenal
...,...,...,...,...,...,...,...
1863,1,0,2023-03-04,Wolverhampton Wanderers,Tottenham,W,Wolves
1864,0,0,2023-03-12,Wolverhampton Wanderers,Newcastle Utd,L,Wolves
1865,0,0,2023-03-18,Wolverhampton Wanderers,Leeds United,L,Wolves
1866,0,0,2023-04-01,Wolverhampton Wanderers,Nott'ham Forest,D,Wolves


In [44]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
merged

Unnamed: 0,actual_x,predicted_x,date,team name_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team name_y,opponent_y,result_y,new_team_y
0,0,1,2022-09-04,Arsenal,Manchester Utd,L,Arsenal,1,1,Manchester United,Arsenal,W,Manchester Utd
1,1,1,2022-09-18,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
2,1,1,2022-10-01,Arsenal,Tottenham,W,Arsenal,0,0,Tottenham Hotspur,Arsenal,L,Tottenham
3,1,0,2022-10-09,Arsenal,Liverpool,W,Arsenal,0,0,Liverpool,Arsenal,L,Liverpool
4,1,0,2022-10-16,Arsenal,Leeds United,W,Arsenal,0,0,Leeds United,Arsenal,L,Leeds United
...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,1,0,2023-03-04,Wolverhampton Wanderers,Tottenham,W,Wolves,0,0,Tottenham Hotspur,Wolves,L,Tottenham
400,0,0,2023-03-12,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,0,Newcastle United,Wolves,W,Newcastle Utd
401,0,0,2023-03-18,Wolverhampton Wanderers,Leeds United,L,Wolves,1,0,Leeds United,Wolves,W,Leeds United
402,0,0,2023-04-01,Wolverhampton Wanderers,Nott'ham Forest,D,Wolves,0,0,Nottingham Forest,Wolves,D,Nott'ham Forest


In [45]:
ans = merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()
TP = ans[1]
FP = ans[0]          # n_times the model predicted x to win(1) but actually lost(0)
ans 

1    40
0    22
Name: actual_x, dtype: int64

Recall the definition of precision
- Precison = $ \frac{TP}{TP+FP} $

In [46]:
# Precision
precision = TP/(TP+FP)
precision

0.6451612903225806

In [47]:
# Create a SVM classifier with the Gaussian kernel
clf_model = SVC(kernel='rbf')


# create a Gradient Boosting Classifier model
gb_model = GradientBoostingClassifier(n_estimators=35, max_depth=3, learning_rate=0.1)


# LIST OF MODELS
models = [rf_model, clf_model, gb_model]
combined_df_list = []


for model in models:
    combined, precision, accuracy = predict(train_data, test_data, features + new_columns, model)
    combined_df_list.append(combined)
    print(f"Precision score using the model {model} = {precision}")
    print(f"Accuracy score using the model {model} = {accuracy}")
    print("\n")
        

Precision score using the model RandomForestClassifier(n_estimators=50, random_state=6) = 0.6
Accuracy score using the model RandomForestClassifier(n_estimators=50, random_state=6) = 0.6457399103139013


Precision score using the model SVC() = 0.5833333333333334
Accuracy score using the model SVC() = 0.6098654708520179


Precision score using the model GradientBoostingClassifier(n_estimators=35) = 0.6125
Accuracy score using the model GradientBoostingClassifier(n_estimators=35) = 0.6457399103139013




## Further Improvements
- Feature engineering - Add more relevant features
- Tuning Hyperparameters - Perform grid search CV to find the optimal hyperparamter values
- May try Xgboost

In [48]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

# Define the parameter grid for tuning
param_grid = {
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
    'n_estimators': [100, 500, 1000, 2000]
}

# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42)

# Perform a random search for the best hyperparameters
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    random_state=42
)

# Fit the random search to the data
random_search.fit(train_data[new_features], train_data["target"])

# Print the best hyperparameters and the AUC score
print("Best Hyperparameters:", random_search.best_params_)
print("AUC Score:", random_search.best_score_)



Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
AUC Score: 0.65419948722701


In [49]:
# use the best hyperparameters to train a final model on the entire training data
best_model = xgb.XGBClassifier(**random_search.best_params_)
best_model.fit(train_data[new_features], train_data["target"])

# use the final model to make predictions on the test data
preds = best_model.predict(test_data[new_features])

# evaluate the final model using the AUC score
auc_score = roc_auc_score(test_data["target"], preds)
print("AUC score:", auc_score)
print("Precision Score:", precision_score(test_data["target"], preds))
print("Accuracy Score:", accuracy_score(test_data["target"], preds))
                                

AUC score: 0.5762415824915825
Precision Score: 0.5903614457831325
Accuracy Score: 0.6390134529147982


## Trying a simple deep learning model

In [93]:
# define best model
model = Sequential()
model.add(Dense(64, activation='sigmoid', input_dim=X_train.shape[1]))
model.add(Dropout(0.0))
model.add(Dense(32, activation='sigmoid'))
model.add(Dropout(0.0))
model.add(Dense(1, activation='sigmoid'))

# compile model
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

# fit model
model.fit(X_train, train_data["target"] , epochs=100, batch_size=32, validation_data=(X_test, test_data["target"]))

# evaluate model
score = model.evaluate(X_test, test_data["target"], batch_size=32)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print(score)

# Predict on test data
y_pred = model.predict(X_test)
y_pred_binary = np.where(y_pred >= 0.5, 1, 0)
    
# print(y_pred)
print("Precision Score:", precision_score(test_data["target"], y_pred_binary))


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Test loss: 0.6457798480987549
Test accuracy: 0.6726457476615906
[0.6457798480987549, 0.6726457476615906]
Precision Score: 0.6666666666666666
