# 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
import warnings


warnings.filterwarnings("ignore")

### Cleaning and Preprocessing


In [2]:
df_20_21 = pd.read_csv("E0.csv")
df_19_20 = pd.read_csv("season_19_20.csv")
df_18_19 = pd.read_csv("season_18_19csv.csv")
df_18_17 = pd.read_csv("season-1718_csv.csv")

In [3]:
df = pd.concat([df_18_17, df_18_19, df_19_20, df_20_21], join="inner")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1520 entries, 0 to 379
Data columns (total 44 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Div       1520 non-null   object 
 1   Date      1520 non-null   object 
 2   HomeTeam  1520 non-null   object 
 3   AwayTeam  1520 non-null   object 
 4   FTHG      1520 non-null   int64  
 5   FTAG      1520 non-null   int64  
 6   FTR       1520 non-null   object 
 7   HTHG      1520 non-null   int64  
 8   HTAG      1520 non-null   int64  
 9   HTR       1520 non-null   object 
 10  Referee   1520 non-null   object 
 11  HS        1520 non-null   int64  
 12  AS        1520 non-null   int64  
 13  HST       1520 non-null   int64  
 14  AST       1520 non-null   int64  
 15  HF        1520 non-null   int64  
 16  AF        1520 non-null   int64  
 17  HC        1520 non-null   int64  
 18  AC        1520 non-null   int64  
 19  HY        1520 non-null   int64  
 20  AY        1520 non-null   int64

In [5]:
df = df[["HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG",
        "HS", "AS", "HST", "AST", "HC", "AC", "HF", "AF", "HR", "AR"]]
        

In [6]:
X = df.drop(["FTHG", "FTAG"], axis = 1)
y = df[["FTHG", "FTAG"]]

In [7]:
X_dummy = pd.get_dummies(X)

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1520 entries, 0 to 379
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   HomeTeam  1520 non-null   object
 1   AwayTeam  1520 non-null   object
 2   HTHG      1520 non-null   int64 
 3   HTAG      1520 non-null   int64 
 4   HS        1520 non-null   int64 
 5   AS        1520 non-null   int64 
 6   HST       1520 non-null   int64 
 7   AST       1520 non-null   int64 
 8   HC        1520 non-null   int64 
 9   AC        1520 non-null   int64 
 10  HF        1520 non-null   int64 
 11  AF        1520 non-null   int64 
 12  HR        1520 non-null   int64 
 13  AR        1520 non-null   int64 
dtypes: int64(12), object(2)
memory usage: 178.1+ KB


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, train_size=0.80, random_state=1)

In [10]:
scaler = StandardScaler()

In [11]:
scaler.fit_transform(X_train)

array([[ 1.61179602,  0.56866175,  0.45298074, ..., -0.15630528,
        -0.23375215, -0.19375929],
       [-0.7930432 ,  0.56866175, -2.21229946, ..., -0.15630528,
        -0.23375215, -0.19375929],
       [-0.7930432 ,  1.84683679, -0.79081669, ..., -0.15630528,
        -0.23375215,  5.16104288],
       ...,
       [ 0.40937641,  1.84683679, -0.25776065, ..., -0.15630528,
        -0.23375215, -0.19375929],
       [ 0.40937641, -0.70951328,  1.34140747, ..., -0.15630528,
        -0.23375215, -0.19375929],
       [-0.7930432 , -0.70951328,  0.27529539, ..., -0.15630528,
         4.27803545, -0.19375929]])

### Modelling and Results using Linear Regression

In [12]:
lin_model_dummy = LinearRegression()

In [13]:
lin_model_dummy.fit(X_train, y_train)

LinearRegression()

In [14]:
y_pred = lin_model_dummy.predict(X_test)

In [15]:
print(metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.62385088911182
0.630375831732253
0.7939621097585533


### Modelling without team names

In [16]:
X = X.drop(["HomeTeam", "AwayTeam"], axis=1)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=1)

In [18]:
lin_model = LinearRegression()

In [19]:
lin_model.fit(X_train, y_train)


LinearRegression()

In [20]:
lin_pred = lin_model.predict(X_test)

In [21]:
print(metrics.mean_absolute_error(y_test, lin_pred))
print(metrics.mean_squared_error(y_test, lin_pred))
print(np.sqrt(metrics.mean_squared_error(y_test, lin_pred)))

0.626516106454666
0.6381344195173791
0.7988331612529485


In [22]:
scaler = StandardScaler()

In [23]:
scaler.fit_transform(X_train)

array([[ 1.61179602,  0.56866175,  0.45298074, ..., -0.15261246,
        -0.22226832, -0.25166528],
       [-0.7930432 ,  0.56866175, -2.21229946, ..., -0.72012546,
         4.20852306, -0.25166528],
       [-0.7930432 ,  1.84683679, -0.79081669, ..., -1.00388196,
        -0.22226832, -0.25166528],
       ...,
       [ 0.40937641,  1.84683679, -0.25776065, ...,  1.26617004,
        -0.22226832, -0.25166528],
       [ 0.40937641, -0.70951328,  1.34140747, ..., -0.15261246,
        -0.22226832, -0.25166528],
       [-0.7930432 , -0.70951328,  0.27529539, ..., -1.00388196,
        -0.22226832, -0.25166528]])

In [24]:
lin_model_scale = LinearRegression()

In [25]:
lin_model_scale.fit(X_train, y_train)

LinearRegression()

In [26]:
lin_scale_predict = lin_model_scale.predict(X_test)

In [27]:
print(metrics.mean_absolute_error(y_test, lin_scale_predict))
print(metrics.mean_squared_error(y_test, lin_scale_predict))
print(np.sqrt(metrics.mean_squared_error(y_test, lin_scale_predict)))

0.626516106454666
0.6381344195173791
0.7988331612529485


In [28]:
lin_scale_predict[0:3]

array([[1.41035491, 1.83452738],
       [1.46456581, 0.57434174],
       [2.33674731, 1.96012422]])

In [29]:
y_test[0:3]

Unnamed: 0,FTHG,FTAG
332,1,2
368,1,0
12,2,5


### Modelling with Logistic Regression


In [30]:
log_model_hg = LogisticRegression()
log_model_ag = LogisticRegression()

In [31]:
y_train_hg = y_train["FTHG"]
y_test_hg = y_test["FTHG"]
y_train_ag = y_train["FTAG"]
y_test_ag = y_test["FTAG"]

In [32]:
X_train.shape

(1216, 12)

In [33]:
y_train_ag.shape

(1216,)

In [34]:
log_model_hg.fit(X_train, y_train_hg)
log_model_ag.fit(X_train, y_train_ag)

LogisticRegression()

In [35]:
y_pred_hg = log_model_hg.predict(X_test)
y_pred_ag = log_model_ag.predict(X_test)

In [36]:
print(accuracy_score(y_test_hg, y_pred_hg))
print(accuracy_score(y_test_ag, y_pred_ag))

0.4506578947368421
0.5197368421052632


In [37]:
print(classification_report(y_test_hg, y_pred_hg))
print(classification_report(y_test_ag, y_pred_ag))

              precision    recall  f1-score   support

           0       0.56      0.63      0.59        73
           1       0.42      0.52      0.46        93
           2       0.43      0.38      0.40        76
           3       0.35      0.27      0.31        33
           4       0.25      0.06      0.10        16
           5       0.40      0.33      0.36        12
           6       0.00      0.00      0.00         1

    accuracy                           0.45       304
   macro avg       0.34      0.31      0.32       304
weighted avg       0.44      0.45      0.44       304

              precision    recall  f1-score   support

           0       0.63      0.84      0.72        93
           1       0.49      0.52      0.50        98
           2       0.48      0.29      0.36        75
           3       0.23      0.23      0.23        26
           4       0.25      0.20      0.22         5
           5       0.00      0.00      0.00         6
           7       0.00 

### Logistic Regression with dummy columns

In [38]:
X_dummy.shape

(1520, 66)

In [39]:
y.shape

(1520, 2)

In [40]:
y_hg_dummy = y["FTHG"]
y_ag_dummy = y["FTAG"]

In [41]:
X_train_hg_dum, X_test_hg_dum, y_train_hg_dum, y_test_hg_dum = train_test_split(X_dummy, y_hg_dummy, train_size=0.80,
                                                                               random_state=1)

In [42]:
X_train_ag_dum, X_test_ag_dum, y_train_ag_dum, y_test_ag_dum = train_test_split(X_dummy, y_ag_dummy, train_size=0.80,
                                                                               random_state=1)

In [43]:
log_dummy_hg = LogisticRegression()
log_dummy_ag = LogisticRegression()

In [44]:
log_dummy_hg.fit(X_train_hg_dum, y_train_hg_dum)
log_dummy_ag.fit(X_train_ag_dum, y_train_ag_dum)

LogisticRegression()

In [45]:
y_pred_hg_dum = log_dummy_hg.predict(X_test_hg_dum)
y_pred_ag_dum = log_dummy_ag.predict(X_test_ag_dum)

In [46]:
print(accuracy_score(y_test_hg_dum, y_pred_hg_dum))
print(accuracy_score(y_test_ag_dum, y_pred_ag_dum))

0.4375
0.4934210526315789


In [47]:
print(classification_report(y_test_hg_dum, y_pred_hg_dum))
print(classification_report(y_test_ag_dum, y_pred_ag_dum))

              precision    recall  f1-score   support

           0       0.55      0.56      0.55        73
           1       0.41      0.51      0.45        93
           2       0.43      0.38      0.40        76
           3       0.32      0.30      0.31        33
           4       0.33      0.12      0.18        16
           5       0.40      0.33      0.36        12
           6       0.00      0.00      0.00         1

    accuracy                           0.44       304
   macro avg       0.35      0.32      0.32       304
weighted avg       0.43      0.44      0.43       304

              precision    recall  f1-score   support

           0       0.61      0.76      0.68        93
           1       0.45      0.49      0.47        98
           2       0.50      0.32      0.39        75
           3       0.23      0.23      0.23        26
           4       0.20      0.20      0.20         5
           5       0.00      0.00      0.00         6
           6       0.00 