In [1]:
import pandas as pd
import chardet

In [2]:
# Detect the CSV's text encoding from its raw bytes, then load the file with it.
merged_csv_path = "./Datasets/merged_cleaned.csv"

with open(merged_csv_path, "rb") as f:
    # The whole file content is handed to chardet, not just a prefix.
    result = chardet.detect(f.read())

# The first CSV column becomes the index (shown as "Div" in the previews below).
matches = pd.read_csv(
    merged_csv_path,
    index_col=0,
    encoding=result["encoding"],
)

In [3]:
# Parse "Date" into datetimes so the .dt accessor can be used further down.
# NOTE(review): no explicit format/dayfirst is passed — confirm the CSV stores
# dates in an unambiguous form (e.g. ISO yyyy-mm-dd).
matches["Date"] = pd.to_datetime(matches["Date"])

In [4]:
# Encode the "HTR" labels as integer category codes.
# The head() preview shows the mapping A -> 0, D -> 1, H -> 2.
matches["htr_code"] = matches["HTR"].astype("category").cat.codes

In [5]:
# Encode the "FTR" labels as integer category codes.
# The head() preview shows the mapping A -> 0, D -> 1, H -> 2.
matches["ftr_code"] = matches["FTR"].astype("category").cat.codes

In [6]:
# Encode the away team name as an integer category code (one code per distinct value).
matches["opp_code"] = matches["AwayTeam"].astype("category").cat.codes

In [7]:
# Reduce "Time" to an integer by dropping everything from the first ":" onward
# (the head() preview shows resulting values such as 19 and 14).
matches["Time"] = matches["Time"].astype(str).str.replace(":.+", "", regex=True).astype(int)

In [8]:
# Day of the week as an integer (pandas: Monday=0 ... Sunday=6);
# e.g. 2019-08-16 maps to 4 in the head() preview.
matches["day_code"] = matches["Date"].dt.day_of_week

In [9]:
# Binary target: True when the full-time result label is "H", False otherwise.
# Compare the raw label rather than `ftr_code == 2`: category codes depend on
# which labels are present (with no "D" rows, "H" would be encoded as 1), so
# the numeric comparison silently breaks on subsets or re-encoded data.
matches["target"] = matches["FTR"] == "H"

In [10]:
# Model input columns, in the order the fitted models see them (feature
# importances are indexed by position in this list, so the order matters).
# NOTE(review): FTHG/FTAG (and the other in-match columns) describe the match
# outcome itself — presumably full-time goals; treat as potential target leakage.
predictors = [
    # Scoreline and in-match statistics
    "FTHG", "FTAG", "HTHG", "HTAG",
    "HS", "AS", "HST", "AST", "HF", "AF", "HC", "AC", "HY", "AY", "HR", "AR",
    # Per-bookmaker H/D/A columns (presumably pre-match odds — confirm)
    "B365H", "B365D", "B365A", "BWH", "BWD", "BWA", "IWH", "IWD", "IWA",
    "PSH", "PSD", "PSA", "WHH", "WHD", "WHA", "VCH", "VCD", "VCA",
    "MaxH", "MaxD", "MaxA", "AvgH", "AvgD", "AvgA",
    # Over/under 2.5 columns
    "B365>2.5", "B365<2.5", "P>2.5", "P<2.5",
    "Max>2.5", "Max<2.5", "Avg>2.5", "Avg<2.5",
    # AH* columns
    "AHh", "B365AHH", "B365AHA", "PAHH", "PAHA",
    "MaxAHH", "MaxAHA", "AvgAHH", "AvgAHA",
    # "C"-variant H/D/A columns
    "B365CH", "B365CD", "B365CA", "BWCH", "BWCD", "BWCA", "IWCH", "IWCD", "IWCA",
    "PSCH", "PSCD", "PSCA", "WHCH", "WHCD", "WHCA", "VCCH", "VCCD", "VCCA",
    "MaxCH", "MaxCD", "MaxCA", "AvgCH", "AvgCD", "AvgCA",
    # "C"-variant over/under 2.5 columns
    "B365C>2.5", "B365C<2.5", "PC>2.5", "PC<2.5",
    "MaxC>2.5", "MaxC<2.5", "AvgC>2.5", "AvgC<2.5",
    # "C"-variant AH* columns
    "AHCh", "B365CAHH", "B365CAHA", "PCAHH", "PCAHA",
    "MaxCAHH", "MaxCAHA", "AvgCAHH", "AvgCAHA",
    # Columns engineered earlier in this notebook
    "opp_code", "day_code",
]

In [11]:
# Preview the first rows, including the engineered *_code and target columns.
matches.head()

Unnamed: 0_level_0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,htr_code,ftr_code,opp_code,day_code,target
Div,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D1,2019-08-16,19,Bayern Munich,Hertha,2,2,D,1,2,A,...,1.93,2.04,1.93,1.98,1.91,0,1,10,4,False
D1,2019-08-17,14,Dortmund,Augsburg,5,1,H,1,1,D,...,2.0,1.98,2.04,1.91,1.97,1,2,0,5,True
D1,2019-08-17,14,Freiburg,Mainz,3,0,H,0,0,D,...,1.97,1.97,2.06,1.9,1.99,1,2,14,5,True
D1,2019-08-17,14,Leverkusen,Paderborn,3,2,H,2,2,D,...,1.86,2.15,1.91,2.03,1.85,1,2,15,5,True
D1,2019-08-17,14,Werder Bremen,Fortuna Dusseldorf,1,3,A,0,1,A,...,2.0,1.95,2.11,1.89,2.0,0,0,7,5,False


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [13]:
# Baseline: train and evaluate a gradient-boosting classifier on all predictors,
# provided every predictor column exists in the frame.
if all(col in matches.columns for col in predictors):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(matches[predictors], matches["target"], test_size=0.2, random_state=42)

    # Create a Gradient Boosting Classifier and train it on the data
    gbm = GradientBoostingClassifier(random_state=42)
    gbm.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = gbm.predict(X_test)

    # Evaluate the model
    acc = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
else:
    # Keep the problem visible as the cell's result instead of discarding a bare string.
    acc = "Some predictor columns are missing in the dataset."
    class_report = "Some predictor columns are missing in the dataset."

# A notebook only renders a bare expression when it is the last top-level
# statement of the cell, so the result is displayed here rather than inside the `if`.
acc, class_report

In [14]:
# Random 80/20 hold-out split over all predictors (seeded for reproducibility).
# NOTE(review): the split is random rather than chronological — confirm that is intended for match data.
X_train, X_test, y_train, y_test = train_test_split(matches[predictors], matches["target"], test_size=0.2, random_state=42)

In [15]:
# Create a Gradient Boosting Classifier with default hyper-parameters and a fixed
# seed; it is fitted on the training split in the next cell.
gbm = GradientBoostingClassifier(random_state=42)

In [16]:
# Fit the classifier on the training split.
gbm.fit(X_train, y_train)

In [17]:
# Predict the target for the held-out test rows.
y_pred = gbm.predict(X_test)

In [18]:
# Evaluate the model on the hold-out split: overall accuracy plus the
# per-class precision/recall/F1 text report.
acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display both as the cell result.
acc, class_report

(1.0,
 '              precision    recall  f1-score   support\n\n       False       1.00      1.00      1.00       102\n        True       1.00      1.00      1.00        82\n\n    accuracy                           1.00       184\n   macro avg       1.00      1.00      1.00       184\nweighted avg       1.00      1.00      1.00       184\n')

In [19]:
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np

In [20]:
# 1. Feature Importance
# One importance value per predictor, in the same order as `predictors`.
feature_importance = gbm.feature_importances_
# Predictor positions ordered from most to least important.
sorted_idx = np.argsort(feature_importance)[::-1]

In [21]:
# 2. Cross-Validation
# 5-fold cross-validation of the classifier, using the training split only.
cross_val_scores = cross_val_score(gbm, X_train, y_train, cv=5)

In [22]:
# 3. Data Inspection: Already done during the data cleaning and transformation steps.

# Note: `feature_importance[:10]` is the first ten predictors in list order,
# not the ten most important ones (use `sorted_idx` for a ranking).
feature_importance[:10], cross_val_scores, np.mean(cross_val_scores)

(array([6.21601668e-01, 3.77751396e-01, 1.29014075e-05, 0.00000000e+00,
        0.00000000e+00, 2.61016132e-04, 0.00000000e+00, 2.30421494e-06,
        0.00000000e+00, 0.00000000e+00]),
 array([1., 1., 1., 1., 1.]),
 1.0)

In [23]:
# 1. Correlation Analysis
# Calculate the correlation between the target variable and the highly important features
# `sorted_idx[:2]` selects the two predictors with the largest importances.
highly_important_features = np.array(predictors)[sorted_idx[:2]]
# Pairwise correlations between the target and those two columns.
correlation_matrix = matches[["target"] + list(highly_important_features)].corr()

In [24]:
# 2. Model Exclusion of Suspect Features
# Remove the highly important features and retrain the model
# The remaining predictors keep their original relative order.
reduced_predictors = [p for p in predictors if p not in highly_important_features]
# Reuse the existing hold-out split, restricted to the reduced column set.
X_train_reduced, X_test_reduced = X_train[reduced_predictors], X_test[reduced_predictors]

In [25]:
# Create a new Gradient Boosting Classifier and train it on the reduced data
# (same seed and default hyper-parameters as the full-feature model).
gbm_reduced = GradientBoostingClassifier(random_state=42)
gbm_reduced.fit(X_train_reduced, y_train)

In [26]:
# Make predictions on the reduced test set (same rows as X_test, fewer columns)
y_pred_reduced = gbm_reduced.predict(X_test_reduced)

In [27]:
# Evaluate the new model against the same hold-out labels as the full-feature model
acc_reduced = accuracy_score(y_test, y_pred_reduced)
class_report_reduced = classification_report(y_test, y_pred_reduced)

In [28]:
# Display the correlation table next to the reduced model's accuracy and report.
correlation_matrix, acc_reduced, class_report_reduced

(          target      FTHG      FTAG
 target  1.000000  0.666652 -0.534130
 FTHG    0.666652  1.000000 -0.196793
 FTAG   -0.534130 -0.196793  1.000000,
 0.8369565217391305,
 '              precision    recall  f1-score   support\n\n       False       0.83      0.89      0.86       102\n        True       0.85      0.77      0.81        82\n\n    accuracy                           0.84       184\n   macro avg       0.84      0.83      0.83       184\nweighted avg       0.84      0.84      0.84       184\n')

In [29]:
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

In [30]:
# Initialize StratifiedKFold: 5 shuffled folds with a fixed seed so the fold
# assignment is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [31]:
# Initialize lists to store metrics (one entry per cross-validation fold)
auc_scores = []
f1_scores = []

In [32]:
# Initialize predictors and target for cross-validation over the full dataset,
# using the reduced predictor set.
X = matches[reduced_predictors]
y = matches['target']

In [33]:
# Perform Stratified K-Fold cross-validation on the reduced predictor set.
# The accumulators are reset here so that re-running this cell does not append
# a second set of fold scores to the existing lists.
auc_scores = []
f1_scores = []

for train_index, test_index in skf.split(X, y):
    # Fold-local names: the notebook-level X_train / X_test / y_train / y_test /
    # y_pred from the hold-out split are deliberately left untouched.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # Fresh estimator per fold (same configuration as gbm_reduced) so the
    # already-fitted gbm_reduced is not silently replaced by a last-fold fit.
    fold_model = GradientBoostingClassifier(random_state=42)
    fold_model.fit(X_fold_train, y_fold_train)

    # Probability of the positive class (column 1) for AUC; hard labels for F1.
    fold_proba = fold_model.predict_proba(X_fold_test)[:, 1]
    fold_pred = fold_model.predict(X_fold_test)

    # Calculate metrics and append to lists
    auc_scores.append(roc_auc_score(y_fold_test, fold_proba))
    f1_scores.append(f1_score(y_fold_test, fold_pred))

# Calculate average metrics across the folds
avg_auc = np.mean(auc_scores)
avg_f1 = np.mean(f1_scores)

avg_auc, avg_f1

(0.857618583947698, 0.7249007437557154)

In [34]:
# Keep only the fixtures in which Bayern Munich or Leipzig appears, either as
# the home side or as the away side.
team_pattern = 'Bayern Munich|Leipzig'  # regex alternation, matched case-insensitively
home_hit = matches['HomeTeam'].str.contains(team_pattern, case=False)
away_hit = matches['AwayTeam'].str.contains(team_pattern, case=False)
bayern_leipzig_matches = matches[home_hit | away_hit]

# Show the first rows of the subset
bayern_leipzig_matches.head()

Unnamed: 0_level_0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,htr_code,ftr_code,opp_code,day_code,target
Div,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D1,2019-08-16,19,Bayern Munich,Hertha,2,2,D,1,2,A,...,1.93,2.04,1.93,1.98,1.91,0,1,10,4,False
D1,2019-08-18,17,Union Berlin,RB Leipzig,0,4,A,0,3,A,...,2.01,2.08,2.04,1.92,1.97,0,0,16,6,False
D1,2019-08-24,17,Schalke 04,Bayern Munich,0,3,A,0,1,A,...,1.85,2.07,2.35,1.98,1.9,0,0,1,5,False
D1,2019-08-25,14,RB Leipzig,Ein Frankfurt,2,1,H,1,0,H,...,2.0,2.08,2.02,1.93,1.96,2,2,5,6,True
D1,2019-08-30,19,M'gladbach,RB Leipzig,1,3,A,0,1,A,...,2.0,2.03,2.02,1.92,1.97,0,0,16,4,False


In [35]:
# Train and evaluate a separate classifier on the Bayern Munich / Leipzig subset,
# provided every predictor column is available.
# NOTE(review): this uses `predictors`, which still contains FTHG and FTAG — the
# two features that dominate the earlier importance output — so a perfect score
# here is likely target leakage rather than predictive skill; confirm.
predictors_present = set(predictors).issubset(bayern_leipzig_matches.columns)

if not predictors_present:
    # Report the problem through the same variables that normally hold the metrics.
    acc_filtered = class_report_filtered = "Some predictor columns are missing in the dataset."
else:
    # Seeded 80/20 hold-out split of the subset
    X_train, X_test, y_train, y_test = train_test_split(
        bayern_leipzig_matches[predictors],
        bayern_leipzig_matches["target"],
        test_size=0.2,
        random_state=42,
    )

    # Fit a fresh gradient-boosting classifier on the subset's training rows
    gbm_filtered = GradientBoostingClassifier(random_state=42)
    gbm_filtered.fit(X_train, y_train)

    # Score the subset's held-out rows
    y_pred_filtered = gbm_filtered.predict(X_test)
    acc_filtered = accuracy_score(y_test, y_pred_filtered)
    class_report_filtered = classification_report(y_test, y_pred_filtered)

acc_filtered, class_report_filtered

(1.0,
 '              precision    recall  f1-score   support\n\n       False       1.00      1.00      1.00        25\n        True       1.00      1.00      1.00        15\n\n    accuracy                           1.00        40\n   macro avg       1.00      1.00      1.00        40\nweighted avg       1.00      1.00      1.00        40\n')

In [36]:
def prepare_features(home_team, away_team, model_features, data):
    """Build a one-row feature vector for a fixture from historical averages.

    The vector is the column-wise mean of `model_features` over the rows where
    `home_team` played at home, minus the mean over the rows where `away_team`
    played away.

    Args:
        home_team: Value to match exactly against the 'HomeTeam' column.
        away_team: Value to match exactly against the 'AwayTeam' column.
        model_features: Column labels to average, in the order the model expects.
        data: DataFrame with 'HomeTeam', 'AwayTeam' and the feature columns.

    Returns:
        A NumPy array of shape (1, len(model_features)).

    Raises:
        ValueError: If `data` has no home rows for `home_team` or no away rows
            for `away_team`; the averages would otherwise be all-NaN.

    NOTE(review): the models in this notebook are fitted on raw per-match rows,
    while this returns a difference of averages — confirm the two are meant to
    be used together.
    """
    # Select each side's historical rows in the relevant venue role.
    home_team_data = data[data['HomeTeam'] == home_team]
    away_team_data = data[data['AwayTeam'] == away_team]

    # Fail loudly instead of returning a silent all-NaN vector.
    if home_team_data.empty:
        raise ValueError(f"No home matches found for team {home_team!r}.")
    if away_team_data.empty:
        raise ValueError(f"No away matches found for team {away_team!r}.")

    home_team_avg = home_team_data[model_features].mean()
    away_team_avg = away_team_data[model_features].mean()

    # Single sample in 2-D shape (1, n_features).
    feature_vector = (home_team_avg - away_team_avg).values.reshape(1, -1)

    return feature_vector