In [1]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, StratifiedKFold, RandomizedSearchCV, StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import shap 
from sklearn.feature_selection import mutual_info_classif, RFE, SelectKBest, chi2, f_classif, SelectFromModel


from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,  VotingClassifier, AdaBoostClassifier, BaggingClassifier, RandomForestClassifier


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, auc


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import statsmodels.api as sma
from statsmodels.stats.outliers_influence import variance_inflation_factor



In [2]:
df = pd.read_csv("C:/Users/kisha/Documents/Uni-Stuff/Dissertation/books/git/data_cleaned_no_weight.csv")
X = df.drop(columns=['Winner'])
y= df['Winner']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=42)

# #oversampling minority class
# smote = SMOTE(sampling_strategy='minority')
# X_train, y_train = smote.fit_resample(X_train, y_train)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [3]:
features = df.drop(columns=['Winner'])

In [4]:
features.dtypes

title_bout                int64
no_of_rounds              int64
B_avg_KD                float64
B_avg_opp_KD            float64
B_avg_SIG_STR_pct       float64
                         ...   
B_Stance_Switch            bool
R_Stance_Open Stance       bool
R_Stance_Orthodox          bool
R_Stance_Southpaw          bool
R_Stance_Switch            bool
Length: 145, dtype: object

In [5]:
print(features.isna().sum())

# Check for infinite values
print(np.isinf(features).sum())

title_bout              0
no_of_rounds            0
B_avg_KD                0
B_avg_opp_KD            0
B_avg_SIG_STR_pct       0
                       ..
B_Stance_Switch         0
R_Stance_Open Stance    0
R_Stance_Orthodox       0
R_Stance_Southpaw       0
R_Stance_Switch         0
Length: 145, dtype: int64
title_bout              0
no_of_rounds            0
B_avg_KD                0
B_avg_opp_KD            0
B_avg_SIG_STR_pct       0
                       ..
B_Stance_Switch         0
R_Stance_Open Stance    0
R_Stance_Orthodox       0
R_Stance_Southpaw       0
R_Stance_Switch         0
Length: 145, dtype: int64


In [6]:
features = features.apply(pd.to_numeric, errors='coerce')


In [7]:
features.dtypes

title_bout                int64
no_of_rounds              int64
B_avg_KD                float64
B_avg_opp_KD            float64
B_avg_SIG_STR_pct       float64
                         ...   
B_Stance_Switch            bool
R_Stance_Open Stance       bool
R_Stance_Orthodox          bool
R_Stance_Southpaw          bool
R_Stance_Switch            bool
Length: 145, dtype: object

In [8]:
categorical_columns = features.select_dtypes(include=['bool', 'object']).columns

# Drop the categorical columns
features_numerical = features.drop(columns=categorical_columns)

In [9]:
features_with_constant = sma.add_constant(features_numerical)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = features_numerical.columns
vif_data["VIF"] = [variance_inflation_factor(features_with_constant.values, i + 1) for i in range(features_numerical.shape[1])]

# Display the VIFs
print(vif_data)

  return 1 - self.ssr/self.centered_tss


               Feature        VIF
0           title_bout   1.944659
1         no_of_rounds   2.135400
2             B_avg_KD   1.475868
3         B_avg_opp_KD   1.297556
4    B_avg_SIG_STR_pct   1.942018
..                 ...        ...
131       R_Height_cms   6.619507
132        R_Reach_cms   5.708666
133       R_Weight_lbs  10.955959
134              B_age   1.410912
135              R_age   1.672330

[136 rows x 2 columns]


In [10]:
#features with VIF > 10
features_high_vif = vif_data[vif_data['VIF'] > 10]
features_high_vif

#putting them in a list for better readability
features_high_vif_list = list(features_high_vif['Feature'])
features_high_vif_list

['B_avg_SIG_STR_att',
 'B_avg_SIG_STR_landed',
 'B_avg_opp_SIG_STR_att',
 'B_avg_opp_SIG_STR_landed',
 'B_avg_TOTAL_STR_att',
 'B_avg_TOTAL_STR_landed',
 'B_avg_opp_TOTAL_STR_att',
 'B_avg_opp_TOTAL_STR_landed',
 'B_avg_HEAD_att',
 'B_avg_HEAD_landed',
 'B_avg_opp_HEAD_att',
 'B_avg_opp_HEAD_landed',
 'B_avg_BODY_att',
 'B_avg_BODY_landed',
 'B_avg_opp_BODY_att',
 'B_avg_opp_BODY_landed',
 'B_avg_LEG_att',
 'B_avg_LEG_landed',
 'B_avg_opp_LEG_att',
 'B_avg_opp_LEG_landed',
 'B_avg_DISTANCE_att',
 'B_avg_DISTANCE_landed',
 'B_avg_opp_DISTANCE_att',
 'B_avg_opp_DISTANCE_landed',
 'B_avg_CLINCH_att',
 'B_avg_CLINCH_landed',
 'B_avg_opp_CLINCH_att',
 'B_avg_opp_CLINCH_landed',
 'B_avg_GROUND_att',
 'B_avg_GROUND_landed',
 'B_avg_opp_GROUND_att',
 'B_avg_opp_GROUND_landed',
 'B_total_rounds_fought',
 'B_wins',
 'B_losses',
 'B_win_by_Decision_Split',
 'B_win_by_Decision_Unanimous',
 'B_win_by_KO/TKO',
 'B_win_by_Submission',
 'B_win_by_TKO_Doctor_Stoppage',
 'B_Weight_lbs',
 'R_avg_SIG_STR_

In [11]:
corr_matrix = features_numerical.corr()
print(corr_matrix)

                   title_bout  no_of_rounds  B_avg_KD  B_avg_opp_KD  \
title_bout           1.000000      0.617936  0.096560     -0.038431   
no_of_rounds         0.617936      1.000000  0.169399     -0.000093   
B_avg_KD             0.096560      0.169399  1.000000     -0.036863   
B_avg_opp_KD        -0.038431     -0.000093 -0.036863      1.000000   
B_avg_SIG_STR_pct    0.064935      0.079720  0.132166     -0.133168   
...                       ...           ...       ...           ...   
R_Height_cms         0.005974      0.042395  0.081372      0.084710   
R_Reach_cms          0.022647      0.065818  0.100163      0.078117   
R_Weight_lbs         0.039781      0.068914  0.087277      0.067718   
B_age                0.037049      0.139914  0.042549      0.154729   
R_age                0.007678      0.120000  0.088272      0.037698   

                   B_avg_SIG_STR_pct  B_avg_opp_SIG_STR_pct  B_avg_TD_pct  \
title_bout                  0.064935              -0.072812      0.049

In [12]:
mi_scores = mutual_info_classif(X_train, y_train)

mi_scores_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Mutual_Info': mi_scores
})

mi_scores_df = mi_scores_df.sort_values(by='Mutual_Info', ascending=False)

# Display the MI scores
print(mi_scores_df)

                        Feature  Mutual_Info
85   R_avg_opp_TOTAL_STR_landed     0.021943
111         R_avg_GROUND_landed     0.020732
19   B_avg_opp_TOTAL_STR_landed     0.020590
25            B_avg_HEAD_landed     0.019827
81     R_avg_opp_SIG_STR_landed     0.019827
..                          ...          ...
78            R_avg_SIG_STR_att     0.000000
79         R_avg_SIG_STR_landed     0.000000
84      R_avg_opp_TOTAL_STR_att     0.000000
106            R_avg_CLINCH_att     0.000000
144             R_Stance_Switch     0.000000

[145 rows x 2 columns]


In [13]:
#merging vif and mi scores into one dataframe
comparison_df = pd.merge(vif_data, mi_scores_df, on='Feature')
#vif_mi_df

high_vif_threshold = 10
low_mi_threshold = comparison_df['Mutual_Info'].quantile(0.40)  # Below the 25th percentile


In [14]:
features_to_drop = comparison_df[(comparison_df['VIF'] > high_vif_threshold) & (comparison_df['Mutual_Info'] < low_mi_threshold)]['Feature']

X_train = X_train.drop(columns=features_to_drop)
X_test = X_test.drop(columns=features_to_drop)

print(f"Dropped features: {features_to_drop.tolist()}")

Dropped features: ['B_avg_SIG_STR_att', 'B_avg_SIG_STR_landed', 'B_avg_opp_SIG_STR_att', 'B_avg_TOTAL_STR_landed', 'B_avg_opp_HEAD_att', 'B_avg_BODY_landed', 'B_avg_opp_DISTANCE_landed', 'B_avg_opp_CLINCH_landed', 'B_avg_GROUND_landed', 'B_avg_opp_GROUND_att', 'B_avg_opp_GROUND_landed', 'B_total_rounds_fought', 'B_wins', 'B_losses', 'B_win_by_Decision_Unanimous', 'B_win_by_Submission', 'B_win_by_TKO_Doctor_Stoppage', 'R_avg_SIG_STR_att', 'R_avg_SIG_STR_landed', 'R_avg_opp_TOTAL_STR_att', 'R_avg_BODY_landed', 'R_avg_CLINCH_att', 'R_avg_CLINCH_landed', 'R_avg_opp_CLINCH_att', 'R_avg_opp_CLINCH_landed', 'R_avg_opp_GROUND_att', 'R_avg_opp_GROUND_landed', 'R_wins', 'R_win_by_Decision_Unanimous', 'R_win_by_Submission']


In [15]:
#X_train_reduced class imbalance
y_train.value_counts()

Winner
0    2602
1    1442
Name: count, dtype: int64

In [16]:
#oversampling minority class
smote = SMOTE(sampling_strategy='minority')
X_train, y_train = smote.fit_resample(X_train, y_train)


y_train.value_counts()

Winner
0    2602
1    2602
Name: count, dtype: int64

In [17]:

#scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
model_lr = LogisticRegression(
    C=100,
    class_weight=None,
    max_iter=10000,
    penalty='l1',
    solver='liblinear'
)

#evaluating model
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
y_pred_prob_lr = model_lr.predict_proba(X_test)[:,1]

print('Accuracy: ', accuracy_score(y_test, y_pred_lr))
print('Precision: ', precision_score(y_test, y_pred_lr))
print('Recall: ', recall_score(y_test, y_pred_lr))
print('F1: ', f1_score(y_test, y_pred_lr))
print('ROC AUC:', roc_auc_score(y_test, y_pred_lr))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_lr))
print('Classification Report: \n', classification_report(y_test, y_pred_lr))

Accuracy:  0.6557872469023874
Precision:  0.5264830508474576
Recall:  0.4179983179142136
F1:  0.46601031411157995
ROC AUC: 0.60357463065522
Confusion Matrix: 
 [[1673  447]
 [ 692  497]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.79      0.75      2120
           1       0.53      0.42      0.47      1189

    accuracy                           0.66      3309
   macro avg       0.62      0.60      0.61      3309
weighted avg       0.64      0.66      0.65      3309



In [19]:
#saving model
import joblib
joblib.dump(model_lr, 'model_lr_vif.pkl')

['model_lr_vif.pkl']

In [25]:

best_logistic = joblib.load("C:/Users/kisha/Documents/Uni-Stuff/Dissertation/books/git/DS-UoN-Sports-ML-bhamidipati/savedModels/best_model_lr.pkl")

In [26]:
df_actual = pd.read_csv("C:/Users/kisha/Documents/Uni-Stuff/Dissertation/books/git/data_cleaned_no_weight.csv")
X_actual = df_actual.drop(columns=['Winner'])
y_actual = df_actual['Winner']

#split
X_train_actual, X_test_actual, y_train_actual, y_test_actual = train_test_split(X_actual, y_actual, test_size=0.45, random_state=42)

#smote
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_actual, y_train_actual = smote.fit_resample(X_train_actual, y_train_actual)

#scaling
scaler = StandardScaler()
X_train_actual = scaler.fit_transform(X_train_actual)
X_test_actual = scaler.transform(X_test_actual)


In [27]:
#testing the model
y_pred_best = best_logistic.predict(X_test_actual)
y_pred_prob_best = best_logistic.predict_proba(X_test_actual)[:,1]

print('Accuracy: ', accuracy_score(y_test, y_pred_best))
print('Precision: ', precision_score(y_test, y_pred_best))
print('Recall: ', recall_score(y_test, y_pred_best))
print('F1: ', f1_score(y_test, y_pred_best))
print('ROC AUC:', roc_auc_score(y_test, y_pred_best))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_best))
print('Classification Report: \n', classification_report(y_test, y_pred_best))


Accuracy:  0.670897552130553
Precision:  0.5536480686695279
Recall:  0.4339781328847771
F1:  0.48656294200848654
ROC AUC: 0.6188758588952188
Confusion Matrix: 
 [[1704  416]
 [ 673  516]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.72      0.80      0.76      2120
           1       0.55      0.43      0.49      1189

    accuracy                           0.67      3309
   macro avg       0.64      0.62      0.62      3309
weighted avg       0.66      0.67      0.66      3309

