In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.tree import export_text
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pickle



# Load the preprocessed match data into a DataFrame. Previously `data` was bound
# to the path string itself, so the `data.drop(...)` call below raised
# AttributeError.
# NOTE(review): another cell in this notebook reads "mergedDataOriginal.csv"
# (with a "d") — confirm which file name is correct.
data = pd.read_csv("../data/PremierProcessed/mergeDataOriginal.csv")

# Features are every column except the target FTR. The notebook's data.head()
# output shows an "Unnamed: 0" column (a row index written into the CSV); it is
# dropped when present so the row number is not used as a predictor.
# NOTE(review): X still contains FTHG/FTAG, which presumably determine FTR —
# confirm that keeping them as features is intended.
X = data.drop(columns=['FTR']).drop(columns=['Unnamed: 0'], errors='ignore')
y = data['FTR']


# Pairwise correlation between the feature columns.
correlation_matrix = X.corr()

# One variance-inflation factor per feature column, collected into a
# two-column table (feature name, VIF) in a single constructor call.
vif_data = pd.DataFrame(
    {
        "Variable": X.columns,
        "VIF": [
            variance_inflation_factor(X.values, column_position)
            for column_position in range(X.shape[1])
        ],
    }
)

print("\nVIF Values:")
print(vif_data)


# Columns whose association with the target FTR is tested below.
attribute_list = [
    'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HTAG', 'HTHG', 'HTR',
    'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
]

# Significance level for the chi-square tests.
alpha = 0.05

# For each attribute: cross-tabulate it against FTR, run a chi-square test on
# the resulting contingency table, and report whether p falls below alpha.
for attribute in attribute_list:
    contingency_table = pd.crosstab(data[attribute], data['FTR'])
    chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

    message = (
        f"The association between {attribute} and the target is significant."
        if p_val < alpha
        else f"There is no significant association between {attribute} and the target."
    )
    print(message)

# Hold out 30% of the rows as the test partition; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Decision Tree
# Fit a decision tree (max_features='sqrt') and report accuracy and
# micro-averaged F1 on both the training and the test partition.
dt_classifier = DecisionTreeClassifier(random_state=42,max_features='sqrt')
dt_classifier.fit(X_train, y_train)
y_pred_tr = dt_classifier.predict(X_train)
y_pred = dt_classifier.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred)
accuracy_dt_tr = accuracy_score(y_train, y_pred_tr)
f1_dt = f1_score(y_test, y_pred, average='micro')
f1_dt_tr = f1_score(y_train, y_pred_tr, average='micro')
print(f'Decision Tree Training Accuracy: {accuracy_dt_tr:.2f}')
print(f'Decision Tree Testing Accuracy: {accuracy_dt:.2f}')
print(f'Decision Tree Training F1 score: {f1_dt_tr:.2f}')
print(f'Decision Tree Testing F1 score: {f1_dt:.2f}')
# Persist the fitted model. The context manager closes the file handle
# deterministically; the previous bare open() call left it unclosed.
# NOTE(review): the file holds pickle bytes despite the ".py" suffix; the path
# is kept unchanged in case other code loads from it.
with open('../src/model/decision_tree.py', 'wb') as model_file:
    pickle.dump(dt_classifier, model_file)

# Random Forest
# Fit a 200-tree random forest capped at depth 10 and report accuracy and
# micro-averaged F1 on both the training and the test partition.
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42,max_depth=10)
rf_classifier.fit(X_train, y_train)
y_pred_tr = rf_classifier.predict(X_train)
y_pred = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred)
accuracy_rf_tr = accuracy_score(y_train, y_pred_tr)
f1_rf = f1_score(y_test, y_pred, average='micro')
f1_rf_tr = f1_score(y_train, y_pred_tr, average='micro')
print(f'Random Forest Training Accuracy: {accuracy_rf_tr:.2f}')
print(f'Random Forest Testing Accuracy: {accuracy_rf:.2f}')
print(f'Random Forest Training F1 score: {f1_rf_tr:.2f}')
print(f'Random Forest Testing F1 score: {f1_rf:.2f}')
# Persist the fitted random forest. The previous code pickled `dt_classifier`
# here, so random_forest.py actually contained the decision tree. The context
# manager also closes the file handle that a bare open() left unclosed.
# NOTE(review): the file holds pickle bytes despite the ".py" suffix; the path
# is kept unchanged in case other code loads from it.
with open('../src/model/random_forest.py', 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)

# Naive Bayes
# Fit a Gaussian naive Bayes classifier and report accuracy and micro-averaged
# F1 on both the training and the test partition.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)
y_pred_tr = naive_bayes_model.predict(X_train)
y_pred = naive_bayes_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred)
accuracy_nb_tr = accuracy_score(y_train, y_pred_tr)
f1_nb = f1_score(y_test, y_pred, average='micro')
f1_nb_tr = f1_score(y_train, y_pred_tr, average='micro')
print(f'Naive Bayes Training Accuracy: {accuracy_nb_tr:.2f}')
print(f'Naive Bayes Testing Accuracy: {accuracy_nb:.2f}')
print(f'Naive Bayes Training F1 score: {f1_nb_tr:.2f}')
print(f'Naive Bayes Testing F1 score: {f1_nb:.2f}')
# Persist the fitted naive Bayes model. The previous code pickled
# `dt_classifier` here, so naive_bayes.py actually contained the decision tree.
# The context manager also closes the file handle that a bare open() left
# unclosed.
# NOTE(review): the file holds pickle bytes despite the ".py" suffix; the path
# is kept unchanged in case other code loads from it.
with open('../src/model/naive_bayes.py', 'wb') as model_file:
    pickle.dump(naive_bayes_model, model_file)

# Logistic Regression
# Fit a one-vs-rest logistic regression (up to 500 solver iterations) and report
# accuracy and micro-averaged F1 on both the training and the test partition.
# NOTE(review): `multi_class` may be deprecated in recent scikit-learn
# releases — confirm against the installed version.
logistic_regression_model = LogisticRegression(random_state=42,multi_class='ovr',max_iter=500)
logistic_regression_model.fit(X_train, y_train)
# Training predictions must come from the logistic regression itself; the
# previous code called naive_bayes_model.predict here, so the reported training
# metrics belonged to the naive Bayes model.
y_pred_tr = logistic_regression_model.predict(X_train)
y_pred = logistic_regression_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred)
accuracy_lr_tr = accuracy_score(y_train, y_pred_tr)
f1_lr = f1_score(y_test, y_pred, average='micro')
f1_lr_tr = f1_score(y_train, y_pred_tr, average='micro')
print(f'Logistic Regression Training Accuracy: {accuracy_lr_tr:.2f}')
print(f'Logistic Regression Testing Accuracy: {accuracy_lr:.2f}')
print(f'Logistic Regression Training F1 score: {f1_lr_tr:.2f}')
print(f'Logistic Regression Testing F1 score: {f1_lr:.2f}')
# Persist the fitted logistic regression. The previous code pickled
# `dt_classifier` here, so logistic_regression.py actually contained the
# decision tree. The context manager also closes the file handle that a bare
# open() left unclosed.
# NOTE(review): the file holds pickle bytes despite the ".py" suffix; the path
# is kept unchanged in case other code loads from it.
with open('../src/model/logistic_regression.py', 'wb') as model_file:
    pickle.dump(logistic_regression_model, model_file)


: 

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.tree import export_text
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import seaborn as sns

# Read the merged match data from a CSV located relative to the working directory.
# NOTE(review): an earlier cell refers to
# "../data/PremierProcessed/mergeDataOriginal.csv" (different directory and
# spelling) — confirm both cells are meant to load the same file.
data = pd.read_csv("mergedDataOriginal.csv")

# Preview the first rows as the cell's rich output.
data.head()


Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,...,AY,HR,AR,FTGoalDiff,HTGoalDiff,RedCardDifference,HShotRatio,AShotRatio,HCornersRatio,ACornersRatio
0,12,25,0.444444,0.0,0,0.4,0.0,0,0.395349,0.258065,...,0.222222,0.0,0.0,0.722222,0.7,0.4,0.164706,0.1875,0.5,0.5
1,13,42,0.444444,0.222222,0,0.2,0.0,0,0.395349,0.387097,...,0.222222,0.0,0.0,0.611111,0.6,0.4,0.117647,0.15625,0.5,0.5
2,14,28,0.111111,0.333333,1,0.2,0.2,2,0.139535,0.516129,...,0.333333,0.333333,0.0,0.388889,0.5,0.6,0.1,0.210938,0.666667,0.333333
3,16,35,0.222222,0.222222,2,0.2,0.4,1,0.139535,0.419355,...,0.111111,0.0,0.0,0.5,0.4,0.4,0.133333,0.173077,0.384615,0.615385
4,22,17,0.222222,0.0,0,0.4,0.0,0,0.395349,0.387097,...,0.333333,0.0,0.0,0.611111,0.7,0.4,0.094118,0.1875,0.6,0.4
