In [1]:
import os
import sys
from src.footballproject.exception import CustomException
from src.footballproject.logger import logging
from src.footballproject.utils import save_object
import pandas as pd
import numpy as np

from dataclasses import dataclass
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


In [40]:
# Load the raw dataset; the path is relative to the notebook's working directory.
raw_csv_path = 'data/rawdata.csv'
df = pd.read_csv(raw_csv_path)

In [41]:
# Assuming df is your original DataFrame

# Bootstrapping to increase dataset size
def bootstrap_data(data, num_samples, random_state=None):
    """Stack ``num_samples`` bootstrap resamples of ``data`` into one frame.

    Each resample has as many rows as ``data`` and is drawn with replacement,
    so the result has ``num_samples * len(data)`` rows and a fresh 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to resample.
    num_samples : int
        Number of full-size resamples to draw; must be at least 1.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed or generator for reproducible output. ``None`` (the default)
        keeps the previous behaviour of drawing from NumPy's global RNG.

    Returns
    -------
    pandas.DataFrame
        The concatenated resamples.

    Raises
    ------
    ValueError
        If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be >= 1, got {num_samples}")

    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        # One shared generator: passing the same integer seed to every
        # sample() call would produce identical resamples.
        rng = np.random.RandomState(random_state)

    resamples = [
        data.sample(frac=1, replace=True, random_state=rng)
        for _ in range(num_samples)
    ]
    return pd.concat(resamples, ignore_index=True)

# Quadruple the dataset: four full-size resamples drawn with replacement.
# NumPy's global RNG is seeded first so the resampling is reproducible across runs.
# NOTE(review): the resampled rows are later split into folds over the whole matrix,
# so copies of one original row can sit in both the training and validation parts;
# the reported scores are likely optimistic. Consider resampling the training part only.
np.random.seed(42)
bootstrapped_df = bootstrap_data(df, num_samples=4)


In [42]:
# Persist the bootstrapped frame, then keep working on it under the short name `df`.
# NOTE(review): to_csv() with default arguments also writes the row index as an
# unnamed first column — confirm that whatever reads model.csv expects it.
bootstrapped_df.to_csv('model.csv')
df = bootstrapped_df

In [5]:
# Show the column labels of the working frame before feature engineering.
df.columns

Index(['Date', 'Home', 'xG_Home', 'xG_Away', 'Away', 'Referee', 'xGA_Home',
       'xGA_Away', 'Home_Fatigue', 'Away_Fatigue', 'Temp', 'Humidity', 'Wind',
       'Referee_Bias', 'G_Home', 'G_Away', 'Result'],
      dtype='object')

In [6]:
# Home-minus-away differences of the xG and xGA columns.
df['xG_Diff'] = df['xG_Home'].sub(df['xG_Away'])
df['xGA_Diff'] = df['xGA_Home'].sub(df['xGA_Away'])

# Per-side xGA / xG ratio; the small offset is presumably there to keep the
# denominator away from zero when xG is 0.
ratio_offset = 0.1
df['xGA_Ratio_Home'] = df['xGA_Home'].div(df['xG_Home'].add(ratio_offset))
df['xGA_Ratio_Away'] = df['xGA_Away'].div(df['xG_Away'].add(ratio_offset))

In [7]:
# Remove the columns that are not passed to the preprocessor.
# The drop is in place, so running this cell a second time raises KeyError
# because the columns are already gone.
columns_to_drop = ['G_Home', 'G_Away', 'Date', 'Referee', 'Home', 'Away']
df.drop(columns=columns_to_drop, inplace=True)
            

In [8]:
# Separate the label column from the feature columns.
target_column_name = "Result"
y = df[target_column_name]
X = df.drop(columns=[target_column_name])


In [9]:
# Numeric features: standardised as they are.
numerical_columns = [
    'xG_Home',
    'xG_Away',
    'xGA_Home',
    'xGA_Away',
    'xG_Diff',
    'xGA_Diff',
    'xGA_Ratio_Home',
    'xGA_Ratio_Away',
]

# Features encoded to integer codes by OrdinalEncoder before scaling.
ordinal_columns = [
    "Temp",
    "Humidity",
    "Wind",
    "Home_Fatigue",
    "Away_Fatigue",
    "Referee_Bias",
]

# One category list per ordinal column, in the same order as `ordinal_columns`:
# the first five use Low < Moderate < High, the last one (Referee_Bias) uses Home / Away.
categories_2 = ['Home', 'Away']
categories_1 = [['Low', 'Moderate', 'High'] for _ in range(5)]
categories_1 = categories_1 + [categories_2]

# The target column ("Result") is not handled here; only features are transformed.
num_pipeline = Pipeline(steps=[('scalar', StandardScaler())])

ord_pipeline = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=categories_1)),
    ("scaler", StandardScaler()),
])

# Columns not listed in either group are dropped by ColumnTransformer's default `remainder`.
preprocessor = ColumnTransformer(transformers=[
    ('numerical_pipeline', num_pipeline, numerical_columns),
    ('ordinal_pipeline', ord_pipeline, ordinal_columns),
])


In [10]:
# Fit the preprocessor and build the model-ready feature matrix.
# The input is rebuilt from `df` instead of being read from `X`, so re-running this
# cell does not feed the already-transformed array back into the ColumnTransformer,
# which selects its columns by name.
# NOTE(review): the scalers and encoders are fitted on every row before any
# train/test split, so held-out rows influence the fitted statistics — consider
# fitting on the training portion only.
X = preprocessor.fit_transform(df.drop(columns=[target_column_name]))

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [12]:

# Count how many rows each target class has before any resampling.
from collections import Counter
counter = Counter()
counter.update(y)
print(counter)

Counter({'Home': 3310, 'Away': 2427, 'Draw': 1711})


In [13]:
# Oversample the minority classes with SMOTE (5 nearest neighbours, fixed seed)
# and replace X / y with the resampled arrays.
# NOTE(review): resampling happens before the train/test split and before
# cross-validation, so the evaluation data contains SMOTE-generated points built
# from rows that may also be in the training data — the scores are likely
# optimistic. Consider resampling inside each training fold instead.
smote = SMOTE(k_neighbors=5, random_state=42)
X, y = smote.fit_resample(X, y)

In [14]:

# Count the rows per target class again, now on the SMOTE-resampled labels.
from collections import Counter
counter = Counter()
counter.update(y)
print(counter)

Counter({'Home': 3310, 'Away': 3310, 'Draw': 3310})


In [15]:
# Hold out 20% of the (already resampled) rows as a test set, with a fixed seed.
# NOTE(review): X and y were oversampled before this split, so the test portion
# is not made of untouched original rows — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [56]:
# Candidate classifiers keyed by display name, each with hand-picked hyper-parameters.
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=120,
        max_depth=9,
        min_samples_split=3,
        random_state=42,
    ),
    "Logistic Regression": LogisticRegression(
        penalty='l1',
        solver='saga',
        C=1,
        max_iter=10000,
        random_state=42,
    ),
    "KNN": KNeighborsClassifier(n_neighbors=4),
    "Linear SVC": LinearSVC(
        penalty='l1',
        dual=False,
        C=10,
        tol=1e-9,
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
    ),
    "Naive Bayes": GaussianNB(var_smoothing=1e-11),
}



In [19]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [82]:
# Random forest evaluated in the cross-validation cells and used as the
# base estimator for the grid search.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    min_samples_split=3,
    random_state=42,
)

In [83]:
# 5-fold shuffled cross-validation of rf_model over the whole resampled dataset.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# KFold yields positional indices. Indexing a pandas Series with them is a
# label lookup that only works when its index is exactly 0..n-1, so index a
# plain array instead.
y_values = np.asarray(y)

scores = []
for train_index, val_index in kf.split(X, y_values):
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y_values[train_index], y_values[val_index]

    # Refit on this fold's training part, then record accuracy on its validation part.
    rf_model.fit(X_train_fold, y_train_fold)
    scores.append(rf_model.score(X_val_fold, y_val_fold))

# Per-fold accuracies followed by their mean.
print("Cross-validation scores:", scores)

print("Mean CV Score:", np.mean(scores))

Cross-validation scores: [0.8872104733131924, 0.8992950654582075, 0.8801611278952669, 0.892245720040282, 0.8937562940584088]
Mean CV Score: 0.8905337361530716


In [79]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Shuffled 5-fold split. This is plain KFold, not StratifiedKFold: class
# proportions per fold are not enforced. NOTE(review): swap in StratifiedKFold
# if stratification was the intent.
skf = KFold(n_splits=5, shuffle=True, random_state=42)

# KFold yields positional indices; use a plain array so the lookup does not
# depend on the pandas index of y.
y_values = np.asarray(y)

true_labels = []
predicted_labels = []
accuracy_scores = []
for train_index, test_index in skf.split(X, y_values):
    # Fold-local names: the hold-out split stored in X_train / X_test /
    # y_train / y_test must not be overwritten, because the grid-search cell
    # reads X_train and y_train.
    X_train_fold, X_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y_values[train_index], y_values[test_index]

    # Train the model on the current fold
    rf_model.fit(X_train_fold, y_train_fold)

    # Predict on the fold's held-out part and record its accuracy
    y_pred = rf_model.predict(X_test_fold)
    accuracy_scores.append(accuracy_score(y_test_fold, y_pred))

    # Pool labels across folds for a single overall report
    true_labels.extend(y_test_fold)
    predicted_labels.extend(y_pred)

# Mean and spread of the per-fold accuracies
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
std_accuracy = np.std(accuracy_scores)

print("Average accuracy:", average_accuracy)
print("Accuracy std:", std_accuracy)

true_labels = np.array(true_labels)
predicted_labels = np.array(predicted_labels)

# Classification report over the pooled out-of-fold predictions
classification_rep = classification_report(true_labels, predicted_labels)

print(classification_rep)

Average accuracy: 0.8905337361530716
              precision    recall  f1-score   support

        Away       0.92      0.89      0.90      3310
        Draw       0.86      0.92      0.89      3310
        Home       0.90      0.86      0.88      3310

    accuracy                           0.89      9930
   macro avg       0.89      0.89      0.89      9930
weighted avg       0.89      0.89      0.89      9930



In [64]:
# Hyper-parameter grid for the random forest search.
randomforest_grid = dict(
    n_estimators=[50, 100, 150],            # number of trees in the forest
    max_depth=list(range(3, 11)),           # maximum tree depth: 3..10
    min_samples_split=list(range(2, 11)),   # minimum samples to split a node: 2..10
)

In [65]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Exhaustive search over randomforest_grid with 5-fold cross-validation,
# ranking parameter combinations by macro-averaged F1.
cv = 5
randomforest_clf = GridSearchCV(
    estimator=rf_model,
    param_grid=randomforest_grid,
    scoring='f1_macro',
    cv=cv,
)
randomforest_clf.fit(X_train, y_train)

# Best combination found and its mean cross-validated score.
print("Best random forest parameters:", randomforest_clf.best_params_)
print("Best Score:", randomforest_clf.best_score_)

Best random forest parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}
Best Score: 0.8827278459293838


In [71]:
# k-nearest-neighbours classifier evaluated in the next cell. It uses 7
# neighbours, whereas the "KNN" entry of the `models` dict uses 4.
knn_model = KNeighborsClassifier(n_neighbors=7)

In [72]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Shuffled 5-fold split. This is plain KFold, not StratifiedKFold: class
# proportions per fold are not enforced. NOTE(review): swap in StratifiedKFold
# if stratification was the intent.
skf = KFold(n_splits=5, shuffle=True, random_state=42)

# KFold yields positional indices; use a plain array so the lookup does not
# depend on the pandas index of y.
y_values = np.asarray(y)

true_labels = []
predicted_labels = []
accuracy_scores = []
for train_index, test_index in skf.split(X, y_values):
    # Fold-local names so the hold-out split stored in X_train / X_test /
    # y_train / y_test is not overwritten by the last fold.
    X_train_fold, X_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y_values[train_index], y_values[test_index]

    # Fit on the fold's training part and score on its held-out part
    knn_model.fit(X_train_fold, y_train_fold)

    y_pred = knn_model.predict(X_test_fold)
    accuracy_scores.append(accuracy_score(y_test_fold, y_pred))

    # Pool labels across folds for a single overall report
    true_labels.extend(y_test_fold)
    predicted_labels.extend(y_pred)

# Mean and spread of the per-fold accuracies
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
std_accuracy = np.std(accuracy_scores)

print("Average accuracy:", average_accuracy)
print("Accuracy std:", std_accuracy)

true_labels = np.array(true_labels)
predicted_labels = np.array(predicted_labels)

# Classification report over the pooled out-of-fold predictions
classification_rep = classification_report(true_labels, predicted_labels)

print(classification_rep)

Average accuracy: 0.8465256797583083
              precision    recall  f1-score   support

        Away       0.86      0.88      0.87      3310
        Draw       0.81      0.91      0.86      3310
        Home       0.89      0.75      0.81      3310

    accuracy                           0.85      9930
   macro avg       0.85      0.85      0.85      9930
weighted avg       0.85      0.85      0.85      9930

