In [34]:
# 
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    classification_report,
    confusion_matrix,
    accuracy_score
  
)
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt


In [35]:
# Read the CSV file into a DataFrame
df = pd.read_csv('data/dataset.csv')
# Specify the columns you want to keep
columns_to_keep = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG','FTR','B365H','B365D','B365A','BWH','BWD','BWA']
df = df[columns_to_keep]

In [36]:
# Remove rows with any NaN values
df = df.dropna()

# Count NaN values per column
nan_count = df.isna().sum()
print("\nCount of NaN values in each column:")
print(nan_count)

zero_count = (df == 0).sum()
print("\nCount of 0 values in each column:")
print(zero_count)

duplicates = df.duplicated(keep=False)  
duplicates_count = duplicates.sum()  

print(f"\nNumber of duplicate rows: {duplicates_count}")


Count of NaN values in each column:
HomeTeam    0
AwayTeam    0
FTHG        0
FTAG        0
FTR         0
B365H       0
B365D       0
B365A       0
BWH         0
BWD         0
BWA         0
dtype: int64

Count of 0 values in each column:
HomeTeam      0
AwayTeam      0
FTHG        328
FTAG        412
FTR           0
B365H         0
B365D         0
B365A         0
BWH           0
BWD           0
BWA           0
dtype: int64

Number of duplicate rows: 0


In [37]:
# Translate FTR to numerical values: H -> 0, D -> 1, A -> 2
ftr_mapping = {'H': 0, 'D': 1, 'A': 2}
df['FTR'] = df['FTR'].map(ftr_mapping)

# Features and target variable
X = df.drop(columns=['FTR'])
y = df['FTR']

# Encode categorical variables and scale numeric features
# Column names for the preprocessing
categorical_features = ['HomeTeam', 'AwayTeam']
numeric_features = ['FTHG', 'FTAG', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA']

# Create a transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Preprocess the data
X_transformed = preprocessor.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Step 6: Train XGBoost classifier
classifier = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
classifier.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = classifier.predict(X_test)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)

# Step 8: Evaluate the classification model
# Evaluate the model
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print('\nClassification Evaluation:')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



Mean Cross-Validation Accuracy: 0.9967

Classification Evaluation:
[[136   0   0]
 [  2  67   0]
 [  0   0  96]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       136
           1       1.00      0.97      0.99        69
           2       1.00      1.00      1.00        96

    accuracy                           0.99       301
   macro avg       1.00      0.99      0.99       301
weighted avg       0.99      0.99      0.99       301



In [38]:
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Step 6: Train regression model
regressor = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
regressor.fit(X_reg_train, y_reg_train)

# Make regression predictions
y_reg_pred = regressor.predict(X_reg_test)

# Step 7: Evaluate the regression model
mse = mean_squared_error(y_reg_test, y_reg_pred)
mae = mean_absolute_error(y_reg_test, y_reg_pred)
r2 = r2_score(y_reg_test, y_reg_pred)

print(f'\nRegression Evaluation:')
print(f'Mean Squared Error: {mse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'R² Score: {r2:.2f}')


Regression Evaluation:
Mean Squared Error: 0.00
Mean Absolute Error: 0.02
R² Score: 1.00
