In [14]:
#Here we first load our data sets for cleaning and preprocessing
#     ==================================================================    
#     ==================================================================    


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import numpy as np


# List of the CSV file paths
files = ['EPL-season-2020-2021.csv',
         'EPL-season-2021-2022.csv',
         'EPL-season-2022-2023.csv',
         'EPL-season-2023-2024.csv',
         'EPL-season-2024-2025.csv']

# List to hold dataframes
dfs = []

# Set pandas to display all columns without truncation
pd.set_option('display.max_columns', None)

# Read each CSV file into a DataFrame and append it to the list
for file in files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames vertically (row-wise)
combined_df = pd.concat(dfs, ignore_index=True)

# Step 1: Standardize date format (if 'Date' column exists)
if 'Date' in combined_df.columns:
    # Convert to datetime, specifying that the day appears first in the format (day/month/year)
    combined_df['Date'] = pd.to_datetime(combined_df['Date'], dayfirst=True)

    # Optionally, you can format it to a specific date string format (e.g., 'YYYY-MM-DD')
    combined_df['Date'] = combined_df['Date'].dt.strftime('%Y-%m-%d')

# Step 2: Fill missing values
# You can choose to fill missing values based on column types:
# For numerical columns, fill with 0 (or mean/median if necessary)
combined_df = combined_df.fillna(0)  # for numerical columns
# For categorical columns, you could fill with 'Unknown'
# combined_df = combined_df.fillna('Unknown')  # for categorical columns

# Step 3: Remove duplicate rows
combined_df = combined_df.drop_duplicates()

# Step 4: Drop columns with all missing data
combined_df = combined_df.dropna(axis=1, how='all')

# Step 5: Save the combined DataFrame to a new CSV file
combined_df.to_csv('combined_file.csv', index=False)

#write the column names in a txt file for better analysis
with open('column_names.txt', 'w') as f:
    for col in combined_df.columns:
        f.write(f"{col}\n")

#     ==================================================================    
#     ==================================================================    

        
#Here we train the models now

# Step 1: Load the cleaned data
combined_df = pd.read_csv('combined_file.csv')

# Step 2: Identify the target and features
target = 'FTR'  # Example: Full Time Result (Home Win, Away Win, Draw)
features = combined_df.columns.difference([target])

# Step 3: Prepare the data
X = combined_df[features]  # Features (input)
y = combined_df[target]    # Target (output)

# Step 4: Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Preprocess categorical and numerical features
# Define categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Define a column transformer to handle preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # One-hot encode categorical features
    ])

# Step 6: Create a pipeline with preprocessing and model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))  # RandomForest for classification
])

# Step 7: Train the model
model_pipeline.fit(X_train, y_train)

# Step 8: Make predictions on the test set
y_pred = model_pipeline.predict(X_test)

#dump to a file using joblib
joblib.dump(model_pipeline, "model.pkl")

# Step 9: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\n==================================================================\n")

print(f"Accuracy: {accuracy:.4f}")

print(f"\n==================================================================\n")


# Classification Report (Precision, Recall, F1-score)
print("Classification Report:\n", classification_report(y_test, y_pred))

print(f"\n==================================================================\n")

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

print(f"\n==================================================================\n")
    
#     ==================================================================    
#     ==================================================================    

#Here we test the model with some data

# Load the saved model
model_pipeline = joblib.load("model.pkl")

# Prepare sample match data
sample_match_data = {
    'FTHG': 0,  # No home goals
    'FTAG': 3,  # More away goals
    
    'HTHG': 0,  
    'HTAG': 2, # Strong away team half-time lead
    'HTR': 'H',
    
    'HS': 3,  # Few home team shots
    'AS': 14, # High away team shots
    'HST': 1, # Few home shots on target
    'AST': 10, # More away shots on target
    
    'HF': 7,   # More home team fouls
    'AF': 2,   # Less away team fouls
    'HC': 2,   # Few home team corners
    'AC': 7,   # Many away team corners
    
    'HY': 3,   # More yellow cards
    'AY': 1,   # Fewer away team yellow cards
    
    'B365H': 3.5,  # Lower home win odds
    'B365D': 4.0,
    'B365A': 2.0   # Higher away win odds
}

# Create DataFrame with template from first training row
sample_match = X.iloc[[0]].copy()
for col, val in sample_match_data.items():
   if col in sample_match.columns:
       sample_match.loc[0, col] = val

        
# Make prediction
prediction = model_pipeline.predict(sample_match)
prediction_proba = model_pipeline.predict_proba(sample_match)

# After prediction
class_labels = model_pipeline.named_steps['classifier'].classes_
for label, prob in zip(class_labels, prediction_proba[0]):
    print(f"{label} Win Probability: {prob:.2f}\n")

print(f"\n==================================================================\n")
    
print("Predicted Result:", prediction[0])

print(f"\n==================================================================\n")

print(f"Probabilities - \n\nHome Win: {prediction_proba[0][2]:.2f},\nDraw: {prediction_proba[0][1]:.2f},\nAway Win: {prediction_proba[0][0]:.2f}")





Accuracy: 0.7414


Classification Report:
               precision    recall  f1-score   support

           A       0.77      0.88      0.82       123
           D       0.67      0.19      0.30        83
           H       0.73      0.94      0.82       142

    accuracy                           0.74       348
   macro avg       0.72      0.67      0.65       348
weighted avg       0.73      0.74      0.70       348



Confusion Matrix:
 [[108   3  12]
 [ 30  16  37]
 [  3   5 134]]


A Win Probability: 0.78

D Win Probability: 0.09

H Win Probability: 0.13



Predicted Result: A


Probabilities - 

Home Win: 0.13,
Draw: 0.09,
Away Win: 0.78
