# In [None]:
# Import required libraries
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load the match data from disk into `df`.
df = pd.read_csv("England CSV.csv")

# Remove exact duplicate rows, then every row that has a missing value in
# any column (including columns that are dropped further down).
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

# Sanity check: print summary statistics of the full-time goal columns so
# impossible values (e.g. negative goals) can be spotted by eye. Nothing is
# filtered automatically here.
print(df[['FTH Goals', 'FTA Goals']].describe())

# Cap home shots at their 99th percentile. The capped values go into a new
# column 'HS'; the original 'H Shots' column is left unmodified.
df['HS'] = df['H Shots'].clip(upper=df['H Shots'].quantile(0.99))

# Full-time goal difference from the home side's point of view.
df['Home_GoalDiff'] = df['FTH Goals'] - df['FTA Goals']

# Home/away ratios with add-one smoothing on both sides. A tiny epsilon in the
# denominator avoids the ZeroDivision but turns every "away count == 0" row
# into a value around 1e6-1e7, which then dominates any later scaling.
# Adding 1 to numerator and denominator keeps the ratio bounded by the
# largest home count + 1.
df['Shot_Ratio'] = (df['H Shots'] + 1) / (df['A Shots'] + 1)
df['Corner_Ratio'] = (df['H Corners'] + 1) / (df['A Corners'] + 1)

# Raw columns that are not passed on to the modelling frame.
columns_to_drop = ['Season', 'Referee', 'H SOT', 'A SOT', 'H Shots', 'A Shots', 'H Fouls', 'A Fouls', 'H Corners', 'A Corners', 'A Yellow', 'H Yellow', 'H Red', 'A Red', 'League', 'Display_Order']
df_reduced = df.drop(columns=columns_to_drop)

# 1. Encode team names
# Fit a single encoder on the union of home and away team names. Fitting on
# 'HomeTeam' alone makes transform() raise ValueError for any team that only
# appears in 'AwayTeam' (possible once rows have been dropped upstream).
le_teams = LabelEncoder()
le_teams.fit(pd.concat([df_reduced['HomeTeam'], df_reduced['AwayTeam']], ignore_index=True))
df_reduced['HomeTeam'] = le_teams.transform(df_reduced['HomeTeam'])
df_reduced['AwayTeam'] = le_teams.transform(df_reduced['AwayTeam'])

# 2. Scale numerical features
# Standardises the two columns in place, using statistics computed over all
# rows currently in df_reduced.
scaler = StandardScaler()
df_reduced[['HS', 'Home_GoalDiff']] = scaler.fit_transform(df_reduced[['HS', 'Home_GoalDiff']])

# 3. Extract time features
# NOTE(review): the date format used in the CSV is not visible here; if it is
# day-first (dd/mm/yyyy), pass dayfirst=True or an explicit format - confirm.
df_reduced['Date'] = pd.to_datetime(df_reduced['Date'])

# Put the matches in chronological order while the full date is still
# available. Sorting on the derived 'Year' alone (with the default, unstable
# sort) leaves matches within a year in arbitrary order, so the split below
# would not be a true time-based split. kind='stable' keeps file order for
# matches played on the same day.
df_reduced = df_reduced.sort_values('Date', kind='stable')

df_reduced['Year'] = df_reduced['Date'].dt.year
df_reduced['Month'] = df_reduced['Date'].dt.month
df_reduced.drop('Date', axis=1, inplace=True)

# Encode the target variable
label_encoder_result = LabelEncoder()
df_reduced['FT Result Encoded'] = label_encoder_result.fit_transform(df_reduced['FT Result'])
df_reduced.drop('FT Result', axis=1, inplace=True)

from sklearn.model_selection import train_test_split

# Split into features (X) and target (y).
# The full-time goal columns and their difference encode the full-time result
# directly, so keeping them as features leaks the target into the model.
# errors='ignore' keeps this step working if a column is already absent.
# NOTE(review): any other non-numeric column still present in df_reduced will
# make the StandardScaler step further down fail - confirm the CSV schema.
leakage_columns = ['FTH Goals', 'FTA Goals', 'Home_GoalDiff']
X = df_reduced.drop(columns='FT Result Encoded').drop(columns=leakage_columns, errors='ignore')  # Features
y = df_reduced['FT Result Encoded']  # Target (integer-encoded result)

# Time-based split: the earliest 80% of rows train, the latest 20% test.
train_size = int(0.8 * len(df_reduced))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# 2. Data Preparation
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler_final = StandardScaler().fit(X_train)

# The scaler returns plain arrays; wrap them back into DataFrames so the
# feature names stay attached (the row index is reset to 0..n-1).
X_train_scaled = pd.DataFrame(scaler_final.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler_final.transform(X_test), columns=X_test.columns)

# 3. Model Initialization
# Maps a display name to an unfitted classifier. `multi_class` is not passed
# to LogisticRegression: the parameter is deprecated in recent scikit-learn
# releases, and the lbfgs solver already uses the multinomial loss for a
# target with more than two classes, so the fitted model is the same.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss'),
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=1000),
}

# 4. Model Training and Evaluation
# Every model is trained on the scaled training features and scored on the
# scaled test features; per-model accuracy and the text classification report
# are collected in `results`, keyed by model name.
results = {}

# Pass the encoded labels explicitly so `target_names` always lines up with
# them, even when one of the classes does not occur in the test slice
# (otherwise classification_report raises on the length mismatch).
class_names = label_encoder_result.classes_
class_labels = np.arange(len(class_names))

for name, model in models.items():
    try:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        results[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'report': classification_report(y_test, y_pred,
                                            labels=class_labels,
                                            target_names=class_names)
        }
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on with the
        # remaining models instead of aborting the whole comparison.
        print(f"Error with {name}: {str(e)}")
        continue

# Print the results
for name, result in results.items():
    print(f"\n--- {name} ---")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print("Classification Report:\n", result['report'])