In [3]:
import pandas as pd
df = pd.read_csv("England CSV.csv")

In [5]:
df.head()

Unnamed: 0,Date,Season,HomeTeam,AwayTeam,FTH Goals,FTA Goals,FT Result,HTH Goals,HTA Goals,HT Result,...,H Fouls,A Fouls,H Corners,A Corners,H Yellow,A Yellow,H Red,A Red,Display_Order,League
0,16/01/2025,2024/25,Ipswich Town,Brighton & Hove Albion,0,2,A,0.0,1.0,A,...,13.0,14.0,1.0,9.0,2.0,2.0,0.0,0.0,20250116,Premier League
1,16/01/2025,2024/25,Man United,Southampton,3,1,H,0.0,1.0,A,...,7.0,10.0,4.0,4.0,1.0,3.0,0.0,0.0,20250116,Premier League
2,15/01/2025,2024/25,Everton,Aston Villa,0,1,A,0.0,0.0,D,...,17.0,10.0,8.0,5.0,2.0,1.0,0.0,0.0,20250115,Premier League
3,15/01/2025,2024/25,Leicester,Crystal Palace,0,2,A,0.0,0.0,D,...,7.0,6.0,4.0,3.0,0.0,0.0,0.0,0.0,20250115,Premier League
4,15/01/2025,2024/25,Newcastle,Wolves,3,0,H,1.0,0.0,H,...,10.0,13.0,4.0,2.0,0.0,2.0,0.0,0.0,20250115,Premier League


In [7]:
df.isna().sum()

Date                0
Season              0
HomeTeam            0
AwayTeam            0
FTH Goals           0
FTA Goals           0
FT Result           0
HTH Goals         924
HTA Goals         924
HT Result         924
Referee          2824
H Shots          2824
A Shots          2824
H SOT            2824
A SOT            2824
H Fouls          2824
A Fouls          2824
H Corners        2824
A Corners        2824
H Yellow         2824
A Yellow         2824
H Red            2824
A Red            2824
Display_Order       0
League              0
dtype: int64

In [9]:
# Drop duplicates
df.drop_duplicates(inplace=True)

In [11]:
df.dropna(inplace=True)

In [13]:
df.isna().sum()

Date             0
Season           0
HomeTeam         0
AwayTeam         0
FTH Goals        0
FTA Goals        0
FT Result        0
HTH Goals        0
HTA Goals        0
HT Result        0
Referee          0
H Shots          0
A Shots          0
H SOT            0
A SOT            0
H Fouls          0
A Fouls          0
H Corners        0
A Corners        0
H Yellow         0
A Yellow         0
H Red            0
A Red            0
Display_Order    0
League           0
dtype: int64

In [15]:
# Check for impossible values (e.g., negative goals)
print(df[['FTH Goals', 'FTA Goals']].describe())

         FTH Goals    FTA Goals
count  9329.000000  9329.000000
mean      1.536178     1.179333
std       1.305748     1.157003
min       0.000000     0.000000
25%       1.000000     0.000000
50%       1.000000     1.000000
75%       2.000000     2.000000
max       9.000000     9.000000


In [17]:
# Clip extreme shots/corners (if needed)
df['HS'] = df['H Shots'].clip(upper=df['H Shots'].quantile(0.99))

In [21]:
# Feature engineering (better than outlier removal)
df['Home_GoalDiff'] = df['FTH Goals'] - df['FTA Goals']

In [23]:
# Keep key stats and engineer ratios:
df['Shot_Ratio'] = df['H Shots'] / (df['A Shots'] + 1e-6)  # Avoid division by zero
df['Corner_Ratio'] = df['H Corners'] / (df['A Corners'] + 1e-6)

In [25]:
columns_to_drop = ['Season', 'Referee', 'H SOT', 'A SOT', 'H Shots', 'A Shots', 'H Fouls', 'A Fouls', 'H Corners', 'A Corners', 'A Yellow', 'H Yellow', 'H Red', 'A Red', 'League', 'Display_Order']
df_reduced = df.drop(columns=columns_to_drop)

In [27]:
df_reduced 

Unnamed: 0,Date,HomeTeam,AwayTeam,FTH Goals,FTA Goals,FT Result,HTH Goals,HTA Goals,HT Result,HS,Home_GoalDiff,Shot_Ratio,Corner_Ratio
0,16/01/2025,Ipswich Town,Brighton & Hove Albion,0,2,A,0.0,1.0,A,5.0,-2,0.454545,0.111111
1,16/01/2025,Man United,Southampton,3,1,H,0.0,1.0,A,23.0,2,1.769231,1.000000
2,15/01/2025,Everton,Aston Villa,0,1,A,0.0,0.0,D,10.0,-1,0.909091,1.600000
3,15/01/2025,Leicester,Crystal Palace,0,2,A,0.0,0.0,D,21.0,-2,2.333333,1.333333
4,15/01/2025,Newcastle,Wolves,3,0,H,1.0,0.0,H,17.0,3,1.307692,1.999999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9324,19/08/2000,Leicester,Aston Villa,0,0,D,0.0,0.0,D,5.0,0,1.000000,1.250000
9325,19/08/2000,Leeds,Everton,2,0,H,2.0,0.0,H,17.0,2,1.416667,1.500000
9326,19/08/2000,Derby,Southampton,2,2,D,1.0,2.0,A,6.0,0,0.461538,0.625000
9327,19/08/2000,Coventry,Middlesbrough,1,3,A,1.0,1.0,D,6.0,-2,0.375000,2.000000


In [29]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Encode team names
le = LabelEncoder()
df['HomeTeam'] = le.fit_transform(df['HomeTeam'])
df['AwayTeam'] = le.transform(df['AwayTeam'])

# 2. Scale numerical features
scaler = StandardScaler()
df[['HS', 'Home_GoalDiff']] = scaler.fit_transform(df[['HS', 'Home_GoalDiff']])

# 3. Extract time features
df['Year'] = pd.to_datetime(df['Date']).dt.year
df['Month'] = pd.to_datetime(df['Date']).dt.month
df.drop('Date', axis=1, inplace=True)

  df['Year'] = pd.to_datetime(df['Date']).dt.year
  df['Month'] = pd.to_datetime(df['Date']).dt.month


In [31]:
df['FT Result'] = df['FT Result'].map({'H': 0, 'D': 1, 'A': 2})

In [33]:
from sklearn.model_selection import train_test_split

# Sort by date (if not already sorted)
df_reduced  = df_reduced .sort_values('Date')

# Split into features (X) and target (y)
X = df_reduced .drop(['FT Result', 'Date'], axis=1)  # Features
y = df_reduced ['FT Result']                         # Target (H=0, D=1, A=2)

# Time-based split (e.g., 80% train, 20% test)
train_size = int(0.8 * len(df))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [None]:
pip install xgboost

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# 1. Encode categorical variables (team names)
categorical_cols = ['HomeTeam', 'AwayTeam']

# Label Encoding for tree-based models
le = LabelEncoder()
for col in categorical_cols:
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

# 2. Scale features for Logistic Regression (not needed for tree-based models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Encode target variable (if not already done)
y_train_encoded = y_train.map({'H': 0, 'D': 1, 'A': 2})
y_test_encoded = y_test.map({'H': 0, 'D': 1, 'A': 2})

# 4. Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
    "Logistic Regression": LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
}

# 5. Train and evaluate models
results = {}
for name, model in models.items():
    # Use scaled data for Logistic Regression, raw for others
    if name == "Logistic Regression":
        model.fit(X_train_scaled, y_train_encoded)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train_encoded)
        y_pred = model.predict(X_test)
    
    # Store results
    results[name] = {
        'accuracy': accuracy_score(y_test_encoded, y_pred),
        'report': classification_report(y_test_encoded, y_pred, target_names=['Home', 'Draw', 'Away'])
    }

# 6. Print results
for model_name, metrics in results.items():
    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print("Classification Report:")
    print(metrics['report'])

# 7. Feature Importance (for tree-based models)
print("\nFeature Importances:")
rf_importances = models["Random Forest"].feature_importances_
xgb_importances = models["XGBoost"].feature_importances_

importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'RF Importance': rf_importances,
    'XGB Importance': xgb_importances
}).sort_values('RF Importance', ascending=False)

print(importance_df.head(10))