<a href="https://colab.research.google.com/github/FieryCatalyst/ML-Models/blob/main/Premier_League_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Premier League Prediction Model Training
Trains the Premier League (PL) prediction model on historical match data covering the 1993 through 2024 seasons.



In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np
import shutil # Import shutil for directory operations

# ===== Load football-data.co.uk CSV (historical) =====
def load_historical_csv(directory_path="/content/football data"):
    """Load every CSV file in ``directory_path`` into one combined DataFrame.

    Files are read in sorted path order so that the row order of the combined
    frame is reproducible between runs (``os.listdir`` makes no ordering
    guarantee). Files that cannot be read are reported and skipped; the
    remaining files are still combined.

    Args:
        directory_path: Directory that holds the historical CSV files.

    Returns:
        A DataFrame with the rows of all readable CSV files concatenated
        (fresh 0..n-1 index) and surrounding whitespace stripped from the
        column names. An empty DataFrame is returned when the directory does
        not exist, contains no CSV files, or none of the files could be read.
    """
    if not os.path.isdir(directory_path):
        print(f"Directory not found: {directory_path}")
        return pd.DataFrame()

    # Match the extension case-insensitively so "E0.CSV" is not ignored.
    csv_files = sorted(
        os.path.join(directory_path, name)
        for name in os.listdir(directory_path)
        if name.lower().endswith(".csv")
    )

    if not csv_files:
        print(f"No CSV files found in {directory_path}")
        return pd.DataFrame()

    all_dataframes = []
    for csv_file in csv_files:
        try:
            # latin1 never fails to decode; malformed rows are dropped
            # instead of aborting the whole file.
            df = pd.read_csv(csv_file, encoding="latin1", low_memory=False, on_bad_lines='skip')
        except Exception as e:
            # Best effort by design: one unreadable file must not stop the load.
            print(f"Error loading {csv_file}: {e}")
            continue
        all_dataframes.append(df)
        print(f"Loaded {os.path.basename(csv_file)}")

    if not all_dataframes:
        print("❌ No historical data could be loaded.")
        return pd.DataFrame()

    combined_df = pd.concat(all_dataframes, ignore_index=True)
    # Only string labels can be stripped; leave any other label untouched.
    combined_df = combined_df.rename(
        columns=lambda x: x.strip() if isinstance(x, str) else x
    )
    print("✅ All historical CSV files combined.")
    return combined_df

# ===== Create target =====
def create_target(df):
    """Add the match-outcome column ``result`` to ``df`` and return ``df``.

    The outcome is the sign of the full-time goal difference
    (``fthg - ftag``): 1 for a home win, 0 for a draw, -1 for an away win.
    Rows whose goal counts are missing or not numeric get NaN instead of
    being labelled as draws, so callers can drop them before training.

    Args:
        df: Match data. Modified in place: the ``result`` column is added or
            overwritten.

    Returns:
        The same DataFrame. ``result`` has an integer dtype when every row
        has a valid score and a float dtype (with NaN) otherwise. When the
        ``fthg``/``ftag`` columns are absent, ``result`` is all NaN.
    """
    print("ℹ️ Columns available in create_target:", df.columns.tolist()) # Debug print

    # Use the goals from historical data (lowercase column names after standardization)
    if "fthg" in df.columns and "ftag" in df.columns:
        # Coerce so stray text cells become NaN rather than being compared
        # lexicographically or raising.
        home_goals = pd.to_numeric(df["fthg"], errors="coerce")
        away_goals = pd.to_numeric(df["ftag"], errors="coerce")

        # Vectorised replacement for the row-wise comparison; NaN propagates.
        result = np.sign(home_goals - away_goals)
        if result.notna().all():
            # Keep integer class labels whenever no score is missing.
            result = result.astype(int)

        df["result"] = result
        print("✅ Target variable 'result' created based on historical data.")
    else:
        print("❌ Could not create target variable: Historical goal columns (fthg, ftag) not found after standardization.")
        df["result"] = np.nan # Or handle appropriately

    return df

# ===== Feature Engineering (Simplified) =====
def add_features(df):
    """Attach per-match goal-difference columns to ``df``.

    Adds ``home_goal_diff`` (``fthg - ftag``) and ``away_goal_diff``
    (``ftag - fthg``). The frame is modified in place and also returned.
    When either goal column is absent the frame is returned unchanged.
    """
    print("ℹ️ Columns available in add_features:", df.columns.tolist()) # Debug print

    # Lowercase names, as produced by the column standardization step.
    home_goals_col, away_goals_col = "fthg", "ftag"

    # Guard clause: nothing can be derived without both goal columns.
    goal_columns_present = all(
        name in df.columns for name in (home_goals_col, away_goals_col)
    )
    if not goal_columns_present:
        print("❌ Could not find suitable goal columns for feature engineering after standardization.")
        return df

    print(f"ℹ️ Using '{home_goals_col}' and '{away_goals_col}' for goal calculations.")

    scored_home = df[home_goals_col]
    scored_away = df[away_goals_col]
    df["home_goal_diff"] = scored_home - scored_away
    df["away_goal_diff"] = scored_away - scored_home
    print("✅ 'home_goal_diff' and 'away_goal_diff' features created.")

    return df

# ===== Train Model (Simplified Feature Set) =====
def train_model(df, include_goal_diff=False):
    """Train, evaluate and save a random-forest match-outcome classifier.

    Steps: pick the available match-statistic columns, drop rows without a
    target, make a stratified 80/20 train/test split, standardise the
    features, run a randomized hyper-parameter search with 5-fold
    cross-validation, print test accuracy and a classification report, and
    write the best model and the fitted scaler to ``models/``.

    Args:
        df: Match data with a ``result`` column and lowercase statistic
            columns (``hs``, ``as``, ``hst``, ...). Missing statistics are
            treated as 0.
        include_goal_diff: When True, also use ``home_goal_diff`` and
            ``away_goal_diff`` as features. Off by default because those
            columns are the full-time goal difference, whose sign *is* the
            target; training on them leaks the label and makes the reported
            accuracy meaningless. Only enable to reproduce legacy models.

    Returns:
        The best ``RandomForestClassifier`` found by the search, or ``None``
        when no feature column is available or no row has a target value.
    """
    print("ℹ️ Columns available in train_model:", df.columns.tolist()) # Debug print

    # Home/away column pairs, using the lowercase names produced by the
    # column standardization step.
    feature_candidates = {
        "shots": ("hs", "as"),
        "shots_on_target": ("hst", "ast"),
        "fouls": ("hf", "af"),
        "corners": ("hc", "ac"),
        "yellows": ("hy", "ay"),
        "reds": ("hr", "ar"),
    }
    if include_goal_diff:
        # Label leakage: see the docstring before turning this on.
        feature_candidates["goal_diff"] = ("home_goal_diff", "away_goal_diff")

    # Never train on the raw goal columns the target is built from.
    target_sources = {"fthg", "ftag"}
    features = [
        col
        for pair in feature_candidates.values()
        for col in pair
        if col in df.columns and col not in target_sources
    ]

    if not features:
        print("❌ No suitable features found for training.")
        return None

    print(f"Using features: {features}")

    # Coerce stray text cells to NaN first so the scaler never sees strings.
    X = df[features].apply(pd.to_numeric, errors="coerce").fillna(0)
    y = df["result"]

    # Keep only rows that actually have a target value.
    has_target = y.notna()
    X = X[has_target]
    y = y[has_target]

    if X.empty or y.empty:
        print("❌ No valid data with target variable to train on.")
        return None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split so no test-set statistics influence training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Randomized hyperparameter search
    param_dist = {
        "n_estimators": [200, 300, 400],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2", None],
        "class_weight": ["balanced"]
    }
    rf = RandomForestClassifier(random_state=42)
    # Seed the search as well; otherwise the sampled parameter combinations
    # (and therefore the saved model) differ from run to run.
    clf = RandomizedSearchCV(rf, param_distributions=param_dist,
                             n_iter=10, cv=5, n_jobs=-1, scoring="accuracy",
                             random_state=42)
    clf.fit(X_train_scaled, y_train)

    y_pred = clf.predict(X_test_scaled)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred))

    os.makedirs("models", exist_ok=True)
    joblib.dump(clf.best_estimator_, "models/premier_league_winner_model.pkl")
    joblib.dump(scaler, "models/scaler.pkl")
    print("✅ Model and scaler saved successfully!")

    return clf.best_estimator_

# ===== Main pipeline =====
# Loads the historical CSVs, standardizes column names, parses match dates,
# builds the target and features, and trains the model. `model` ends up as the
# trained estimator, or None when any step could not be completed.

# Ensure the data directory exists
os.makedirs("data", exist_ok=True)
os.makedirs("/content/football data", exist_ok=True) # Ensure the source directory for historical data exists

# Directly load historical data from the specified directory
df = load_historical_csv(directory_path="/content/football data")

# Defined up front so later cells can test `model is None` instead of hitting
# a NameError when loading or target creation fails.
model = None

# Check if historical data was loaded successfully
if not df.empty:
    # Standardize column names: trimmed, lowercase, underscores for spaces
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Standardized source name -> name used by the rest of the pipeline.
    # Columns not listed here keep their standardized name.
    rename_map_lower = {
        "full_time_result": "result",
        "hometeam": "home_team", "awayteam": "away_team", "date": "match_date",
        "red_card": "ar", "yellow_card": "ay", # Add mappings for common variations
        # Add other potentially relevant columns from historical data if needed
    }

    # Skip a rename whose target name is already taken: duplicate column
    # labels would make df["<name>"] return a DataFrame instead of a Series.
    taken_names = set(df.columns)
    new_columns = {}
    for source_name, wanted_name in rename_map_lower.items():
        if source_name in taken_names and wanted_name not in taken_names:
            new_columns[source_name] = wanted_name
            taken_names.add(wanted_name)

    df.rename(columns=new_columns, inplace=True)

    print("ℹ️ Columns after standardization and renaming:", df.columns.tolist()) # Debug print

    # Convert date column to datetime
    if "match_date" in df.columns:
        # Keep the raw values: every fallback must re-parse the original
        # text, because re-parsing an already converted column cannot
        # recover rows that became NaT.
        raw_dates = df["match_date"]
        parsed_dates = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")

        # Presumably the files mix dd/mm/yyyy and dd/mm/yy dates (assumption
        # about the football-data.co.uk exports - confirm against the CSVs).
        # Explicit formats avoid per-call format guessing.
        for date_format in ("%d/%m/%Y", "%d/%m/%y"):
            pending = parsed_dates.isna() & raw_dates.notna()
            if not pending.any():
                break
            parsed_dates.loc[pending] = pd.to_datetime(
                raw_dates[pending], format=date_format, errors="coerce"
            )

        # Last resort for any other layout: generic day-first parsing.
        pending = parsed_dates.isna() & raw_dates.notna()
        if pending.any():
            parsed_dates.loc[pending] = pd.to_datetime(
                raw_dates[pending], errors="coerce", dayfirst=True
            )

        df["match_date"] = parsed_dates

    # Create target variable (will use standardized fthg/ftag)
    df = create_target(df)

    # Check if target was created successfully before adding features and training
    if "result" in df.columns and not df["result"].isnull().all():
        df = add_features(df)
        model = train_model(df)
    else:
        print("❌ Target variable could not be created. Skipping model training.")
else:
    print("❌ Historical data could not be loaded. Skipping further steps.")


Loaded E0(12).csv
Loaded E0(23).csv
Loaded E0(17).csv
Loaded E0(14).csv
Loaded E0(31).csv
Loaded E0(26).csv
Loaded E0(2).csv
Loaded E0(15).csv
Loaded E0(4).csv
Loaded E0(30).csv
Loaded E0(7).csv
Loaded E0(18).csv
Loaded E0(24).csv
Loaded E0(32).csv
Loaded E0(11).csv
Loaded E0(10).csv
Loaded E0(19).csv
Loaded E0(1).csv
Loaded E0(3).csv
Loaded E0(20).csv
Loaded E0(21).csv
Loaded E0.csv
Loaded E0(16).csv
Loaded E0(8).csv
Loaded E0(22).csv
Loaded E0(29).csv
Loaded E0(6).csv
Loaded E0(25).csv
Loaded E0(28).csv
Loaded E0(5).csv
Loaded E0(9).csv
Loaded E0(13).csv
Loaded E0(27).csv
✅ All historical CSV files combined.
ℹ️ Columns after standardization and renaming: ['div', 'match_date', 'home_team', 'away_team', 'fthg', 'ftag', 'ftr', 'hthg', 'htag', 'htr', 'referee', 'hs', 'as', 'hst', 'ast', 'hf', 'af', 'hc', 'ac', 'hy', 'ay', 'hr', 'ar', 'b365h', 'b365d', 'b365a', 'bwh', 'bwd', 'bwa', 'gbh', 'gbd', 'gba', 'iwh', 'iwd', 'iwa', 'lbh', 'lbd', 'lba', 'psh', 'psd', 'psa', 'whh', 'whd', 'wha', 'sj

  df["match_date"] = pd.to_datetime(df["match_date"], errors="coerce", dayfirst=True)


ℹ️ Columns available in create_target: ['div', 'match_date', 'home_team', 'away_team', 'fthg', 'ftag', 'ftr', 'hthg', 'htag', 'htr', 'referee', 'hs', 'as', 'hst', 'ast', 'hf', 'af', 'hc', 'ac', 'hy', 'ay', 'hr', 'ar', 'b365h', 'b365d', 'b365a', 'bwh', 'bwd', 'bwa', 'gbh', 'gbd', 'gba', 'iwh', 'iwd', 'iwa', 'lbh', 'lbd', 'lba', 'psh', 'psd', 'psa', 'whh', 'whd', 'wha', 'sjh', 'sjd', 'sja', 'vch', 'vcd', 'vca', 'bsh', 'bsd', 'bsa', 'bb1x2', 'bbmxh', 'bbavh', 'bbmxd', 'bbavd', 'bbmxa', 'bbava', 'bbou', 'bbmx>2.5', 'bbav>2.5', 'bbmx<2.5', 'bbav<2.5', 'bbah', 'bbahh', 'bbmxahh', 'bbavahh', 'bbmxaha', 'bbavaha', 'psch', 'pscd', 'psca', 'attendance', 'hhw', 'ahw', 'ho', 'ao', 'hbp', 'abp', 'sbh', 'sbd', 'sba', 'syh', 'syd', 'sya', 'unnamed:_7', 'unnamed:_8', 'unnamed:_9', 'unnamed:_10', 'unnamed:_11', 'unnamed:_12', 'unnamed:_13', 'unnamed:_14', 'unnamed:_15', 'unnamed:_16', 'unnamed:_17', 'unnamed:_18', 'unnamed:_19', 'unnamed:_20', 'unnamed:_21', 'unnamed:_22', 'unnamed:_23', 'unnamed:_24',