# IPL Match Outcome: Model Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import joblib

# Read the raw match-level records from the CSV export.
matches = pd.read_csv('IPL Matches 2008-2020.csv')

# --- Data Cleaning and Preprocessing ---

# Several franchises appear under more than one name in the data. Map each
# legacy spelling to the name used for the rest of the pipeline. The
# replacement is applied to every column of the frame, so team1, team2,
# toss_winner and winner all end up consistent.
TEAM_RENAMES = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Rising Pune Supergiant': 'Rising Pune Supergiants',  # One season name change
}
for legacy_name, current_name in TEAM_RENAMES.items():
    matches = matches.replace([legacy_name], current_name)

# A match with no recorded winner (e.g., washed out) has no label to learn
# from, so those rows are removed.
matches = matches.dropna(subset=['winner'])

# --- Feature Engineering ---
# Deliberately small feature set: the two sides, the ground, and the toss
# outcome. 'winner' is carried along as the prediction target.
categorical_cols = ['team1', 'team2', 'venue', 'toss_winner', 'toss_decision', 'winner']
df = matches.loc[:, categorical_cols].copy()

# Turn every string column into integer codes. The fitted encoder for each
# column is kept in `encoders`, keyed by column name, so the same mapping
# can be reused after training.
# NOTE(review): every column gets its own independent encoder, so a given
# team only has the same integer code in team1/team2/toss_winner/winner if
# those columns contain identical sets of names -- confirm consumers always
# use the encoder that matches the column.
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# --- Model Training ---
# Separate the encoded target from the feature columns.
y = df['winner']
X = df.drop(columns=['winner'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# XGBoost classifier evaluated with multi-class log loss; seeded for
# repeatable training.
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
model.fit(X_train, y_train)

# Report the score on the held-out rows as a percentage.
test_accuracy = model.score(X_test, y_test)
print(f"Model Accuracy on Test Set: {test_accuracy * 100:.2f}%")

# --- Save Model and Encoders ---
# Persist the trained classifier together with the fitted encoders; both are
# written to the current working directory.
for artifact, filename in (
    (model, 'cricket_model.pkl'),
    (encoders, 'encoders.pkl'),
):
    joblib.dump(artifact, filename)

print("Model and encoders saved successfully.")