In [1]:

# IPL Winner Predictor - Fixed for Overfitting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, classification_report, confusion_matrix)
import pickle


In [2]:

# Load datasets
deliveries = pd.read_csv('deliveries.csv')
matches = pd.read_csv('matches.csv')


In [3]:

# Rename teams to consistent names
team_rename = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Kings XI Punjab': 'Punjab Kings',
    'Gujarat Lions': 'Gujarat Titans'
}
for col in ['team1', 'team2', 'winner']:
    matches[col] = matches[col].replace(team_rename)


In [4]:

first_innings = deliveries[deliveries['inning'] == 1]
total_scores = first_innings.groupby(['match_id', 'batting_team'])['total_runs'].sum().reset_index()
total_scores.rename(columns={'total_runs': 'target'}, inplace=True)
matches = matches.merge(total_scores, left_on='id', right_on='match_id')


In [5]:

second_innings = deliveries[deliveries['inning'] == 2]
merged = matches.merge(second_innings, left_on='id', right_on='match_id')
merged = merged.rename(columns={
    'batting_team_y': 'batting_team',
    'bowling_team': 'bowling_team',
    'city_x': 'city'
})


In [6]:

merged['current_score'] = merged.groupby('id')['total_runs'].cumsum()
merged['balls_left'] = 120 - (merged['over'] * 6 + merged['ball'])
merged['runs_left'] = merged['target'] - merged['current_score']
merged['player_dismissed'] = merged['player_dismissed'].fillna(0)
merged['player_dismissed'] = merged['player_dismissed'].apply(lambda x: 0 if x == 0 else 1)
merged['wickets'] = 10 - merged.groupby('id')['player_dismissed'].cumsum()
merged['crr'] = merged['current_score'] * 6 / (120 - merged['balls_left'])
merged['rrr'] = merged['runs_left'] * 6 / merged['balls_left']


In [7]:

merged['result'] = merged.apply(lambda row: 1 if row['batting_team'] == row['winner'] else 0, axis=1)
final_df = merged[['batting_team', 'bowling_team', 'city', 'runs_left', 'balls_left',
                   'wickets', 'crr', 'rrr', 'result']].dropna()
final_df = final_df[final_df['balls_left'] > 30]  # Avoid extreme easy cases


In [8]:

X = final_df.drop('result', axis=1)
y = final_df['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [9]:

cat_features = ['batting_team', 'bowling_team', 'city']
preprocessor = ColumnTransformer([
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_features)
], remainder='passthrough')

pipe = Pipeline([
    ('transform', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

pipe.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [10]:

y_pred = pipe.predict(X_test)
print("Train Accuracy:", accuracy_score(y_train, pipe.predict(X_train)))
print("Test Accuracy :", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Train Accuracy: 0.8384842676888676
Test Accuracy : 0.8345932708688245
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85     10531
           1       0.80      0.83      0.81      8253

    accuracy                           0.83     18784
   macro avg       0.83      0.83      0.83     18784
weighted avg       0.84      0.83      0.83     18784

Confusion Matrix:
 [[8858 1673]
 [1434 6819]]
Precision: 0.8029910504003768
Recall: 0.8262450018175209
F1 Score: 0.8144520752463422


In [11]:

from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
print("Baseline Dummy Accuracy:", dummy.score(X_test, y_test))


Baseline Dummy Accuracy: 0.5606367120954003


In [12]:

cv_score = cross_val_score(pipe, X, y, cv=5)
print("Cross-validated Accuracy:", cv_score.mean())


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validated Accuracy: 0.7604766989995394


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:

def predict_manual(batting_team, bowling_team, city, runs_left, balls_left, wickets, crr, rrr):
    abbrev = {
        'RCB': 'Royal Challengers Bangalore',
        'MI': 'Mumbai Indians',
        'CSK': 'Chennai Super Kings',
        'DC': 'Delhi Capitals',
        'KKR': 'Kolkata Knight Riders',
        'RR': 'Rajasthan Royals',
        'PBKS': 'Punjab Kings',
        'SRH': 'Sunrisers Hyderabad',
        'GT': 'Gujarat Titans',
        'LSG': 'Lucknow Super Giants'
    }
    data = pd.DataFrame({
        'batting_team': [abbrev.get(batting_team.upper(), batting_team)],
        'bowling_team': [abbrev.get(bowling_team.upper(), bowling_team)],
        'city': [city],
        'runs_left': [runs_left],
        'balls_left': [balls_left],
        'wickets': [wickets],
        'crr': [crr],
        'rrr': [rrr]
    })
    pred = pipe.predict_proba(data)[0]
    print(f"Win Probability: {round(pred[1]*100, 2)}% | Loss Probability: {round(pred[0]*100, 2)}%")


In [14]:

pickle.dump(pipe, open('ipl_model_final.pkl', 'wb'))
