# IPL Match Winner Prediction - Complete ML Pipeline

In [2]:
import pandas as pd  # Data manipulation
import numpy as np  # Numerical operations
import matplotlib.pyplot as plt  # Plotting
import seaborn as sns  # Statistical visualizations
from sklearn.model_selection import train_test_split  # Split data into train/test
from sklearn.preprocessing import LabelEncoder  # Encode target variable
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # Tree-based models
from sklearn.linear_model import LogisticRegression  # Linear baseline model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Evaluation metrics
import warnings  # Suppress unnecessary warnings
# NOTE(review): 'ignore' with no category or module filter silences every warning
# raised after this point, including deprecation and parsing warnings from the
# libraries imported above — narrow it if specific warnings need to stay visible.
warnings.filterwarnings('ignore')

#### 1. LOAD DATA

In [5]:
# Read the IPL match records from the CSV file into a DataFrame
df = pd.read_csv('IPL.csv')
# Report the table dimensions as (rows, columns)
print("Dataset Shape:", df.shape)
# Preview the first five records underneath a heading line
print("\nFirst 5 rows:", df.head(), sep="\n")


Dataset Shape: (74, 20)

First 5 rows:
   match_id           date                                         venue  \
0         1  March 26,2022                      Wankhede Stadium, Mumbai   
1         2  March 27,2022                     Brabourne Stadium, Mumbai   
2         3  March 27,2022            Dr DY Patil Sports Academy, Mumbai   
3         4  March 28,2022                      Wankhede Stadium, Mumbai   
4         5  March 29,2022  Maharashtra Cricket Association Stadium,Pune   

       team1      team2  stage toss_winner toss_decision  first_ings_score  \
0    Chennai    Kolkata  Group     Kolkata         Field               131   
1      Delhi     Mumbai  Group       Delhi         Field               177   
2   Banglore     Punjab  Group      Punjab         Field               205   
3    Gujarat    Lucknow  Group     Gujarat         Field               158   
4  Hyderabad  Rajasthan  Group   Hyderabad         Field               210   

   first_ings_wkts  second_ings_sco

In [8]:
# List every column name in full (the head() preview wraps wide frames across lines)
df.columns

Index(['match_id', 'date', 'venue', 'team1', 'team2', 'stage', 'toss_winner',
       'toss_decision', 'first_ings_score', 'first_ings_wkts',
       'second_ings_score', 'second_ings_wkts', 'match_winner', 'won_by',
       'margin', 'player_of_the_match', 'top_scorer', 'highscore',
       'best_bowling', 'best_bowling_figure'],
      dtype='object')

#### 2. DATA CLEANING

In [6]:
# Strip whitespace from all string columns
for col in df.select_dtypes(include=['object']).columns:  # Loop through text columns
    stripped = df[col].str.strip()  # Remove leading/trailing spaces from string entries
    # .str.strip() yields NaN for any element that is not a string, so a mixed
    # object column (e.g. numbers alongside text) would lose those values if the
    # result were assigned back directly. Fall back to the original value wherever
    # stripping produced NaN; genuine missing values stay missing either way.
    df[col] = stripped.where(stripped.notna(), df[col])

In [12]:
# Convert date to datetime format
raw_dates = df['date']  # Keep the pre-conversion values so parse failures can be reported
df['date'] = pd.to_datetime(
    raw_dates,
    format='%B %d,%Y',
    errors='coerce'
)

# errors='coerce' replaces any value that does not match the format with NaT and
# emits no message. Surface those rows here so the loss is visible at the point
# it happens rather than only as an unexplained null count later on.
failed_dates = df['date'].isna() & raw_dates.notna()
if failed_dates.any():
    print(f"Could not parse {failed_dates.sum()} date value(s) with format '%B %d,%Y':")
    print(raw_dates[failed_dates].to_string())


In [13]:
# Report the number of null entries in each column, under a heading line
print("\nMissing values:", df.isna().sum(), sep="\n")


Missing values:
match_id               0
date                   1
venue                  0
team1                  0
team2                  0
stage                  0
toss_winner            0
toss_decision          0
first_ings_score       0
first_ings_wkts        0
second_ings_score      0
second_ings_wkts       0
match_winner           0
won_by                 0
margin                 0
player_of_the_match    0
top_scorer             0
highscore              0
best_bowling           0
best_bowling_figure    0
dtype: int64


#### 3. REMOVE DATA LEAKAGE (POST-MATCH INFO)