In [35]:
#  Install Dependencies
!pip install xgboost scikit-learn pandas numpy

#  Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier
import joblib

#  Load and Add 'match_type' Column Before Combining
ipl = pd.read_csv('/content/ipl_matches.csv')
ipl['match_type'] = 'IPL'

t20i = pd.read_csv('/content/t20i_matches.csv')
t20i['match_type'] = 'T20I'

combined_df = pd.concat([ipl, t20i], ignore_index=True)

combined_df.head()



Unnamed: 0,match_id,team1,team2,venue,date,winner,won_by,win_margin,innings_1_team,innings_1_runs,...,innings_2_over_12_runs,innings_2_over_13_runs,innings_2_over_14_runs,innings_2_over_15_runs,innings_2_over_16_runs,innings_2_over_17_runs,innings_2_over_18_runs,innings_2_over_19_runs,innings_2_over_20_runs,match_type
0,1178427,Kings XI Punjab,Kolkata Knight Riders,Punjab Cricket Association IS Bindra Stadium,2019-05-03,Kolkata Knight Riders,chasing,by 7 wickets,Kings XI Punjab,183,...,8,18,17,6,6,9,19,0,0,IPL
1,1178428,Rajasthan Royals,Delhi Capitals,Arun Jaitley Stadium,2019-05-04,Delhi Capitals,chasing,by 5 wickets,Rajasthan Royals,115,...,2,4,3,16,15,6,0,0,0,IPL
2,1178429,Sunrisers Hyderabad,Royal Challengers Bangalore,M.Chinnaswamy Stadium,2019-05-04,Royal Challengers Bangalore,chasing,by 4 wickets,Sunrisers Hyderabad,175,...,10,13,6,9,16,5,14,5,8,IPL
3,1178430,Chennai Super Kings,Kings XI Punjab,Punjab Cricket Association IS Bindra Stadium,2019-05-05,Kings XI Punjab,chasing,by 6 wickets,Chennai Super Kings,170,...,4,9,12,13,12,3,7,0,0,IPL
4,1178431,Kolkata Knight Riders,Mumbai Indians,Wankhede Stadium,2019-05-05,Mumbai Indians,chasing,by 9 wickets,Kolkata Knight Riders,133,...,3,5,12,13,7,6,0,0,0,IPL


In [36]:
#  Clean Team Names
def clean_team_name(team):
    replacements = {
        'Kings XI Punjab': 'Punjab Kings',
        'Delhi Daredevils': 'Delhi Capitals',
        'Royal Challengers Bengaluru': 'Royal Challengers Bangalore',
        'Deccan Chargers': None,
        'Gujarat Lions': None,
        'Kochi Tuskers Kerala': None,
        'Pune Warriors': None,
        'Rising Pune Supergiant': None,
        'Rising Pune Supergiants': None
    }
    return replacements.get(team, team)

for col in ['innings_1_team', 'innings_2_team', 'winner']:
    combined_df[col] = combined_df[col].apply(clean_team_name)

combined_df = combined_df.dropna(subset=['innings_1_team', 'innings_2_team'])

In [37]:
#  Team Strength Dictionaries
t20i_team_strength = {
    'India': 10, 'Australia': 9.5, 'England': 9, 'New Zealand': 8.5, 'Pakistan': 7.5,
    'South Africa': 8.5, 'Sri Lanka': 7.5, 'West Indies': 7.5, 'Bangladesh': 6.5,
    'Afghanistan': 7, 'Ireland': 5, 'Zimbabwe': 5, 'Scotland': 3.5,
    'Nepal': 5, 'Netherlands': 3, 'UAE': 2.5, 'Namibia': 4, 'USA': 3,
    'Germany': 1.5, 'Argentina': 1, 'Oman': 2, 'Bermuda': 1.5, 'Hong Kong': 2, 'Kenya': 2.5, 'Canada': 2, 'Singapore': 1.5
}

ipl_team_strength = {
    'Chennai Super Kings': 8, 'Mumbai Indians': 9, 'Gujarat Titans': 8.5,
    'Rajasthan Royals': 8, 'Lucknow Super Giants': 8, 'Royal Challengers Bangalore': 8.5,
    'Delhi Capitals': 8, 'Punjab Kings': 8.5, 'Sunrisers Hyderabad': 8,
    'Kolkata Knight Riders': 8
}


In [38]:
#  Assign Strengths
def get_team_strength(row, team_column):
    team = row[team_column]
    match_type = row['match_type']
    return (t20i_team_strength if match_type == 'T20I' else ipl_team_strength).get(team, 7)

combined_df['batting_team_rating'] = combined_df.apply(lambda row: get_team_strength(row, 'innings_1_team'), axis=1)
combined_df['bowling_team_rating'] = combined_df.apply(lambda row: get_team_strength(row, 'innings_2_team'), axis=1)

combined_df.head()

Unnamed: 0,match_id,team1,team2,venue,date,winner,won_by,win_margin,innings_1_team,innings_1_runs,...,innings_2_over_14_runs,innings_2_over_15_runs,innings_2_over_16_runs,innings_2_over_17_runs,innings_2_over_18_runs,innings_2_over_19_runs,innings_2_over_20_runs,match_type,batting_team_rating,bowling_team_rating
0,1178427,Kings XI Punjab,Kolkata Knight Riders,Punjab Cricket Association IS Bindra Stadium,2019-05-03,Kolkata Knight Riders,chasing,by 7 wickets,Punjab Kings,183,...,17,6,6,9,19,0,0,IPL,8.5,8.0
1,1178428,Rajasthan Royals,Delhi Capitals,Arun Jaitley Stadium,2019-05-04,Delhi Capitals,chasing,by 5 wickets,Rajasthan Royals,115,...,3,16,15,6,0,0,0,IPL,8.0,8.0
2,1178429,Sunrisers Hyderabad,Royal Challengers Bangalore,M.Chinnaswamy Stadium,2019-05-04,Royal Challengers Bangalore,chasing,by 4 wickets,Sunrisers Hyderabad,175,...,6,9,16,5,14,5,8,IPL,8.0,8.5
3,1178430,Chennai Super Kings,Kings XI Punjab,Punjab Cricket Association IS Bindra Stadium,2019-05-05,Punjab Kings,chasing,by 6 wickets,Chennai Super Kings,170,...,12,13,12,3,7,0,0,IPL,8.0,8.5
4,1178431,Kolkata Knight Riders,Mumbai Indians,Wankhede Stadium,2019-05-05,Mumbai Indians,chasing,by 9 wickets,Kolkata Knight Riders,133,...,12,13,7,6,0,0,0,IPL,8.0,9.0


In [39]:
#  Standardize Venue Names
venue_map = {
    "Arun Jaitley Stadium": "Arun Jaitley Stadium, Delhi",
    "Brabourne Stadium": "Brabourne Stadium, Mumbai",
    "Dr. DY Patil Sports Academy": "Dr. DY Patil Sports Academy, Mumbai",
    "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium": "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam",
    "Eden Gardens": "Eden Gardens, Kolkata",
    "Himachal Pradesh Cricket Association Stadium": "Himachal Pradesh Cricket Association Stadium, Dharamsala",
    "M Chinnaswamy Stadium": "M Chinnaswamy Stadium, Bengaluru",
    "MA Chidambaram Stadium": "MA Chidambaram Stadium, Chepauk, Chennai",
    "MA Chidambaram Stadium, Chepauk": "MA Chidambaram Stadium, Chepauk, Chennai",
    "Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur": "Maharaja Yadavindra Singh International Cricket Stadium, New Chandigarh",
    "Maharashtra Cricket Association Stadium": "Maharashtra Cricket Association Stadium, Pune",
    "Punjab Cricket Association IS Bindra Stadium": "Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh",
    "Punjab Cricket Association Stadium": "Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh",
    "Rajiv Gandhi International Stadium": "Rajiv Gandhi International Stadium, Uppal, Hyderabad",
    "Rajiv Gandhi International Stadium, Uppal": "Rajiv Gandhi International Stadium, Uppal, Hyderabad",
    "Sawai Mansingh Stadium": "Sawai Mansingh Stadium, Jaipur",
    "Wankhede Stadium": "Wankhede Stadium, Mumbai"
}
combined_df['venue'] = combined_df['venue'].replace(venue_map)


In [40]:
#  Encode Categorical
le_venue = LabelEncoder()
le_match_type = LabelEncoder()
le_team = LabelEncoder()

combined_df['venue_code'] = le_venue.fit_transform(combined_df['venue'])
combined_df['match_type_code'] = le_match_type.fit_transform(combined_df['match_type'])

In [41]:
#  Powerplay Columns

combined_df.rename(columns={
    'innings_1_powerplay_runs': 'powerplay_runs',
    'innings_1_powerplay_wkts': 'powerplay_wickets'
}, inplace=True)

In [42]:
#  Venue Averages

# Encode venue as numerical code
combined_df['venue_code'] = combined_df['venue'].astype('category').cat.codes

# Create total_runs and total_boundaries columns (from innings 1 only)
combined_df['total_runs'] = combined_df['innings_1_runs']
combined_df['total_boundaries'] = combined_df['innings_1_fours'] + combined_df['innings_1_sixes']

venue_stats = combined_df.groupby('venue_code').agg({
    'total_runs': 'mean',
    'total_boundaries': ['mean', 'std']
})
venue_stats.columns = ['venue_avg_first_innings_score', 'venue_avg_boundaries', 'venue_boundary_std']
venue_stats.reset_index(inplace=True)
combined_df = combined_df.merge(venue_stats, on='venue_code', how='left')

In [43]:
#  Head-to-Head Win Ratio
h2h = combined_df.groupby(['innings_1_team', 'innings_2_team'])['winner'].value_counts().unstack().fillna(0)
h2h['win_ratio'] = h2h.max(axis=1) / h2h.sum(axis=1)
combined_df['head_to_head_win_ratio'] = combined_df.apply(lambda row: h2h.loc[(row['innings_1_team'], row['innings_2_team'])]['win_ratio']
                                                           if (row['innings_1_team'], row['innings_2_team']) in h2h.index else 0.5, axis=1)

In [44]:
#  Recent Form (Last 5 Matches)
combined_df = combined_df.sort_values(by=['date'])
combined_df['last_5_avg_score'] = combined_df.groupby('innings_1_team')['total_runs'].transform(lambda x: x.shift().rolling(5).mean())
combined_df['last_5_avg_score'].fillna(combined_df['batting_team_rating'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['last_5_avg_score'].fillna(combined_df['batting_team_rating'], inplace=True)


In [45]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3642 entries, 2918 to 1936
Data columns (total 94 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   match_id                       3642 non-null   object 
 1   team1                          3642 non-null   object 
 2   team2                          3642 non-null   object 
 3   venue                          3642 non-null   object 
 4   date                           3642 non-null   object 
 5   winner                         3614 non-null   object 
 6   won_by                         3613 non-null   object 
 7   win_margin                     3613 non-null   object 
 8   innings_1_team                 3642 non-null   object 
 9   innings_1_runs                 3642 non-null   int64  
 10  innings_1_fours                3642 non-null   int64  
 11  innings_1_sixes                3642 non-null   int64  
 12  innings_1_top_scorer           3642 non-null   obj

In [46]:
combined_df['innings_1_boundaries'] = combined_df['innings_1_fours'] + combined_df['innings_1_sixes']
combined_df['innings_2_boundaries'] = combined_df['innings_2_fours'] + combined_df['innings_2_sixes']
combined_df['match_total_boundaries'] = combined_df['innings_1_boundaries'] + combined_df['innings_2_boundaries']

In [47]:
def bucket(x):
    if x < 25: return '<25'
    elif x < 40: return '25-40'
    elif x < 55: return '41-55'
    else: return '>55'

combined_df['boundary_bucket'] = combined_df['match_total_boundaries'].apply(bucket)

In [48]:
# Encode match_type as a category
combined_df['match_type_code'] = combined_df['match_type'].astype('category').cat.codes

In [49]:
combined_df.rename(columns={
    'powerplay_runs': 'innings_1_powerplay_runs',
    'powerplay_wickets': 'innings_1_powerplay_wkts'
}, inplace=True)

In [50]:
#  Define Features
# Assume combined_df is already loaded and cleaned
df = combined_df.copy()

# Remove legacy teams
old_teams = ['Pune Warriors', 'Deccan Chargers', 'Gujarat Lions',
             'Rising Pune Supergiant', 'Rising Pune Supergiants', 'Kochi Tuskers Kerala']
df = df[~df['innings_1_team'].isin(old_teams)]
df = df[~df['innings_2_team'].isin(old_teams)]

# Create target
df["is_winner"] = (df["innings_1_team"] == df["winner"]).astype(int)

# Feature engineering
df["net_team_rating"] = df["batting_team_rating"] - df["bowling_team_rating"]
df["innings_1_powerplay_loss_rate"] = df["innings_1_powerplay_wkts"] / 6

venue_pp = df.groupby("venue")["innings_1_powerplay_runs"].mean().reset_index()
venue_pp.columns = ["venue", "venue_avg_pp"]
df = df.merge(venue_pp, on="venue", how="left")

df["net_run_rate_pp"] = df["innings_1_powerplay_runs"] - df["venue_avg_pp"]
df["collapse_flag"] = df["innings_1_powerplay_wkts"].apply(lambda x: 1 if x >= 3 else 0)

# Final features
features = [
    'venue_code', 'match_type_code',
    'innings_1_powerplay_runs', 'innings_1_powerplay_wkts',
    'batting_team_rating', 'bowling_team_rating',
    'venue_avg_first_innings_score', 'venue_avg_boundaries', 'venue_boundary_std',
    'head_to_head_win_ratio', 'last_5_avg_score',
    'net_team_rating', 'innings_1_powerplay_loss_rate',
    'net_run_rate_pp', 'collapse_flag'
]

# Now extract features and target
X = df[features]
y = df["is_winner"]


In [51]:
#  Save Cleaned CSV
combined_df.to_csv('/content/combined_t20i_and_ipl_matches.csv', index=False)

In [52]:
# === Imports ===
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
import joblib
import numpy as np

# === Shared Features and Input ===
features = [
    'venue_code', 'match_type_code',
    'innings_1_powerplay_runs', 'innings_1_powerplay_wkts',
    'batting_team_rating', 'bowling_team_rating',
    'venue_avg_first_innings_score', 'venue_avg_boundaries', 'venue_boundary_std',
    'head_to_head_win_ratio', 'last_5_avg_score',
    'net_team_rating', 'innings_1_powerplay_loss_rate',
    'net_run_rate_pp', 'collapse_flag'
]

X = df[features]

# ============================================================
# 🎯 1. Predict First Innings Score (Regression)
# ============================================================

y_score = df["innings_1_runs"]
score_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
score_model.fit(X, y_score)

joblib.dump(score_model, "score_model.pkl")


# ============================================================
# 🎯 2. Predict Boundary Count (Regression)
# ============================================================

from xgboost import XGBRegressor

y_boundary = df["innings_1_fours"] + df["innings_1_sixes"]
boundary_model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
boundary_model.fit(X, y_boundary)

joblib.dump(boundary_model, "boundary_model.pkl")


# ============================================================
# 🎯 3. Predict Boundary Bucket (Multi-class Classification)
# ============================================================
from xgboost import XGBClassifier

df_bucket = df.dropna(subset=['boundary_bucket'])
X_bucket = df_bucket[features]
y_bucket = df_bucket['boundary_bucket']

bucket_encoder = LabelEncoder()
y_bucket_encoded = bucket_encoder.fit_transform(y_bucket)

bucket_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

bucket_model.fit(X_bucket, y_bucket_encoded)

joblib.dump(bucket_model, 'bucket_model.pkl')
joblib.dump(bucket_encoder, 'bucket_label_encoder.pkl')


# ============================================================
# 🎯 4. Predict Match Outcome (Binary Classification)
# ============================================================

df_win = df.dropna(subset=['is_winner'])  # Safe filter
X_win = df_win[features]
y_win = df_win['is_winner']

# Scale for better model convergence
scaler = StandardScaler()
X_win_scaled = scaler.fit_transform(X_win)

win_model = RandomForestClassifier(n_estimators=100, random_state=42)
win_model.fit(X_win_scaled, y_win)

joblib.dump(win_model, 'win_model.pkl')
joblib.dump(scaler, 'win_model_scaler.pkl')

Parameters: { "use_label_encoder" } are not used.



['win_model_scaler.pkl']