## Imports

In [None]:
import pandas as pd
import numpy as np
import glob
import io
from tqdm import trange

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)
# pd.set_option('max_colwidth', -1)

In [None]:
matches = pd.read_csv('matches_updated_mens_odi_upto_feb_2025.csv')
deliveries = pd.read_csv('deliveries_updated_mens_odi_upto_feb_2025.csv')

In [None]:
# Create a list to store the playing 11 for each match
playing_11_data = []

# Iterate over each match
for match_id in deliveries['matchId'].unique():
    match_deliveries = deliveries[deliveries['matchId'] == match_id]

    # Get unique teams in the match
    teams = match_deliveries['batting_team'].unique()

    #ensure that we have 2 teams in each match.
    if len(teams) != 2:
        continue

    team1 = teams[0]
    team2 = teams[1]

    # Get the playing 11 for each team
    team1_players = match_deliveries[match_deliveries['batting_team'] == team1]['batsman'].unique().tolist()
    team2_players = match_deliveries[match_deliveries['batting_team'] == team2]['batsman'].unique().tolist()


    team1_bowlers = match_deliveries[match_deliveries['bowling_team'] == team1]['bowler'].unique().tolist()
    team2_bowlers = match_deliveries[match_deliveries['bowling_team'] == team2]['bowler'].unique().tolist()

    team1_non_strikers = match_deliveries[match_deliveries['batting_team'] == team1]['non_striker'].unique().tolist()
    team2_non_strikers = match_deliveries[match_deliveries['batting_team'] == team2]['non_striker'].unique().tolist()



    team1_playing_11 = list(set(team1_players + team1_bowlers + team1_non_strikers))
    team2_playing_11 = list(set(team2_players + team2_bowlers + team2_non_strikers))

    # Append the playing 11 to the list
    playing_11_data.append({'matchId': match_id, 'team1': team1, 'team1_playing_11': team1_playing_11, 'team2': team2, 'team2_playing_11': team2_playing_11})

# Create a DataFrame from the list
playing_11_df = pd.DataFrame(playing_11_data)

# Display the DataFrame
playing_11_df.head()

Unnamed: 0,matchId,team1,team1_playing_11,team2,team2_playing_11
0,64814,New Zealand,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...",India,"[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly..."
1,64815,India,"[Yuvraj Singh, AB Agarkar, R Dravid, J Srinath...",New Zealand,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS..."
2,64816,India,"[Yuvraj Singh, AB Agarkar, R Dravid, J Srinath...",New Zealand,"[SB Styris, NJ Astle, JDP Oram, CD McMillan, K..."
3,64817,New Zealand,"[SE Bond, SB Styris, NJ Astle, CZ Harris, MS S...",India,"[Yuvraj Singh, M Kaif, R Dravid, J Srinath, SC..."
4,65634,Sri Lanka,"[DPMD Jayawardene, MS Atapattu, RP Arnold, M M...",Australia,"[RT Ponting, MG Bevan, JP Maher, ML Hayden, B ..."


In [None]:
matches = pd.merge(matches, playing_11_df, on='matchId', suffixes=('', '_y'))
matches.drop(columns=['team1_y', 'team2_y'], inplace=True)
matches.head()

Unnamed: 0,gender,season,toss_decision,team2,city,neutralvenue,player_of_match1,date3,umpire2,toss_winner,event,date1,winner,team1,reserve_umpire1,venue,date2,reserve_umpire2,winner_wickets,match_referee,balls_per_over,method,match_number,umpire1,eliminator,outcome,player_of_match,winner_runs,date,tv_umpire,reserve_umpire,player_of_match2,matchId,team1_playing_11,team2_playing_11
0,male,2002/03,field,India,Napier,,,,DB Cowie,India,India tour of New Zealand,,New Zealand,New Zealand,,"McLean Park, Napier",,,,RS Madugalle,6,,2.0,EAR de Silva,,,V Sehwag,35.0,2002-12-29,BF Bowden,,,64814,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly..."
1,male,2002/03,bat,New Zealand,Christchurch,,,,DB Cowie,India,India tour of New Zealand,,New Zealand,India,,"Jade Stadium, Christchurch",,,5.0,RS Madugalle,6,,3.0,EAR de Silva,,,DR Tuffey,,2003-01-01,AL Hill,,,64815,"[Yuvraj Singh, AB Agarkar, R Dravid, J Srinath...","[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS..."
2,male,2002/03,field,New Zealand,Queenstown,,,,DB Cowie,New Zealand,India tour of New Zealand,,New Zealand,India,,"Davies Park, Queenstown",,,7.0,RS Madugalle,6,,4.0,EAR de Silva,,,AR Adams,,2003-01-04,AL Hill,,,64816,"[Yuvraj Singh, AB Agarkar, R Dravid, J Srinath...","[SB Styris, NJ Astle, JDP Oram, CD McMillan, K..."
3,male,2002/03,bat,India,Wellington,,,,DJ Harper,New Zealand,India tour of New Zealand,,India,New Zealand,,"Westpac Stadium, Wellington",,,2.0,RS Madugalle,6,,5.0,BF Bowden,,,Z Khan,,2003-01-08,DB Cowie,,,64817,"[SE Bond, SB Styris, NJ Astle, CZ Harris, MS S...","[Yuvraj Singh, M Kaif, R Dravid, J Srinath, SC..."
4,male,2002/03,field,Australia,Sydney,,,,SJA Taufel,Australia,VB Series,,Sri Lanka,Sri Lanka,,Sydney Cricket Ground,,,,CH Lloyd,6,,6.0,RB Tiffin,,,ST Jayasuriya,79.0,2003-01-09,DB Hair,,,65634,"[DPMD Jayawardene, MS Atapattu, RP Arnold, M M...","[RT Ponting, MG Bevan, JP Maher, ML Hayden, B ..."


In [None]:
cols = ['matchId', 'date', 'team1', 'team2', 'winner', 'winner_runs', 'winner_wickets', 'toss_winner', 'player_of_match', 'venue', 'city', 'neutralvenue','team1_playing_11','team2_playing_11']

In [None]:
matches = matches[cols]

In [None]:
matches.tail()

Unnamed: 0,matchId,date,team1,team2,winner,winner_runs,winner_wickets,toss_winner,player_of_match,venue,city,neutralvenue,team1_playing_11,team2_playing_11
1911,1277086,2022-01-13,West Indies,Ireland,Ireland,,5.0,Ireland,AR McBrine,"Sabina Park, Kingston, Jamaica",Kingston,,"[AS Joseph, OF Smith, KA Pollard, SSJ Brooks, ...","[PR Stirling, GJ Delany, WTS Porterfield, AR M..."
1912,1277087,2022-01-16,West Indies,Ireland,Ireland,,2.0,Ireland,AR McBrine,"Sabina Park, Kingston, Jamaica",Kingston,,"[AS Joseph, OF Smith, KA Pollard, SSJ Brooks, ...","[PR Stirling, NA Rock, GJ Delany, WTS Porterfi..."
1913,1294969,2022-01-16,Zimbabwe,Sri Lanka,Sri Lanka,,5.0,Zimbabwe,LD Chandimal,Pallekele International Cricket Stadium,Kandy,,"[TL Chatara, SC Williams, CR Ervine, RW Chakab...","[C Gunasekara, JDF Vandersay, LD Chandimal, N ..."
1914,1294970,2022-01-18,Zimbabwe,Sri Lanka,Zimbabwe,22.0,,Zimbabwe,CR Ervine,Pallekele International Cricket Stadium,Kandy,,"[TL Chatara, SC Williams, CR Ervine, RW Chakab...","[JDF Vandersay, LD Chandimal, N Pradeep, M The..."
1915,1277082,2022-01-19,South Africa,India,South Africa,31.0,,South Africa,HE van der Dussen,"Boland Park, Paarl",Paarl,,"[Q de Kock, T Bavuma, AK Markram, M Jansen, JN...","[R Ashwin, V Kohli, SN Thakur, JJ Bumrah, KL R..."


In [None]:
unique_teams = set(matches['team1']).union(set(matches['team2']))

In [None]:
len(unique_teams)

26

In [None]:
matches.groupby('team1')['team1'].count().sort_values(ascending=False).head()

Unnamed: 0_level_0,team1
team1,Unnamed: 1_level_1
Australia,234
India,225
England,203
Bangladesh,166
Sri Lanka,165


## Team Rating System

In [None]:
team_ratings = {}

In [None]:
matches_list = list(matches.matchId)

In [None]:
def init_rating_dict(unique_teams):
    d = {}
    for t in unique_teams:
        d[t] = {'matches': 10, 'points':1000, 'ratings':100}

    return d

In [None]:
matches['t1_rating_old'] = 0
matches['t2_rating_old'] = 0

matches['t1_rating_new'] = 0
matches['t2_rating_new'] = 0

In [None]:
for i in range(len(matches_list)):

    m = matches_list[i]

    if not team_ratings:
        team_ratings[m] = init_rating_dict(unique_teams)
    else:
        team_ratings[m] = team_ratings[matches_list[i-1]]

    # update for every match
    t1 = matches['team1'][i]
    t2 = matches['team2'][i]
    winner = matches['winner'][i]

    t1_old_matches = team_ratings[m][t1]['matches']
    t1_new_matches = team_ratings[m][t1]['matches'] + 1

    t2_old_matches = team_ratings[m][t2]['matches']
    t2_new_matches = team_ratings[m][t2]['matches'] + 1

    t1_old_points = team_ratings[m][t1]['points']
    t2_old_points = team_ratings[m][t2]['points']

    t1_old_ratings = team_ratings[m][t1]['ratings']
    t2_old_ratings = team_ratings[m][t2]['ratings']

    if t1 == winner:
        t1_new_points = t1_old_points + t2_old_ratings + 50
        t2_new_points = t2_old_points + t1_old_ratings - 50

    elif t2 == winner:
        t2_new_points = t2_old_points + t1_old_ratings + 50
        t1_new_points = t1_old_points + t1_old_ratings - 50

    else:
        t2_new_points = t2_old_points + t1_old_ratings
        t1_new_points = t1_old_points + t1_old_ratings

    t1_new_ratings = t1_new_points/t1_new_matches
    t2_new_ratings = t2_new_points/t2_new_matches

    team_ratings[m][t1]['ratings'] = t1_new_ratings
    team_ratings[m][t2]['ratings'] = t2_new_ratings

    team_ratings[m][t1]['points'] = t1_new_points
    team_ratings[m][t2]['points'] = t2_new_points

    team_ratings[m][t1]['matches'] = t1_new_matches
    team_ratings[m][t2]['matches'] = t2_new_matches

    matches['t1_rating_old'][i] = t1_old_ratings
    matches['t2_rating_old'][i] = t2_old_ratings

    matches['t1_rating_new'][i] = t1_new_ratings
    matches['t2_rating_new'][i] = t2_new_ratings

In [None]:
matches.head()

Unnamed: 0,matchId,date,team1,team2,winner,winner_runs,winner_wickets,toss_winner,player_of_match,venue,city,neutralvenue,team1_playing_11,team2_playing_11,t1_rating_old,t2_rating_old,t1_rating_new,t2_rating_new
0,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545
1,64815,2003-01-01,India,New Zealand,New Zealand,,5.0,India,DR Tuffey,"Jade Stadium, Christchurch",Christchurch,,"[Yuvraj Singh, AB Agarkar, R Dravid, J Srinath...","[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...",95.454545,104.545455,91.287879,107.954545
2,64816,2003-01-04,India,New Zealand,New Zealand,,7.0,New Zealand,AR Adams,"Davies Park, Queenstown",Queenstown,,"[Yuvraj Singh, AB Agarkar, R Dravid, J Srinath...","[SB Styris, NJ Astle, JDP Oram, CD McMillan, K...",91.287879,107.954545,87.441725,110.518648
3,64817,2003-01-08,New Zealand,India,India,,2.0,New Zealand,Z Khan,"Westpac Stadium, Wellington",Wellington,,"[SE Bond, SB Styris, NJ Astle, CZ Harris, MS S...","[Yuvraj Singh, M Kaif, R Dravid, J Srinath, SC...",110.518648,87.441725,106.947219,92.661505
4,65634,2003-01-09,Sri Lanka,Australia,Sri Lanka,79.0,,Australia,ST Jayasuriya,Sydney Cricket Ground,Sydney,,"[DPMD Jayawardene, MS Atapattu, RP Arnold, M M...","[RT Ponting, MG Bevan, JP Maher, ML Hayden, B ...",100.0,100.0,104.545455,95.454545


## Team Recent form

In [None]:
def populate_recent_form(team_matches, team_result_d):

    recent_form_d = {}

    team_matches['recent_form'] = 0.0

    for k, v in team_result_d.items():

        if k <= 5:
            rf = 0
            recent_form_d[k] = 0
        else:
            rf = ((team_result_d[k-1]*5) + (team_result_d[k-2]*4) + (team_result_d[k-3]*3) + (team_result_d[k-4]*2) + (team_result_d[k-5]*1))/(5 + 4 + 3 + 2 + 1)

        recent_form_d[k] = rf

    team_matches['recent_form'] = team_matches['match_no'].map(recent_form_d)

    return team_matches

In [None]:
unique_teams = set(matches['team1']).union(set(matches['team2']))

In [None]:

overall_team_recent_form_d = {}

for team_name in unique_teams:

    team_matches = matches.loc[(matches['team1'] == team_name) | (matches['team2'] == team_name), :]
    team_matches = team_matches[['matchId', 'date', 'team1', 'team2', 'winner']]

    team_matches['played'] = 1
    team_matches['won'] = 0

    team_matches.loc[(team_matches['winner'] == team_name), 'won'] = 1
    team_matches['match_no'] = [i for i in range(1, len(team_matches) + 1)]

    team_result_d = dict(zip(team_matches['match_no'], team_matches['won']))
    team_matches = populate_recent_form(team_matches, team_result_d)
    team_recent_form_d = dict(zip(team_matches['matchId'], team_matches['recent_form']))

    overall_team_recent_form_d[team_name] = team_recent_form_d

In [None]:
matches.head()

Unnamed: 0,matchId,date,team1,team2,winner,winner_runs,winner_wickets,toss_winner,player_of_match,venue,city,neutralvenue,team1_playing_11,team2_playing_11,t1_rating_old,t2_rating_old,t1_rating_new,t2_rating_new
0,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545
1,64815,2003-01-01,India,New Zealand,New Zealand,,5.0,India,DR Tuffey,"Jade Stadium, Christchurch",Christchurch,,"[Yuvraj Singh, AB Agarkar, R Dravid, J Srinath...","[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...",95.454545,104.545455,91.287879,107.954545
2,64816,2003-01-04,India,New Zealand,New Zealand,,7.0,New Zealand,AR Adams,"Davies Park, Queenstown",Queenstown,,"[Yuvraj Singh, AB Agarkar, R Dravid, J Srinath...","[SB Styris, NJ Astle, JDP Oram, CD McMillan, K...",91.287879,107.954545,87.441725,110.518648
3,64817,2003-01-08,New Zealand,India,India,,2.0,New Zealand,Z Khan,"Westpac Stadium, Wellington",Wellington,,"[SE Bond, SB Styris, NJ Astle, CZ Harris, MS S...","[Yuvraj Singh, M Kaif, R Dravid, J Srinath, SC...",110.518648,87.441725,106.947219,92.661505
4,65634,2003-01-09,Sri Lanka,Australia,Sri Lanka,79.0,,Australia,ST Jayasuriya,Sydney Cricket Ground,Sydney,,"[DPMD Jayawardene, MS Atapattu, RP Arnold, M M...","[RT Ponting, MG Bevan, JP Maher, ML Hayden, B ...",100.0,100.0,104.545455,95.454545


In [None]:
matches['t1_recent_form'] = 0
matches['t2_recent_form'] = 0

for i in range(len(matches)):

    mid = matches['matchId'][i]
    t1 = matches['team1'][i]
    t2 = matches['team2'][i]

    matches['t1_recent_form'][i] = overall_team_recent_form_d[t1][mid]
    matches['t2_recent_form'][i] = overall_team_recent_form_d[t2][mid]

In [None]:
matches.tail(10)

Unnamed: 0,matchId,date,team1,team2,winner,winner_runs,winner_wickets,toss_winner,player_of_match,venue,city,neutralvenue,team1_playing_11,team2_playing_11,t1_rating_old,t2_rating_old,t1_rating_new,t2_rating_new,t1_recent_form,t2_recent_form
1906,1275258,2021-09-29,Papua New Guinea,Scotland,Scotland,,4.0,Papua New Guinea,MA Leask,Al Amerat Cricket Ground Oman Cricket (Ministr...,Al Amarat,,"[A Vala, N Vanua, S Bau, D Ravu, H Hiri, G Tok...","[RD Berrington, HG Munsey, MA Leask, KJ Coetze...",63.708568,76.762181,62.319679,77.330587,0.0,0.8
1907,1275259,2021-10-01,Papua New Guinea,Oman,Oman,,3.0,Papua New Guinea,Khawar Ali,Al Amerat Cricket Ground Oman Cricket (Ministr...,Al Amarat,,"[A Vala, N Vanua, S Bau, D Ravu, H Hiri, TP Ur...","[Ayan Khan, Mohammad Nadeem, Sandeep Goud, Nas...",62.319679,101.963815,60.968328,102.347366,0.0,0.466667
1908,1290881,2021-11-26,Namibia,Oman,Namibia,40.0,,Oman,JJ Smit,"Wanderers Cricket Ground, Windhoek",Windhoek,,"[M van Lingen, SJ Baard, MD du Preez, JJ Smit,...","[Ayan Khan, Mohammad Nadeem, Sandeep Goud, Nas...",84.313348,102.347366,87.148099,99.917579,0.466667,0.6
1909,1277073,2021-11-26,South Africa,Netherlands,,,,Netherlands,,"SuperSport Park, Centurion",Centurion,,"[K Verreynne, AL Phehlukwayo, K Zondo, RR Hend...","[VJ Kingma, SJ Myburgh, CN Ackermann, PM Seela...",115.351178,77.406953,115.351178,78.289377,0.4,0.6
1910,1277085,2022-01-08,West Indies,Ireland,West Indies,24.0,,Ireland,SSJ Brooks,"Sabina Park, Kingston, Jamaica",Kingston,,"[AS Joseph, OF Smith, KA Pollard, SSJ Brooks, ...","[L Tucker, A Balbirnie, NA Rock, GJ Delany, WT...",88.284772,75.891351,88.401201,75.536572,0.466667,0.466667
1911,1277086,2022-01-13,West Indies,Ireland,Ireland,,5.0,Ireland,AR McBrine,"Sabina Park, Kingston, Jamaica",Kingston,,"[AS Joseph, OF Smith, KA Pollard, SSJ Brooks, ...","[PR Stirling, GJ Delany, WTS Porterfield, AR M...",88.401201,75.536572,88.24688,76.124092,0.6,0.333333
1912,1277087,2022-01-16,West Indies,Ireland,Ireland,,2.0,Ireland,AR McBrine,"Sabina Park, Kingston, Jamaica",Kingston,,"[AS Joseph, OF Smith, KA Pollard, SSJ Brooks, ...","[PR Stirling, NA Rock, GJ Delany, WTS Porterfi...",88.24688,76.124092,88.093034,76.699303,0.4,0.533333
1913,1294969,2022-01-16,Zimbabwe,Sri Lanka,Sri Lanka,,5.0,Zimbabwe,LD Chandimal,Pallekele International Cricket Stadium,Kandy,,"[TL Chatara, SC Williams, CR Ervine, RW Chakab...","[C Gunasekara, JDF Vandersay, LD Chandimal, N ...",55.229701,100.453012,55.025619,100.464522,0.266667,0.666667
1914,1294970,2022-01-18,Zimbabwe,Sri Lanka,Zimbabwe,22.0,,Zimbabwe,CR Ervine,Pallekele International Cricket Stadium,Kandy,,"[TL Chatara, SC Williams, CR Ervine, RW Chakab...","[JDF Vandersay, LD Chandimal, N Pradeep, M The...",55.025619,100.464522,55.413582,100.235102,0.2,0.8
1915,1277082,2022-01-19,South Africa,India,South Africa,31.0,,South Africa,HE van der Dussen,"Boland Park, Paarl",Paarl,,"[Q de Kock, T Bavuma, AK Markram, M Jansen, JN...","[R Ashwin, V Kohli, SN Thakur, JJ Bumrah, KL R...",115.351178,114.752947,115.500428,114.640671,0.266667,0.6


In [None]:
#team_name = 'India'
#matches.loc[(matches['team1'] == team_name) | (matches['team2'] == team_name), :].tail(10)

##Player Rating System

In [None]:
# Calculate batting average, strike rate, bowling average, and economy for each player
player_stats = deliveries.groupby('batsman').agg(
    total_runs=('batsman_runs', 'sum'),
    total_balls_faced=('ball', 'count'),
    dismissals=('player_dismissed', 'count'),

).reset_index()

player_stats['batting_average'] = player_stats['total_runs'] / player_stats['dismissals'].replace(0,1)  #avoid division by zero
player_stats['batting_strike_rate'] = (player_stats['total_runs'] / player_stats['total_balls_faced']) * 100

player_bowling_stats = deliveries.groupby('bowler').agg(
    total_runs_conceded=('total_runs', 'sum'),
    total_balls_bowled=('ball', 'count'),
    wickets=('isWicket', 'sum'),
).reset_index()

player_bowling_stats['bowling_average'] = player_bowling_stats['total_runs_conceded'] / player_bowling_stats['wickets'].replace(0,1) #avoid division by zero
player_bowling_stats['bowling_economy'] = (player_bowling_stats['total_runs_conceded'] / (player_bowling_stats['total_balls_bowled'] / 6))

# Merge batting and bowling stats
player_stats = pd.merge(player_stats, player_bowling_stats, left_on='batsman', right_on='bowler', how='outer')
player_stats.fillna(0,inplace=True)
player_stats.drop(columns=['total_runs','total_balls_faced','dismissals','total_runs_conceded','total_balls_bowled','wickets'], inplace=True)
player_stats.head()


Unnamed: 0,batsman,batting_average,batting_strike_rate,bowler,bowling_average,bowling_economy
0,A Bagai,27.909091,61.708543,0,0.0,0.0
1,A Balbirnie,34.581818,75.267115,A Balbirnie,31.0,7.75
2,A Bohara,13.0,92.857143,A Bohara,51.0,4.636364
3,A Codrington,5.5,41.772152,A Codrington,28.166667,5.42246
4,A Dananjaya,14.45,70.316302,A Dananjaya,29.836364,5.171218


In [None]:
player_match_stats = []

for index, row in matches.iterrows():
    match_id = row['matchId']
    team1_players = row['team1_playing_11']
    team2_players = row['team2_playing_11']

    # Combine players from both teams
    all_players = team1_players + team2_players

    # Filter player stats for players in the current match
    match_player_stats = player_stats[player_stats['batsman'].isin(all_players)]

    # Add matchId to the player stats
    match_player_stats['matchId'] = match_id

    # Append to the list
    player_match_stats.append(match_player_stats)

# Concatenate the list of DataFrames into a single DataFrame
player_match_stats_df = pd.concat(player_match_stats)




In [None]:
# Now merge this with matches
matches1 = pd.merge(matches, player_match_stats_df, on='matchId', how='left')
matches1.head()

Unnamed: 0,matchId,date,team1,team2,winner,winner_runs,winner_wickets,toss_winner,player_of_match,venue,city,neutralvenue,team1_playing_11,team2_playing_11,t1_rating_old,t2_rating_old,t1_rating_new,t2_rating_new,t1_recent_form,t2_recent_form,batsman,batting_average,batting_strike_rate,bowler,bowling_average,bowling_economy
0,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,A Nehra,4.5,51.724138,A Nehra,29.243478,5.387984
1,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,BB McCullum,31.115152,94.496595,0,0.0,0.0
2,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,CD McMillan,29.026316,81.703704,CD McMillan,29.076923,5.641791
3,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,DL Vettori,20.422535,83.142202,DL Vettori,25.684466,4.000756
4,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,DR Tuffey,11.058824,94.0,DR Tuffey,31.254902,4.909651


##Player Recent Form

In [None]:
def calculate_last_five_matches(player_name, matches_data):
    # Filter matches where the player participated
    player_matches = matches_data[matches_data['team1_playing_11'].apply(lambda x: player_name in x) | matches_data['team2_playing_11'].apply(lambda x: player_name in x)]

    # Sort matches by date in descending order
    player_matches = player_matches.sort_values(by='date', ascending=False)

    # Select the last 5 matches
    last_five_matches = player_matches.head(5)

    return last_five_matches

# Function to calculate stats for the last 5 matches
def calculate_player_stats_last_five(player_name, last_five_matches):

    # Calculate batting stats
    batting_average = last_five_matches['batting_average'].mean()
    batting_strike_rate = last_five_matches['batting_strike_rate'].mean()

    # Calculate bowling stats
    bowling_average = last_five_matches['bowling_average'].mean()
    bowling_economy = last_five_matches['bowling_economy'].mean()

    return batting_average, batting_strike_rate, bowling_average, bowling_economy


# Example usage for a specific player:
player_name = 'VVS Laxman' # Replace with any player's name

last_five_matches = calculate_last_five_matches(player_name, matches1)

if not last_five_matches.empty:
  batting_average, batting_strike_rate, bowling_average, bowling_economy = calculate_player_stats_last_five(player_name, last_five_matches)
  print(f"Player: {player_name}")
  print(f"Batting Average (last 5 matches): {batting_average}")
  print(f"Batting Strike Rate (last 5 matches): {batting_strike_rate}")
  print(f"Bowling Average (last 5 matches): {bowling_average}")
  print(f"Bowling Economy (last 5 matches): {bowling_economy}")

else:
  print(f"Player {player_name} not found in the last five matches or not enough data is available.")


Player: VVS Laxman
Batting Average (last 5 matches): 23.425310256685897
Batting Strike Rate (last 5 matches): 71.90658925447026
Bowling Average (last 5 matches): 23.44472364868036
Bowling Economy (last 5 matches): 4.087440590566653


In [None]:
def calculate_player_stats_last_five_all_players(matches_data):
    all_players = set()
    for _, row in matches_data.iterrows():
        all_players.update(row['team1_playing_11'])
        all_players.update(row['team2_playing_11'])

    player_stats_dict = {}
    for player_name in all_players:
        last_five_matches = calculate_last_five_matches(player_name, matches_data)
        if not last_five_matches.empty:
            batting_average, batting_strike_rate, bowling_average, bowling_economy = calculate_player_stats_last_five(player_name, last_five_matches)
            player_stats_dict[player_name] = {
                'Batting Average': batting_average,
                'Batting Strike Rate': batting_strike_rate,
                'Bowling Average': bowling_average,
                'Bowling Economy': bowling_economy
            }
        else:
            player_stats_dict[player_name] = {
                'Batting Average': 0,
                'Batting Strike Rate': 0,
                'Bowling Average': 0,
                'Bowling Economy': 0
            }
    return player_stats_dict

player_stats_dict = calculate_player_stats_last_five_all_players(matches1)


In [None]:
player_recent_stats = pd.DataFrame.from_dict(player_stats_dict, orient='index')
player_recent_stats.rename(columns={'Batting Average': 'latest_5_batting_average', 'Batting Strike Rate': 'latest_5_batting_strike_rate', 'Bowling Average': 'latest_5_bowling_average', 'Bowling Economy': 'latest_5_bowling_economy'}, inplace=True)
player_recent_stats.rename_axis(['player'], axis=1, inplace=True)
player_recent_stats['player'] = player_recent_stats.index
player_recent_stats.head()

player,latest_5_batting_average,latest_5_batting_strike_rate,latest_5_bowling_average,latest_5_bowling_economy,player.1
HWR Cartwright,34.497571,100.333919,30.356044,4.908805,HWR Cartwright
Ahsan Malik,21.434652,55.659649,42.177778,6.154795,Ahsan Malik
DBL Powell,25.585601,81.472072,30.346266,4.625071,DBL Powell
RK Whelan,22.578073,69.663947,21.449264,3.878566,RK Whelan
Faisal Iqbal,32.80281,75.935794,17.489578,2.964588,Faisal Iqbal


In [None]:
player_last_5_match_stats = []

for index, row in matches.iterrows():
    match_id = row['matchId']
    team1_players = row['team1_playing_11']
    team2_players = row['team2_playing_11']

    # Combine players from both teams
    all_players = team1_players + team2_players

    # Filter player stats for players in the current match
    match_player_stats = player_recent_stats[player_recent_stats['player'].isin(all_players)]

    # Add matchId to the player stats
    match_player_stats['matchId'] = match_id

    # Append to the list
    player_last_5_match_stats.append(match_player_stats)

# Concatenate the list of DataFrames into a single DataFrame
player_match_stats_df = pd.concat(player_last_5_match_stats)


In [None]:
player_match_stats_df.head()

player,latest_5_batting_average,latest_5_batting_strike_rate,latest_5_bowling_average,latest_5_bowling_economy,player.1,matchId
DL Vettori,27.494678,98.414622,21.834457,3.362211,DL Vettori,64814
JDP Oram,29.471451,79.238676,12.478731,1.932608,JDP Oram,64814
SC Ganguly,27.736966,79.541456,29.07352,5.992845,SC Ganguly,64814
SB Bangar,22.746607,71.629259,33.214183,5.068855,SB Bangar,64814
M Kaif,14.463483,64.788661,22.747958,3.840055,M Kaif,64814


In [None]:
# Now merge this with matches
matches = pd.merge(matches, player_match_stats_df, on='matchId', how='left')
matches.head()

Unnamed: 0,matchId,date,team1,team2,winner,winner_runs,winner_wickets,toss_winner,player_of_match,venue,city,neutralvenue,team1_playing_11,team2_playing_11,t1_rating_old,t2_rating_old,t1_rating_new,t2_rating_new,t1_recent_form,t2_recent_form,latest_5_batting_average,latest_5_batting_strike_rate,latest_5_bowling_average,latest_5_bowling_economy,player
0,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,27.494678,98.414622,21.834457,3.362211,DL Vettori
1,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,29.471451,79.238676,12.478731,1.932608,JDP Oram
2,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,27.736966,79.541456,29.07352,5.992845,SC Ganguly
3,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,22.746607,71.629259,33.214183,5.068855,SB Bangar
4,64814,2002-12-29,New Zealand,India,New Zealand,35.0,,India,V Sehwag,"McLean Park, Napier",Napier,,"[NJ Astle, JDP Oram, CD McMillan, KD Mills, MS...","[Yuvraj Singh, R Dravid, J Srinath, SC Ganguly...",100.0,100.0,104.545455,95.454545,0.0,0.0,14.463483,64.788661,22.747958,3.840055,M Kaif




```
# This is formatted as code
```

## Win% Logistic Regression Model

In [None]:
df = matches.copy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df['date'] = pd.to_datetime(df['date'])

In [None]:
df['winner'] = df['winner'].fillna('No result')

In [None]:
# Remove matches with "No result"
df = df[df['winner'] != "No result"]

In [None]:
# Encode Winner: team1 wins (1), team2 wins (0)
df['winner'] = (df['winner'] == df['team1']).astype(int)

# Define a cutoff date to split train and test data
cutoff_date = pd.to_datetime('2022-01-01')
train_df = df[df['date'] < cutoff_date]
test_df = df[df['date'] >= cutoff_date]

# Define features and target
features = ['t1_recent_form', 't2_recent_form', 't1_rating_old', 't2_rating_old']
X_train = train_df[features]
y_train = train_df['winner']
X_test = test_df[features]
y_test = test_df['winner']

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)

In [None]:
# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.50      0.34      0.40        65
           1       0.48      0.64      0.55        61

    accuracy                           0.48       126
   macro avg       0.49      0.49      0.47       126
weighted avg       0.49      0.48      0.47       126



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Function to predict win probabilities
def predict_match_outcome(input_json, model, scaler):
    # Convert JSON input to DataFrame
    input_df = pd.DataFrame([input_json])

    # Define feature columns
    features = ['t1_recent_form', 't2_recent_form', 't1_rating_old', 't2_rating_old']

    # Scale input features
    input_scaled = scaler.transform(input_df[features])

    # Get probability predictions
    probabilities = model.predict_proba(input_scaled)[0]

    # Team 1 win probability = Prob[1], Team 2 win probability = Prob[0]
    return {
        'team1_win_probability': probabilities[1] * 100,
        'team2_win_probability': probabilities[0] * 100
    }

# Example input JSON
input_json = {
    'team1': 'India',
    'team2': 'Australia',
    'venue': 'Nagpur',
    'toss_winner': 'India',
    't1_recent_form': 0.4,
    't2_recent_form': 0.467,
    't1_rating_old': 115,
    't2_rating_old': 122
     #'t1_playing_11': ,
     #'t2_playing_11':
}

# Predicting win probabilities
result = predict_match_outcome(input_json, model, scaler)
print(result)

{'team1_win_probability': 50.56119860375543, 'team2_win_probability': 49.43880139624457}


##Win % XgBoost Model

In [None]:
df = matches.copy()

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df['date'] = pd.to_datetime(df['date'])

In [None]:
df['winner'] = df['winner'].fillna('No result')

In [None]:
# Remove matches with "No result"
df = df[df['winner'] != "No result"]

In [None]:
# Encode Winner: team1 wins (1), team2 wins (0)
df['winner'] = (df['winner'] == df['team1']).astype(int)

# Define a cutoff date to split train and test data
cutoff_date = pd.to_datetime('2022-01-01')
train_df = df[df['date'] < cutoff_date]
test_df = df[df['date'] >= cutoff_date]

# Define features and target
features = ['t1_recent_form', 't2_recent_form', 't1_rating_old', 't2_rating_old']
X_train = train_df[features]
y_train = train_df['winner']
X_test = test_df[features]
y_test = test_df['winner']

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression Model
#model = LogisticRegression()
model = xgb.XGBClassifier(n_estimators=50, max_depth=5, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)

In [None]:
# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.50      0.34      0.40        65
           1       0.48      0.64      0.55        61

    accuracy                           0.48       126
   macro avg       0.49      0.49      0.47       126
weighted avg       0.49      0.48      0.47       126



In [None]:
from sklearn.preprocessing import StandardScaler
#from sklearn.linear_model import LogisticRegression

# Function to predict win probabilities
def predict_match_outcome(input_json, model, scaler):
    # Convert JSON input to DataFrame
    input_df = pd.DataFrame([input_json])

    # Define feature columns
    features = ['t1_recent_form', 't2_recent_form', 't1_rating_old', 't2_rating_old']

    # Scale input features
    input_scaled = scaler.transform(input_df[features])

    # Get probability predictions
    probabilities = model.predict_proba(input_scaled)[0]

    # Team 1 win probability = Prob[1], Team 2 win probability = Prob[0]
    return {
        'team1_win_probability': probabilities[1] * 100,
        'team2_win_probability': probabilities[0] * 100
    }

# Example input JSON
input_json = {
    'team1': 'India',
    'team2': 'Australia',
    'venue': 'Nagpur',
    'toss_winner': 'India',
    't1_recent_form': 0.4,
    't2_recent_form': 0.467,
    't1_rating_old': 115,
    't2_rating_old': 122
}

# Predicting win probabilities
result = predict_match_outcome(input_json, model, scaler)
print(result)

{'team1_win_probability': 65.42355418205261, 'team2_win_probability': 34.57644581794739}


## Win % Random Forest Model


In [None]:
df = matches.copy()

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df['date'] = pd.to_datetime(df['date'])

In [None]:
df['winner'] = df['winner'].fillna('No result')

In [None]:
# Remove matches with "No result"
df = df[df['winner'] != "No result"]

In [None]:
# Encode Winner: team1 wins (1), team2 wins (0)
df['winner'] = (df['winner'] == df['team1']).astype(int)

# Define a cutoff date to split train and test data
cutoff_date = pd.to_datetime('2022-01-01')
train_df = df[df['date'] < cutoff_date]
test_df = df[df['date'] >= cutoff_date]

# Define features and target
features = ['t1_recent_form', 't2_recent_form', 't1_rating_old', 't2_rating_old']
X_train = train_df[features]
y_train = train_df['winner']
X_test = test_df[features]
y_test = test_df['winner']

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression Model
#model = LogisticRegression()
#model = xgb.XGBClassifier(n_estimators=50, max_depth=5, use_label_encoder=False, eval_metric='logloss')
model = RandomForestClassifier(max_depth=4, random_state=0)
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)

In [None]:
# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.50      0.34      0.40        65
           1       0.48      0.64      0.55        61

    accuracy                           0.48       126
   macro avg       0.49      0.49      0.47       126
weighted avg       0.49      0.48      0.47       126



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Function to predict win probabilities
def predict_match_outcome(input_json, model, scaler):
    # Convert JSON input to DataFrame
    input_df = pd.DataFrame([input_json])

    # Define feature columns
    features = ['t1_recent_form', 't2_recent_form', 't1_rating_old', 't2_rating_old']

    # Scale input features
    input_scaled = scaler.transform(input_df[features])

    # Get probability predictions
    probabilities = model.predict_proba(input_scaled)[0]

    # Team 1 win probability = Prob[1], Team 2 win probability = Prob[0]
    return {
        'team1_win_probability': probabilities[1] * 100,
        'team2_win_probability': probabilities[0] * 100
    }

# Example input JSON
input_json = {
    'team1': 'India',
    'team2': 'Australia',
    'venue': 'Nagpur',
    'toss_winner': 'India',
    't1_recent_form': 0.4,
    't2_recent_form': 0.467,
    't1_rating_old': 115,
    't2_rating_old': 122
}

# Predicting win probabilities
result = predict_match_outcome(input_json, model, scaler)
print(result)

{'team1_win_probability': 58.356416250073565, 'team2_win_probability': 41.64358374992645}


## Export Models

In [None]:
import joblib

In [None]:
# Save model as .pkl files
joblib.dump(model, "odi_logistic_regression_model.pkl")

['odi_logistic_regression_model.pkl']

In [None]:
# Save scaler as .pkl files
joblib.dump(scaler, "odi_scaler.pkl")

['odi_scaler.pkl']

In [None]:
team1 = input("Enter Batting Team: {}".format(match_df['team1'].unique()))
team2 = input("Enter Bowling Team: {}".format(match_df['team2'].unique()))
venue = input("Enter Venue: {}".format(match_df['venue'].unique()))
toss_winner = input("Enter Toss Winner: {}".format(df['toss_winner'].unique()))

input_data = pd.DataFrame({
            'team1': [team1], # Changed 'batting_team' to 'team1'
            'team2': [team2], # Changed 'bowling_team' to 'team2'
            'venue': [venue],
            'toss_winner': [toss_winner],
        })

# Convert categorical features to dummy variables
input_data = pd.get_dummies(input_data)

# Reindex the input_data DataFrame to match the training data columns
# This ensures that the input data has the same columns as the data used to train the model
input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

pipe = pickle.load(open('model.pkl', 'rb'))
result = pipe.predict_proba(input_data)

win_probability = round(result[0][1] * 100,2)
loss_probability = 100 - win_probability

print(f"Winning Probability: {win_probability}%")
print(f"Losing Probability: {loss_probability}%")

## Flask API

In [None]:
import io
import glob
import joblib
import numpy as np
import pandas as pd
from tqdm import trange
from flask import Flask, request, jsonify
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

app = Flask(__name__)

# Load pre-trained model and scaler
model = joblib.load("odi_logistic_regression_model.pkl")
scaler = joblib.load("odi_scaler.pkl")

@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get JSON request
        data = request.get_json()

        # Extract features
        features = ['t1_recent_form', 't2_recent_form', 't1_rating_old', 't2_rating_old']
        input_df = pd.DataFrame([data])  # Convert JSON to DataFrame

        # Scale input data
        input_scaled = scaler.transform(input_df[features])

        # Get win probabilities
        probabilities = model.predict_proba(input_scaled)[0]

        # Return probabilities
        return jsonify({
            'team1': data['team1'],
            'team2': data['team2'],
            'team1_win_probability': round(probabilities[1] * 100, 2),
            'team2_win_probability': round(probabilities[0] * 100, 2)
        })

    except Exception as e:
        return jsonify({'error': str(e)})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
