## ***Cricket Worldcup-23 Prediction***

### Importing all python libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn import model_selection

### Reading raw data

In [2]:
df1 = pd.read_csv("one-day-matches.csv")

In [3]:
df1.head(2)

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard
0,Australia,England,Australia,5 wickets,Melbourne,"Jan 5, 1971",ODI # 1
1,England,Australia,England,6 wickets,Manchester,"Aug 24, 1972",ODI # 2


### Processing raw data and creating ADS

In [4]:
df1.columns = ['team1','team2','winner','margin','ground','match_date','count']
df1.columns

Index(['team1', 'team2', 'winner', 'margin', 'ground', 'match_date', 'count'], dtype='object')

In [5]:
df1["team1"] =df1["team1"].astype("string")
df1["team2"] =df1["team2"].astype("string")
df1["winner"] =df1["winner"].astype("string")
df1["ground"] =df1["ground"].astype("string")
df1["match_date"] =df1["match_date"].astype("string")

In [6]:
df1.dtypes

team1         string
team2         string
winner        string
margin        object
ground        string
match_date    string
count         object
dtype: object

In [7]:
df1['year'] = df1['match_date'].str[-4:]
df1['month'] = df1['match_date'].str[:3]
df1['day'] = df1['match_date'].str[4:6]

In [9]:
df1.head(2)

Unnamed: 0,team1,team2,winner,margin,ground,match_date,count,year,month,day
0,Australia,England,Australia,5 wickets,Melbourne,"Jan 5, 1971",ODI # 1,1971,Jan,5
1,England,Australia,England,6 wickets,Manchester,"Aug 24, 1972",ODI # 2,1972,Aug,24


In [10]:
day_processed = ['1,', '2,','3,','4,', '5,', '6,','7,', '8,', '9,']

In [11]:
df1['day'] = np.where(df1['day'].isin(day_processed),df1['day'].str[0],df1['day'])

In [13]:
df1.head(2)

Unnamed: 0,team1,team2,winner,margin,ground,match_date,count,year,month,day
0,Australia,England,Australia,5 wickets,Melbourne,"Jan 5, 1971",ODI # 1,1971,Jan,5
1,England,Australia,England,6 wickets,Manchester,"Aug 24, 1972",ODI # 2,1972,Aug,24


In [14]:
date_del = df1['day'].str.split("-",expand=True)
date_del.head(2)

Unnamed: 0,0,1
0,5,
1,24,


In [15]:
df1['day_updated'] = date_del[0]
df1.head(2)

Unnamed: 0,team1,team2,winner,margin,ground,match_date,count,year,month,day,day_updated
0,Australia,England,Australia,5 wickets,Melbourne,"Jan 5, 1971",ODI # 1,1971,Jan,5,5
1,England,Australia,England,6 wickets,Manchester,"Aug 24, 1972",ODI # 2,1972,Aug,24,24


In [16]:
df1.columns

Index(['team1', 'team2', 'winner', 'margin', 'ground', 'match_date', 'count',
       'year', 'month', 'day', 'day_updated'],
      dtype='object')

In [17]:
df1_f = df1[['team1', 'team2', 'winner', 'margin', 'ground','year', 'month', 'day_updated']]
df1_f.columns = ['team1', 'team2', 'winner', 'margin', 'ground','year', 'month', 'day']

In [18]:
df1_f.day.unique()

array(['5', '24', '26', '28', '11', '18', '20', '7', '30', '31', '13',
       '15', '3', '1', '8', '9', '14', '21', '22', '16', '2', '4', '6',
       '23', '12', '17', '27', '25', '19', '10', '29'], dtype=object)

In [19]:
df1_f['year'] =df1_f['year'].astype(str)
df1_f['month'] =df1_f['month'].astype(str)
df1_f['day'] =df1_f['day'].astype(str)
df1_f.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_f['year'] =df1_f['year'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_f['month'] =df1_f['month'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_f['day'] =df1_f['day'].astype(str)


Unnamed: 0,team1,team2,winner,margin,ground,year,month,day
0,Australia,England,Australia,5 wickets,Melbourne,1971,Jan,5
1,England,Australia,England,6 wickets,Manchester,1972,Aug,24
2,England,Australia,Australia,5 wickets,Lord's,1972,Aug,26
3,England,Australia,England,2 wickets,Birmingham,1972,Aug,28
4,New Zealand,Pakistan,New Zealand,22 runs,Christchurch,1973,Feb,11


In [20]:
df1_f["date"] = df1_f["day"]+"-"+df1_f["month"]+"-"+df1_f["year"]
df1_f.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_f["date"] = df1_f["day"]+"-"+df1_f["month"]+"-"+df1_f["year"]


Unnamed: 0,team1,team2,winner,margin,ground,year,month,day,date
0,Australia,England,Australia,5 wickets,Melbourne,1971,Jan,5,5-Jan-1971
1,England,Australia,England,6 wickets,Manchester,1972,Aug,24,24-Aug-1972
2,England,Australia,Australia,5 wickets,Lord's,1972,Aug,26,26-Aug-1972
3,England,Australia,England,2 wickets,Birmingham,1972,Aug,28,28-Aug-1972
4,New Zealand,Pakistan,New Zealand,22 runs,Christchurch,1973,Feb,11,11-Feb-1973


In [21]:
df1_f['date'] = pd.to_datetime(df1_f['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_f['date'] = pd.to_datetime(df1_f['date'])


In [22]:
df1_f.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4621 entries, 0 to 4620
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   team1   4621 non-null   string        
 1   team2   4621 non-null   string        
 2   winner  4621 non-null   string        
 3   margin  4621 non-null   object        
 4   ground  4621 non-null   string        
 5   year    4621 non-null   object        
 6   month   4621 non-null   object        
 7   day     4621 non-null   object        
 8   date    4621 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(4), string(4)
memory usage: 325.0+ KB


In [24]:
print(df1_f.shape)
df1_f = df1_f[df1_f['year']>="2007"]
print(df1_f.shape)

(4621, 9)
(2152, 9)


In [25]:
df1_f.year.unique()

array(['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023'], dtype=object)

In [26]:
dfv1 = df1_f[['team1', 'team2', 'winner']]

In [27]:
wc23_team = ['England', 'Pakistan', 'Bangladesh', 'South Africa', 'India',
       'New Zealand', 'Australia', 'Netherlands', 'Afghanistan','Sri Lanka']
print(dfv1.shape)
dfv1 = dfv1[dfv1['winner'].isin(wc23_team)]
print(dfv1.shape)

(2152, 3)
(1518, 3)


In [28]:
dfv1.head()

Unnamed: 0,team1,team2,winner
2469,New Zealand,Sri Lanka,New Zealand
2470,New Zealand,Sri Lanka,Sri Lanka
2472,Australia,England,Australia
2473,Australia,New Zealand,Australia
2474,England,New Zealand,England


In [29]:
dfv1 = dfv1.reset_index(drop=True)
dfv1.head()

Unnamed: 0,team1,team2,winner
0,New Zealand,Sri Lanka,New Zealand
1,New Zealand,Sri Lanka,Sri Lanka
2,Australia,England,Australia
3,Australia,New Zealand,Australia
4,England,New Zealand,England


In [30]:
df = pd.get_dummies(dfv1, prefix=['team1', 'team2'], columns=['team1', 'team2'])
X = df.drop(['winner'], axis=1)
y = df["winner"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [31]:
print(df.shape)
df.head(2)

(1518, 44)


Unnamed: 0,winner,team1_Afghanistan,team1_Australia,team1_Bangladesh,team1_Bermuda,team1_Canada,team1_England,team1_Hong Kong,team1_India,team1_Ireland,...,team2_New Zealand,team2_Oman,team2_Pakistan,team2_Scotland,team2_South Africa,team2_Sri Lanka,team2_U.A.E.,team2_U.S.A.,team2_West Indies,team2_Zimbabwe
0,New Zealand,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,Sri Lanka,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Building and Training Random Forest model

In [32]:
# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': np.arange(50, 500, 50),
    'max_depth': [None] + list(np.arange(10, 110, 10)),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'bootstrap': [True, False]
}

# Create a Random Forest classifier
rf = RandomForestClassifier()

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=100,  # Number of parameter settings that are sampled
    scoring='accuracy',  # Use accuracy as the metric to evaluate the model
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores
    random_state=42
)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:")
print(random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

Best Hyperparameters:
{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_depth': 80, 'bootstrap': True}


In [33]:
best_model

### Checking train and test dataset accuracy

In [34]:
train_accuracy = best_model.score(X_train, y_train)
test_accuracy = best_model.score(X_test, y_test)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 0.763653483992467
Test Accuracy: 0.7456140350877193


### Forecasting odi world cup-23 group stage match results

In [35]:
df2 = pd.read_csv("odi_wc23_schedule.csv")
df2['Team A'] = np.where(df2['Team A']=="Q1","Netherlands",df2['Team A'])
df2['Team B'] = np.where(df2['Team B']=="Q1","Netherlands",df2['Team B'])
df2.head(2)

Unnamed: 0,Date,Team A,Team B,Venue,Time IST,Winner Team,Player of The Match
0,"Thursday, 05 October",England,New Zealand,"Narendra Modi Stadium, Ahmedabad",2:00 P.M,,
1,"Friday, 06 October",Pakistan,Netherlands,"Rajiv Gandhi International Stadium, Hyderabad",2:00 P.M,,


In [36]:
df3 = pd.read_csv("odi_team_ranking.csv",encoding= 'unicode_escape')
df3.head(2)

Unnamed: 0,POS,TEAM,MATCHES,POINTS,RATING
0,1,Australia,23,2714,118
1,2,Pakistan,20,2316,116


In [37]:
df2 = df2[['Team A', 'Team B']]
df2.columns = ['team1','team2']
df2.head(2)

Unnamed: 0,team1,team2
0,England,New Zealand
1,Pakistan,Netherlands


In [38]:
df3 = df3[['POS','TEAM']]
df3.columns = ['ranking','team']
df3.head(2)

Unnamed: 0,ranking,team
0,1,Australia
1,2,Pakistan


In [39]:
df3['team'] = df3['team'].astype("string")
df3.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ranking  19 non-null     int64 
 1   team     19 non-null     string
dtypes: int64(1), string(1)
memory usage: 432.0 bytes


In [40]:
df2['team1'] = df2['team1'].astype("string")
df2['team2'] = df2['team2'].astype("string")
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   team1   45 non-null     string
 1   team2   45 non-null     string
dtypes: string(2)
memory usage: 848.0 bytes


In [41]:
print(df2.shape)
df2= pd.merge(df2,df3,left_on='team1',right_on='team',how='left')
print(df2.shape)
df2.head(2)

(45, 2)
(45, 4)


Unnamed: 0,team1,team2,ranking,team
0,England,New Zealand,5,England
1,Pakistan,Netherlands,2,Pakistan


In [42]:
df2.columns = ['team1','team2','team1_ranking','team']
df2.head(2)

Unnamed: 0,team1,team2,team1_ranking,team
0,England,New Zealand,5,England
1,Pakistan,Netherlands,2,Pakistan


In [43]:
df2 = df2[['team1','team2','team1_ranking']]

In [44]:
print(df2.shape)
df2= pd.merge(df2,df3,left_on='team2',right_on='team',how='left')
print(df2.shape)
df2.head(2)

(45, 3)
(45, 5)


Unnamed: 0,team1,team2,team1_ranking,ranking,team
0,England,New Zealand,5,4,New Zealand
1,Pakistan,Netherlands,2,14,Netherlands


In [45]:
df2.columns = ['team1','team2','team1_ranking','team2_ranking','team']
df2 = df2[['team1','team2','team1_ranking','team2_ranking']]
df2.head(2)

Unnamed: 0,team1,team2,team1_ranking,team2_ranking
0,England,New Zealand,5,4
1,Pakistan,Netherlands,2,14


In [46]:
df2['flag'] = np.where(df2['team1_ranking']<df2['team2_ranking'],"first","second")
df2.head(2)

Unnamed: 0,team1,team2,team1_ranking,team2_ranking,flag
0,England,New Zealand,5,4,second
1,Pakistan,Netherlands,2,14,first


In [47]:
df2['Team1'] = np.where(df2['flag']=="first",df2['team1'],df2['team2'])
df2['Team2'] = np.where(df2['flag']=="second",df2['team1'],df2['team2'])
df2.columns = ['Team1','Team2','team1_ranking','team2_ranking','flag','team1','team2']
df2.head()

Unnamed: 0,Team1,Team2,team1_ranking,team2_ranking,flag,team1,team2
0,England,New Zealand,5,4,second,New Zealand,England
1,Pakistan,Netherlands,2,14,first,Pakistan,Netherlands
2,Bangladesh,Afghanistan,7,8,first,Bangladesh,Afghanistan
3,South Africa,Sri Lanka,6,9,first,South Africa,Sri Lanka
4,India,Australia,3,1,second,Australia,India


In [48]:
wc23_match = df2[['team1','team2']]
wc23_match.columns = ['team1','team2']
wc23_match['winner'] = ""
backup_pred_set = wc23_match
wc23_match.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wc23_match['winner'] = ""


Unnamed: 0,team1,team2,winner
0,New Zealand,England,
1,Pakistan,Netherlands,
2,Bangladesh,Afghanistan,
3,South Africa,Sri Lanka,
4,Australia,India,


In [49]:
df_forecast = pd.get_dummies(wc23_match, prefix=['team1', 'team2'], columns=['team1', 'team2'])
# Add missing columns compared to the model's training dataset
missing_cols = set(df.columns) - set(df_forecast.columns)
missing_cols
for c in missing_cols:
    df_forecast[c] = 0
df_forecast.head()
df_forecast = df_forecast[df.columns]


df_forecast = df_forecast.drop(['winner'], axis=1)
df_forecast.head(2)

Unnamed: 0,team1_Afghanistan,team1_Australia,team1_Bangladesh,team1_Bermuda,team1_Canada,team1_England,team1_Hong Kong,team1_India,team1_Ireland,team1_Kenya,...,team2_New Zealand,team2_Oman,team2_Pakistan,team2_Scotland,team2_South Africa,team2_Sri Lanka,team2_U.A.E.,team2_U.S.A.,team2_West Indies,team2_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
#group matches
predictions = best_model.predict(df_forecast)
for i in range(df2.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 1:
        print("Winner: " + backup_pred_set.iloc[i, 1])

    else:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print("")

England and New Zealand
Winner: New Zealand

Netherlands and Pakistan
Winner: Pakistan

Afghanistan and Bangladesh
Winner: Bangladesh

Sri Lanka and South Africa
Winner: South Africa

India and Australia
Winner: Australia

Netherlands and New Zealand
Winner: New Zealand

Bangladesh and England
Winner: England

Afghanistan and India
Winner: India

Sri Lanka and Pakistan
Winner: Pakistan

South Africa and Australia
Winner: Australia

Bangladesh and New Zealand
Winner: New Zealand

Afghanistan and England
Winner: England

India and Pakistan
Winner: Pakistan

Sri Lanka and Australia
Winner: Australia

Netherlands and South Africa
Winner: South Africa

Afghanistan and New Zealand
Winner: New Zealand

Bangladesh and India
Winner: India

Pakistan and Australia
Winner: Australia

Netherlands and Sri Lanka
Winner: Sri Lanka

South Africa and England
Winner: England

New Zealand and India
Winner: India

Afghanistan and Pakistan
Winner: Pakistan

Bangladesh and South Africa
Winner: South Africa



In [51]:
final_prediction = backup_pred_set

In [52]:
for i in range(len(final_prediction)):
  final_prediction['winner'][i] = predictions[i]

final_prediction.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_prediction['winner'][i] = predictions[i]


Unnamed: 0,team1,team2,winner
0,New Zealand,England,New Zealand
1,Pakistan,Netherlands,Pakistan
2,Bangladesh,Afghanistan,Bangladesh
3,South Africa,Sri Lanka,South Africa
4,Australia,India,Australia


In [53]:
final_prediction['flag'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_prediction['flag'] = 1


### Checking number of wins by respective teams

In [54]:
win_count = final_prediction.groupby(['winner'],as_index=False).agg({'flag':'sum'})
win_count.columns = ['team','win_count']
win_count = win_count.sort_values(by = 'win_count',ascending=False)
win_count

Unnamed: 0,team,win_count
1,Australia,9
4,India,8
6,Pakistan,7
5,New Zealand,6
3,England,5
7,South Africa,4
8,Sri Lanka,3
2,Bangladesh,2
0,Afghanistan,1


### Forecasting odi world cup-23 semi-final match results

In [55]:
semifinal_team = {'team1':['Australia','Pakistan'],'team2':['New Zealand','India'],'winner':['','']}
semifinal_df = pd.DataFrame(semifinal_team)


wc23_semifinal = semifinal_df

semifinal_forecast = pd.get_dummies(wc23_semifinal, prefix=['team1', 'team2'], columns=['team1', 'team2'])
# Add missing columns compared to the model's training dataset
missing_cols = set(df.columns) - set(semifinal_forecast.columns)
missing_cols
for c in missing_cols:
    semifinal_forecast[c] = 0
semifinal_forecast.head()
semifinal_forecast = semifinal_forecast[df.columns]


semifinal_forecast = semifinal_forecast.drop(['winner'], axis=1)
semifinal_forecast.head()

Unnamed: 0,team1_Afghanistan,team1_Australia,team1_Bangladesh,team1_Bermuda,team1_Canada,team1_England,team1_Hong Kong,team1_India,team1_Ireland,team1_Kenya,...,team2_New Zealand,team2_Oman,team2_Pakistan,team2_Scotland,team2_South Africa,team2_Sri Lanka,team2_U.A.E.,team2_U.S.A.,team2_West Indies,team2_Zimbabwe
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
semifinal_df
semifinal_df.shape[0]

2

In [57]:
#group matches
semi_predictions = best_model.predict(semifinal_forecast)
for i in range(semifinal_df.shape[0]):
    print(semifinal_df.iloc[i, 1] + " and " + semifinal_df.iloc[i, 0])
    if predictions[i] == 1:
        print("Winner: " + semifinal_df.iloc[i, 1])

    else:
        print("Winner: " + semifinal_df.iloc[i, 0])
    print("")

New Zealand and Australia
Winner: Australia

India and Pakistan
Winner: Pakistan



### Forecasting odi world cup-23 final match results

In [58]:
final_team = {'team1':['Australia'],'team2':['Pakistan'],'winner':['']}
final_df = pd.DataFrame(final_team)


wc23_final = final_df

final_forecast = pd.get_dummies(wc23_final, prefix=['team1', 'team2'], columns=['team1', 'team2'])
# Add missing columns compared to the model's training dataset
missing_cols = set(df.columns) - set(final_forecast.columns)
missing_cols
for c in missing_cols:
    final_forecast[c] = 0
final_forecast.head()
final_forecast = final_forecast[df.columns]


final_forecast = final_forecast.drop(['winner'], axis=1)
final_forecast.head()

#group matches
final_predictions = best_model.predict(final_forecast)
for i in range(final_df.shape[0]):
    print(final_df.iloc[i, 1] + " and " + final_df.iloc[i, 0])
    if predictions[i] == 1:
        print("Winner: " + final_df.iloc[i, 1])

    else:
        print("Winner: " + final_df.iloc[i, 0])
    print("")

Pakistan and Australia
Winner: Australia

