# Dataset Information.
**HWD.csv:** It contains historical world cup statistics data, that shows the total win percentage of a particular team, total titles obtained, total losses, etc.
**results.csv:**It shows the teams overall performance since the year 2015 while playing against the rival teams. This will help the model understand how, consistent a team is.

Data Collection

In [None]:
# Extracting the world cup 2023 match data.
!pip install extract-wc-data

In [None]:
from ExtractWCData.get_latest_data import GetData
#extracting data and saving it as a csv
data = GetData()
df = data.get_data()
df.to_csv('WC_data.csv')

In [None]:
#importing the required libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Loading the CSV datasets
world_cup=pd.read_csv("HWD.csv")
# world_cup.head()

In [None]:
results = pd.read_csv("results.csv")
results.head()

Unnamed: 0,Date,Team_1,Team_2,Winner,Margin,Ground
0,17-04-2015,Bangladesh,Pakistan,Bangladesh,won by 79 runs,Shere Bangla National Stadium
1,19-04-2015,Bangladesh,Pakistan,Bangladesh,won by 7 wickets,Shere Bangla National Stadium
2,22-04-2015,Bangladesh,Pakistan,Bangladesh,won by 8 wickets,Shere Bangla National Stadium
3,08-05-2015,Ireland,England,No result,No result,The Village
4,26-05-2015,Pakistan,Zimbabwe,Pakistan,won by 41 runs,Gaddafi Stadium


In [None]:
latest = pd.read_csv('WC_data.csv')
latest.head()

Unnamed: 0.1,Unnamed: 0,Date,Team_1,Team_2,Winner,Margin,Ground
0,0,05/10/2023,England,New Zealand,New Zealand,9 wickets,"Narendra Modi Stadium, Ahmedabad"
1,1,06/10/2023,Netherlands,Pakistan,Pakistan,81 runs,"Rajiv Gandhi International Stadium, Hyderabad"
2,2,07/10/2023,Afghanistan,Bangladesh,Bangladesh,6 wickets,"Himachal Pradesh Cricket Association Stadium, ..."
3,3,07/10/2023,South Africa,Sri Lanka,South Africa,102 runs,"Arun Jaitley Stadium, Delhi"
4,4,08/10/2023,India,Australia,India,6 wickets,"MA Chidambaram Stadium, Chennai"


In [None]:
#drop the unnamed row
latest.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [None]:
# We will combine the data of WC_data and results so that we have upto date data.
results = pd.concat([results, latest], axis=0)
results = results.reset_index(drop = True)
# results.tail(5)

In [None]:
results.drop(columns=['Date', 'Margin', 'Ground'], axis=1, inplace=True)

In [None]:
results

Unnamed: 0,Team_1,Team_2,Winner
0,Bangladesh,Pakistan,Bangladesh
1,Bangladesh,Pakistan,Bangladesh
2,Bangladesh,Pakistan,Bangladesh
3,Ireland,England,No result
4,Pakistan,Zimbabwe,Pakistan
...,...,...,...
849,New Zealand,Sri Lanka,New Zealand
850,Afghanistan,South Africa,South Africa
851,Australia,Bangladesh,Australia
852,England,Pakistan,England


In [None]:
playing_teams=['England','South Africa','Netherlands','Pakistan','Australia','India','Bangladesh','Sri Lanka','Afghanistan']

In [None]:
team1_new=results[results['Team_1'].isin(playing_teams)]
team2_new=results[results['Team_2'].isin(playing_teams)]
winners_new=results[results['Winner'].isin(playing_teams)]

df_team= pd.concat((team1_new, team2_new, winners_new), axis=0)
df_team.head(5)

Unnamed: 0,Team_1,Team_2,Winner
8,England,New Zealand,New Zealand
11,Bangladesh,India,Bangladesh
29,South Africa,New Zealand,South Africa
35,England,Australia,England
53,Sri Lanka,West Indies,Sri Lanka


In [None]:
#we want winner column to be numerical not string 2 for team2 and 1 for team 1
df_team.loc[:,'Winning'] = np.where(df_team['Winner']==df_team['Team_1'],1,2)
df_team.head()

Unnamed: 0,Team_1,Team_2,Winner,Winning
8,England,New Zealand,New Zealand,2
11,Bangladesh,India,Bangladesh,2
29,South Africa,New Zealand,South Africa,1
35,England,Australia,England,1
53,Sri Lanka,West Indies,Sri Lanka,1


In [None]:
#we no more require winner col.
df_team.drop(columns=['Winner'], axis=1, inplace=True)
df_team.head()

Unnamed: 0,Team_1,Team_2,Winning
8,England,New Zealand,2
11,Bangladesh,India,2
29,South Africa,New Zealand,1
35,England,Australia,1
53,Sri Lanka,West Indies,1


Data Transformation

In [None]:
#One hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

#Apply the encoding
df_team= pd.get_dummies(df_team, prefix=['Team_1', 'Team_2'], columns = ['Team_1', 'Team_2'], dtype=int, sparse=False)
df_team.head()

Unnamed: 0,Winning,Team_1_Afghanistan,Team_1_Afghanistan.1,Team_1_Australia,Team_1_Australia.1,Team_1_Bangladesh,Team_1_Bangladesh.1,Team_1_England,Team_1_England.1,Team_1_Hong Kong,...,Team_2_Netherlands,Team_2_New Zealand,Team_2_Oman,Team_2_Pakistan,Team_2_Scotland,Team_2_South Africa,Team_2_Sri Lanka,Team_2_United Arab Emirates,Team_2_West Indies,Team_2_Zimbabwe
8,2,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
11,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
35,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
53,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
input = df_team.drop(columns=['Winning'], axis=1)
output = df_team['Winning']

In [None]:
# Splitting the data in the training and testing set
x_train, x_test, y_train, y_test = train_test_split(input,output, test_size=0.2, random_state=34)

In [None]:
#model building
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [None]:
classifiers = {
    'Random Forest' : RandomForestClassifier(),
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier()
}
for name, clf in classifiers.items():
  pipeline = Pipeline([('classifier',clf)])

  pipeline.fit(x_train, y_train)

  #Make Predictions
  y_pred = pipeline.predict(x_test)

  #Calculate the accuracy
  acc = accuracy_score(y_test, y_pred)
  print(f'{name}: ')
  print(f"Accuracy : {acc:.4f}")

Random Forest: 
Accuracy : 0.8164
Logistic Regression: 
Accuracy : 0.7738
Decision Tree: 
Accuracy : 0.8197


In [None]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
predictions = rf.predict(x_test)

In [None]:
print(predictions)

[2 2 1 1 2 1 2 1 1 1 2 2 1 1 2 1 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 1 2
 2 2 2 1 1 2 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 2 1 1 1 2 2 2 1 1 2 2 1 1 2 1 1
 2 2 1 1 2 1 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 1 2 2 2
 2 2 1 1 1 2 1 2 1 2 1 1 2 2 1 2 1 2 2 2 2 1 1 2 2 1 1 2 2 1 1 1 2 1 1 1 2
 2 2 1 2 2 2 1 2 2 2 2 1 2 1 2 1 2 1 2 2 1 2 2 2 2 2 2 1 2 1 2 2 2 2 2 1 1
 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 1 2 1 1 2 2 1 1 2 2 2 1 1 2 2 2 1 1 1 1 1
 1 1 2 2 2 2 2 2 1 1 1 1 1 1 2 1 2 2 1 2 1 2 1 2 2 2 1 1 2 2 2 2 1 1 1 2 1
 2 2 2 1 1 2 2 2 2 2 1 2 2 1 2 1 2 2 1 1 1 1 1 1 2 1 2 2 2 2 1 1 1 1 1 2 1
 1 1 2 2 2 2 1 2 1]


In [None]:
rankings = pd.read_csv('Icc_ranking.csv')
fixtures = pd.read_csv("Fixtures.csv")

In [None]:
pred_set=[] #initialize an empty df.

fixtures.insert(1, 'first_position', fixtures['Team_1'].map(rankings.set_index('Team_name')['Team_ranking']))
fixtures.insert(2, 'second_position', fixtures['Team_2'].map(rankings.set_index('Team_name')['Team_ranking']))

fixtures = fixtures.iloc[:150, :]
fixtures.head()

Unnamed: 0,Round_number,first_position,second_position,Team_1,Team_2,Date,Location,Group,Result
0,1,5.0,,England,New Zealand,5/10/2023,"Narendra Modi Stadium, Ahmedabad",Group A,
1,1,,10.0,Pakistan,Netherlands,6/10/2023,"Rajiv Gandhi International Stadium, Hyderabad",Group A,
2,1,7.0,8.0,Bangladesh,Afghanistan,7/10/2023,"Himachal Pradesh Cricket Association Stadium, ...",Group A,
3,1,6.0,9.0,South Africa,Sri Lanka,7/10/2023,"Arun Jaitley Stadium, Delhi",Group A,
4,1,3.0,1.0,India,Australia,8/10/2023,"MA Chidambaram Stadium, Chennai",Group A,


In [None]:
fixtures.drop(columns=[''],axis=0,inplace=True)

In [None]:
for index, row in fixtures.iterrows():
  if row['first_position'] < row['second_position']:
    pred_set.append({'Team_1' :row['Team_1'] , 'Team_2' :row['Team_2'] , 'Winning_team' : None })
  else:
    pred_set.append({'Team_1' :row['Team_2'] , 'Team_2' :row['Team_1'] , 'Winning_team' : None })

pred_set = pd.DataFrame(pred_set)

pred_set.head()

Unnamed: 0,Team_1,Team_2,Winning_team
0,New Zealand,England,
1,Netherlands,Pakistan,
2,Bangladesh,Afghanistan,
3,South Africa,Sri Lanka,
4,Australia,India,


In [None]:
pred_set_backup=pred_set

In [None]:
pred_set = pd.get_dummies(pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'], dtype=int)

missing_cols = set(df_team.columns) - set(pred_set.columns)


KeyError: ignored

In [None]:
print(pred_set.iloc[:20,:])

   Winning_team  Team_1_Afghanistan  Team_1_Australia  Team_1_Bangladesh   
0          None                   0                 0                  0  \
1          None                   0                 0                  0   
2          None                   0                 0                  1   
3          None                   0                 0                  0   
4          None                   0                 1                  0   
5          None                   0                 0                  0   
6          None                   0                 0                  0   
7          None                   0                 0                  0   
8          None                   0                 0                  0   
9          None                   0                 1                  0   
10         None                   0                 0                  1   
11         None                   0                 0                  0   
12         N

In [None]:
for cols in missing_cols :
  pred_set[cols] = 0

pred_set = pred_set[df_team.columns]

pred_set = pred_set.drop(['Winning'], axis=1)
pred_set.head()

Unnamed: 0,Team_1_Afghanistan,Team_1_Afghanistan.1,Team_1_Australia,Team_1_Australia.1,Team_1_Bangladesh,Team_1_Bangladesh.1,Team_1_England,Team_1_England.1,Team_1_Hong Kong,Team_1_Hong Kong.1,...,Team_2_Netherlands,Team_2_New Zealand,Team_2_Oman,Team_2_Pakistan,Team_2_Scotland,Team_2_South Africa,Team_2_Sri Lanka,Team_2_United Arab Emirates,Team_2_West Indies,Team_2_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
predictions = rf.predict(pred_set)
for i in range(fixtures.shape[0]):
  print(pred_set_backup.iloc[i,1] + " Vs " + pred_set_backup.iloc[i,0])
  if predictions[i]==1 :
    print('Winner : ' + pred_set_backup.iloc[i,1])
  else :
    print('Winner : ' + pred_set_backup.iloc[i,0])
  print("")

England Vs New Zealand
Winner : New Zealand

Pakistan  Vs Netherlands
Winner : Netherlands

Afghanistan Vs Bangladesh
Winner : Bangladesh

Sri Lanka Vs South Africa
Winner : Sri Lanka

India Vs Australia
Winner : Australia

New Zealand Vs Netherlands
Winner : Netherlands

Bangladesh Vs England
Winner : Bangladesh

Afghanistan Vs India
Winner : Afghanistan

Pakistan  Vs Sri Lanka
Winner : Pakistan 

South Africa Vs Australia
Winner : Australia

New Zealand Vs Bangladesh
Winner : Bangladesh

Afghanistan Vs England
Winner : Afghanistan

India Vs Pakistan
Winner : Pakistan

Sri Lanka Vs Australia
Winner : Sri Lanka

Netherlands Vs South Africa
Winner : Netherlands

New Zealand Vs Afghanistan
Winner : Afghanistan

Bangladesh Vs India
Winner : Bangladesh

Pakistan Vs Australia
Winner : Pakistan

Netherlands Vs Sri Lanka
Winner : Netherlands

South Africa Vs England
Winner : England

India Vs New Zealand
Winner : India

Pakistan  Vs Afghanistan
Winner : Pakistan 

Bangladesh Vs South Africa
W

In [None]:
top_winners = latest['Winner'].value_counts().head(4).index.tolist()

print(f"Top 4 teams : {top_winners}")

Top 4 teams : ['India', 'South Africa', 'Australia', 'New Zealand']


In [None]:
# Predict the single match results of future

def predict_single_match(model, rankings, team_1, team_2):
  single_match_data = pd.DataFrame({
      'Team_1': [team_1],
      'Team_2':[team_2]
  })

  #Insert the team ranking data
  single_match_data.insert(1, 'first_position',single_match_data['Team_1'].map(rankings.set_index("Team_name")['Team_ranking']))
  single_match_data.insert(2, 'second_position',single_match_data['Team_2'].map(rankings.set_index("Team_name")['Team_ranking']))

  # Apply one hot encoding
  single_match_data = pd.get_dummies(single_match_data, prefix = ['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'], dtype=int)

  #Find the missing columns
  missing_cols = set(df_team.columns) - set(single_match_data.columns)

  #Set the missing columns to 0 and then keep only th ecolumns present

  for col in missing_cols:
    single_match_data[col] = 0

  single_match_data = single_match_data[df_team.columns]

  #Drop the winning column
  single_match_data = single_match_data.drop(['Winning'], axis=1)

  #Making the prediction
  prediction = model.predict(single_match_data)

  #print the result
  print(f"{team_1} vs {team_2}")

  if prediction[0] ==1 :
    print(f"Winner: {team_1}")
  else:
    print(f"Winner: {team_2}")

  print((""))

In [None]:
predict_single_match(rf, rankings, "India", "New Zealand")

India vs New Zealand
Winner: India



In [None]:
predict_single_match(rf, rankings, "India", "South Africa")

India vs South Africa
Winner: India

