# #                                              Modelling

In [1]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [2]:
#load data 

world_cup = pd.read_csv('T20TeamStats.csv')
results = pd.read_csv('T20Records.csv')
ranks = pd.read_csv('T20Ranking.csv')

In [3]:
world_cup.head()

Unnamed: 0,Team,Group,Previous Appearances,Previous Titles,Previous Finals,Previous Semifinals,Current Ranking
0,Canada,A,0,0,0,0,23
1,India,A,8,1,2,4,1
2,Ireland,A,7,0,0,0,11
3,Pakistan,A,8,1,3,6,7
4,U.S.A.,A,0,0,0,0,19


In [4]:
# Previous records
results

Unnamed: 0,Date,Team1,Team2,Winner,Venue
0,2/17/2005,New Zealand,Australia,Australia,Auckland
1,6/13/2005,England,Australia,England,Southampton
2,10/21/2005,South Africa,New Zealand,New Zealand,Johannesburg
3,1/9/2006,Australia,South Africa,Australia,Brisbane
4,2/16/2006,New Zealand,West Indies,tied,Auckland
...,...,...,...,...,...
2607,5/12/2024,Bangladesh,Zimbabwe,Zimbabwe,Mirpur
2608,5/12/2024,Japan,Mongolia,Japan,Sano
2609,5/14/2024,Ireland,Pakistan,Pakistan,Dublin
2610,5/18/2024,Netherlands,Scotland,Netherlands,The Hague


In [5]:
#Removing the record with ties
data=results[results.Winner!='tied']

In [6]:
# Dropping Venue column ,Since Only Australia will host the World Cup so home team advantage not possible for other team
data.drop(columns=['Venue'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=['Venue'],inplace=True)


In [7]:
#Dropping the date column
data.drop(columns=['Date'],inplace=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=['Date'],inplace=True)


Unnamed: 0,Team1,Team2,Winner
0,New Zealand,Australia,Australia
1,England,Australia,England
2,South Africa,New Zealand,New Zealand
3,Australia,South Africa,Australia
5,South Africa,Australia,South Africa


In [9]:
#Filtering the records for only teams that will play the world cup
worldcup_teams = ['South Africa','India','England','New Zealand','Bangladesh','Australia','Pakistan','Canada',
                  'Sri Lanka','Ireland','Netherlands','West Indies','Afghanistan','Nepal','Oman','U.S.A.','P.N.G.',
                  'Namibia','Uganda','Scotland']
df_teams_1 = data[data['Team1'].isin(worldcup_teams)]
df_teams_2 = data[data['Team2'].isin(worldcup_teams)]
df_teams = pd.concat((df_teams_1, df_teams_2))
df_teams.drop_duplicates()
df_teams.count()

Team1     2602
Team2     2602
Winner    2602
dtype: int64

In [10]:
#Adding column 'Count': It will record the team which won the most matches from its previous 6 matches (i.e. if India have won 4 matches from its previous 6 matches and Pakistan have won 3 matches than count which select India)
# and 'Rank': The team which is having higher rank will be named in this column
df_teams['Count']=0
df_teams['Rank']=0
df_team= df_teams.reset_index(drop=True) 
df_team

Unnamed: 0,Team1,Team2,Winner,Count,Rank
0,New Zealand,Australia,Australia,0,0
1,England,Australia,England,0,0
2,South Africa,New Zealand,New Zealand,0,0
3,Australia,South Africa,Australia,0,0
4,South Africa,Australia,South Africa,0,0
...,...,...,...,...,...
2597,Ireland,Pakistan,Ireland,0,0
2598,Ireland,Pakistan,Pakistan,0,0
2599,Ireland,Pakistan,Pakistan,0,0
2600,Netherlands,Scotland,Netherlands,0,0


In [15]:
# Assuming df_team and ranks DataFrames are already defined and imported

# Ensure 'Count' and 'Rank' columns exist in the DataFrame
df_team['Count'] = ""
df_team['Rank'] = ""

# Iterate over each row
for i in range(2602):
    dt1 = df_team['Team1'].iloc[i].strip()
    dt2 = df_team['Team2'].iloc[i].strip()
    c1 = c2 = w1 = w2 = 0

    # Check for the next 50 matches
    for j in range(i+1, min(i+51, len(df_team))):
        if ((df_team['Team1'].iloc[j].strip() == dt1) or (df_team['Team2'].iloc[j].strip() == dt1)) and c1 < 6:
            c1 += 1
            if df_team['Winner'].iloc[j].strip() == dt1:
                w1 += 1
        
        if ((df_team['Team1'].iloc[j].strip() == dt2) or (df_team['Team2'].iloc[j].strip() == dt2)) and c2 < 6:
            c2 += 1
            if df_team['Winner'].iloc[j].strip() == dt2:
                w2 += 1

    # Assign 'Count' based on the wins
    df_team.loc[i, 'Count'] = dt2 if w2 > w1 else dt1

    # Retrieve ranks for both teams
    r1 = ranks.loc[ranks['Team'] == dt1, 'Rank'].values[0] if not ranks.loc[ranks['Team'] == dt1, 'Rank'].empty else float('inf')
    r2 = ranks.loc[ranks['Team'] == dt2, 'Rank'].values[0] if not ranks.loc[ranks['Team'] == dt2, 'Rank'].empty else float('inf')

    # Assign 'Rank' based on the comparison
    df_team.loc[i, 'Rank'] = dt1 if r2 > r1 else dt2


In [16]:
data=df_team.iloc[0:2602]
data

Unnamed: 0,Team1,Team2,Winner,Count,Rank
0,New Zealand,Australia,Australia,New Zealand,Australia
1,England,Australia,England,Australia,Australia
2,South Africa,New Zealand,New Zealand,South Africa,South Africa
3,Australia,South Africa,Australia,South Africa,Australia
4,South Africa,Australia,South Africa,South Africa,Australia
...,...,...,...,...,...
2597,Ireland,Pakistan,Ireland,Pakistan,Pakistan
2598,Ireland,Pakistan,Pakistan,Ireland,Pakistan
2599,Ireland,Pakistan,Pakistan,Ireland,Pakistan
2600,Netherlands,Scotland,Netherlands,Netherlands,Netherlands


In [17]:
#Assigning the binary number to data , team1: 0 and team2: 1
for i in range(2602):
    dt1=data['Team1'].iloc[i]
    dt2=data['Team2'].iloc[i]
    data['Team1'].iloc[i]=0
    data['Team2'].iloc[i]=1
    if data['Winner'].iloc[i]==dt1.strip():
        data['Winner'].iloc[i]=data['Team1'].iloc[i]
    else:
        data['Winner'].iloc[i]=data['Team2'].iloc[i]
    
    if data['Count'].iloc[i]==dt1:
        data['Count'].iloc[i]=data['Team1'].iloc[i]
    else:
        data['Count'].iloc[i]=data['Team2'].iloc[i]
    
    if data['Rank'].iloc[i]==dt1:
        data['Rank'].iloc[i]=data['Team1'].iloc[i]
    else:
        data['Rank'].iloc[i]=data['Team2'].iloc[i]
        
    
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Team1'].iloc[i]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Team2'].iloc[i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Winner'].iloc[i]=data['Team2'].iloc[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Count'].iloc[i]=data['Team1'].iloc[i]
A value is trying to be set on 

In [18]:
data

Unnamed: 0,Team1,Team2,Winner,Count,Rank
0,0,1,1,0,1
1,0,1,0,1,1
2,0,1,1,0,0
3,0,1,0,1,0
4,0,1,0,0,1
...,...,...,...,...,...
2597,0,1,0,1,1
2598,0,1,1,0,1
2599,0,1,1,0,1
2600,0,1,0,0,0


In [21]:
#importing the fixture file of upcoming world cup
fixtures = pd.read_csv('T20Fixture.csv')
fixtures

Unnamed: 0,Date,Team1,Team2,Group,Venue,Country,Result
0,6/1/2024,U.S.A.,Canada,Group A,Dallas,U.S.A.,
1,6/5/2024,India,Ireland,Group A,New York,U.S.A.,
2,6/6/2024,U.S.A.,Pakistan,Group A,Dallas,U.S.A.,
3,6/7/2024,Canada,Ireland,Group A,New York,U.S.A.,
4,6/9/2024,India,Pakistan,Group A,New York,U.S.A.,
5,6/11/2024,Pakistan,Canada,Group A,New York,U.S.A.,
6,6/12/2024,U.S.A.,India,Group A,New York,U.S.A.,
7,6/14/2024,U.S.A.,Ireland,Group A,Lauderhill,U.S.A.,
8,6/15/2024,India,Canada,Group A,Lauderhill,U.S.A.,
9,6/16/2024,Pakistan,Ireland,Group A,Lauderhill,U.S.A.,


In [34]:
#selecting the record till League matches
fixtures=fixtures.iloc[0:40]
fixtures

Unnamed: 0,Team1,Team2,Result
0,U.S.A.,Canada,
1,India,Ireland,
2,U.S.A.,Pakistan,
3,Canada,Ireland,
4,India,Pakistan,
5,Pakistan,Canada,
6,U.S.A.,India,
7,U.S.A.,Ireland,
8,India,Canada,
9,Pakistan,Ireland,


In [35]:
#Dropping the Date,Column1 and Venue columns
fixtures.drop(columns=['Date','Group','Venue','Country'],inplace=True)


KeyError: "['Date', 'Group', 'Venue', 'Country'] not found in axis"

In [36]:
#Seperating the output i.e. Winner column for testing model
y=data["Winner"]
y = y.astype(float, errors = 'raise')

In [37]:
#dropping the target column and creating features file
X=data.drop('Winner',axis=1)
X['Team1'] = X.Team1.astype(float)
X['Team2'] = X.Team2.astype(float)
X['Count'] = X.Count.astype(float)
X['Rank'] = X.Rank.astype(float)

In [38]:
#importing libraries for SVM(Support Vector Machine) model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)

In [39]:
model = SVC(kernel = 'linear', C = 1)

In [40]:
model.fit(X_train, y_train)

In [41]:
svm_pred = model.predict(X_test)

In [42]:
#Accuracy score
accuracy = model.score(X_test, y_test)
accuracy

0.6314779270633397

In [43]:
U=fixtures.drop('Result',axis=1)
U['Count']=0
U['Rank']=0

In [44]:
U

Unnamed: 0,Team1,Team2,Count,Rank
0,U.S.A.,Canada,0,0
1,India,Ireland,0,0
2,U.S.A.,Pakistan,0,0
3,Canada,Ireland,0,0
4,India,Pakistan,0,0
5,Pakistan,Canada,0,0
6,U.S.A.,India,0,0
7,U.S.A.,Ireland,0,0
8,India,Canada,0,0
9,Pakistan,Ireland,0,0


In [67]:
#We have assumped from recent performance of teams in qualifying matches that Group A winner : Sri Lanka
#Group A Runner Up : Namibia , Group B winner : West Indies and Group B runner Up : Scotland
  
for i in range(40):
    dt1=U['Team1'].iloc[i]
    dt2=U['Team2'].iloc[i]
    r1=0
    r2=0
    for k in range(40):
        if (dt1.strip()==ranks['Team'].iloc[k]):
            r1=ranks['Rank'].iloc[k]
        
    for k in range(40):
        if (dt2.strip()==ranks['Team'].iloc[k]):
            r2=ranks['Rank'].iloc[k]
        
    if(r2>r1):
        U['Rank'].iloc[i]=dt1
        U['Count'].iloc[i]=dt1
        
    else:
        U['Rank'].iloc[i]=dt2
        U['Count'].iloc[i]=dt2
        

        
    

AttributeError: 'numpy.float64' object has no attribute 'strip'

In [60]:
for i in range(40):
    dt1=U['Team1'].iloc[i]
    dt2=U['Team2'].iloc[i]
    U['Team1'].iloc[i]=0
    U['Team2'].iloc[i]=1
    if U['Count'].iloc[i]==dt1:
        U['Count'].iloc[i]=U['Team1'].iloc[i]
    else:
        U['Count'].iloc[i]=U['Team2'].iloc[i]
    
    if U['Rank'].iloc[i]==dt1:
        U['Rank'].iloc[i]=U['Team1'].iloc[i]
    else:
        U['Rank'].iloc[i]=U['Team2'].iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  U['Team1'].iloc[i]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  U['Team2'].iloc[i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  U['Count'].iloc[i]=U['Team2'].iloc[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  U['Rank'].iloc[i]=U['Team2'].iloc[i]
A value is trying to be set on a copy of a slice fr

In [68]:
U

Unnamed: 0,Team1,Team2,Count,Rank
0,0.0,1.0,1.0,1.0
1,0.0,1.0,1.0,1.0
2,0.0,1.0,1.0,1.0
3,0.0,1.0,1.0,1.0
4,0.0,1.0,1.0,1.0
5,0.0,1.0,1.0,1.0
6,0.0,1.0,1.0,1.0
7,0.0,1.0,1.0,1.0
8,0.0,1.0,1.0,1.0
9,0.0,1.0,1.0,1.0


In [69]:
U['Team1'] = U.Team1.astype(float)
U['Team2'] = U.Team2.astype(float)
U['Count'] = U.Count.astype(float)
U['Rank'] = U.Rank.astype(float)

In [70]:
#Appling SVM model on World Cup schedule
svm_pred = model.predict(U)

In [71]:
#Predicted vales 0: Team1 and 1:Team2
svm_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1.])

In [65]:
#Linking the predicted values with records
print("Results till League matches\n")
for i in range(40):
    if (svm_pred[i]==1.0):
        print(str(i+1)+"."+fixtures['Team1'].iloc[i] + " Vs " + fixtures['Team2'].iloc[i] + " : " + fixtures['Team2'].iloc[i] )
        print()
    else:
        print(str(i+1)+"."+fixtures['Team1'].iloc[i] + " Vs " + fixtures['Team2'].iloc[i] + " : " + fixtures['Team1'].iloc[i] )
        print()

Results till League matches

1.U.S.A. Vs Canada : Canada

2.India Vs Ireland : Ireland

3.U.S.A. Vs Pakistan : Pakistan

4.Canada Vs Ireland : Ireland

5.India Vs Pakistan : Pakistan

6.Pakistan Vs Canada : Canada

7.U.S.A. Vs India : India

8.U.S.A. Vs Ireland : Ireland

9.India Vs Canada : Canada

10.Pakistan Vs Ireland : Ireland

11.Namibia Vs Oman : Oman

12.England Vs Scotland : Scotland

13.Australia Vs Oman : Oman

14.Namibia Vs Scotland : Scotland

15.Australia Vs England : England

16.Oman Vs Scotland : Scotland

17.Australia Vs Namibia : Namibia

18.England Vs Oman : Oman

19.Namibia Vs England : England

20.Australia Vs Scotland : Scotland

21.West Indies Vs P.N.G. : P.N.G.

22.Afghanistan Vs Uganda : Uganda

23.P.N.G. Vs Uganda : Uganda

24.New Zealand Vs Afghanistan : Afghanistan

25.West Indies Vs Uganda : Uganda

26.West Indies Vs New Zealand : New Zealand

27.Afghanistan Vs P.N.G. : P.N.G.

28.New Zealand Vs Uganda : Uganda

29.New Zealand Vs P.N.G. : P.N.G.

30.West In

In [33]:
#Point table after league matches
print("From the Modelling the Point table as follows:\n")
print("Group 1\n")
print("Team\t\t\tWin")
print("Afghanistan\t\t0")
print("Australia\t\t3")
print("England\t\t\t4")
print("New Zealand\t\t4")
print("Group A Winner\t\t2")
print("Group B Runner Up\t2")
print("\nGroup 2\n")
print("Team\t\t\tWin")
print("Bangladesh\t\t0")
print("India\t\t\t5")
print("Pakistan\t\t4")
print("South Africa\t\t3")
print("Group B Winner\t\t2")
print("Group A Runner Up\t1")



From the Modelling the Point table as follows:

Group 1

Team			Win
Afghanistan		0
Australia		3
England			4
New Zealand		4
Group A Winner		2
Group B Runner Up	2

Group 2

Team			Win
Bangladesh		0
India			5
Pakistan		4
South Africa		3
Group B Winner		2
Group A Runner Up	1


In [34]:
V=U.iloc[18:20]
V['Count'].iloc[1]=0.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [35]:
#Predicting the semi-final winners
svm_pred = model.predict(V)

Feature names unseen at fit time:
- Team_1
- Team_2
Feature names seen at fit time, yet now missing:
- Team1
- Team2



In [36]:
svm_pred


array([0., 0.])

In [37]:
print("Semi Finals\n")
print("India(0) Vs England(1) : India")
print("New Zealand(0) Vs Pakistan(1) : New Zealand")

Semi Finals

India(0) Vs England(1) : India
New Zealand(0) Vs Pakistan(1) : New Zealand


In [38]:
#Predicting the Final Winner
V=U.iloc[18:19]
svm_pred = model.predict(V)
svm_pred

Feature names unseen at fit time:
- Team_1
- Team_2
Feature names seen at fit time, yet now missing:
- Team1
- Team2



array([0.])

In [39]:
print("Final\n")
print("India(0) Vs New Zealand(0) : India(0)\n")
print("Probable Winner of World Cup : India")

Final

India(0) Vs New Zealand(0) : India(0)

Probable Winner of World Cup : India
