In [466]:
import numpy as np
import pandas as pd
from scipy import stats
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [467]:
df = pd.read_pickle('International_Results.pkl')
df.shape

(40945, 7)

### Extracting the world cup 2018 Data

In [468]:
wc_2018 = df[df['tournament'] == 'FIFA World Cup']

In [469]:
wc_2018 = wc_2018[wc_2018['date'].dt.year == 2018]

List of unique teams in 2018 world cup.
We will use this to filter our original dataset to only show results of these teams.

In [470]:
li = []
for x in wc_2018['home_team'].unique():
    li.append(x)
    
for x in wc_2018['away_team'].unique():
    if (x not in li):
        li.append(x)

print(len(li))

32


In [471]:
wc_2018.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,neutral
39861,2018-06-14,Russia,Saudi Arabia,5,0,FIFA World Cup,False
39862,2018-06-15,Egypt,Uruguay,0,1,FIFA World Cup,True
39863,2018-06-15,Morocco,Iran,0,1,FIFA World Cup,True
39864,2018-06-15,Portugal,Spain,3,3,FIFA World Cup,True
39865,2018-06-16,France,Australia,2,1,FIFA World Cup,True


### Filter features dataset

We will now filter our original data set to only have the results of the teams participating in 2018 world cup and to exclude results from 2018.

We will also exclude friendlies since friendlies are not played with as much seriousness or high intensity compared to other tournaments. This is because there are little to no rewards for winning friendly matches.

In [472]:
trainDF = df[df['date'].dt.year < 2018]
trainDF = trainDF[trainDF['date'].dt.year >= 1995]
trainDF = trainDF[trainDF['tournament'] != 'Friendly']
trainDF = trainDF[trainDF['home_team'].isin(li) | trainDF['away_team'].isin(li)]
trainDF.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,neutral
18667,1995-01-06,Japan,Nigeria,0,3,Confederations Cup,True
18668,1995-01-06,Saudi Arabia,Mexico,0,2,Confederations Cup,False
18669,1995-01-07,Senegal,Tunisia,0,0,African Cup of Nations qualification,False
18677,1995-01-08,Japan,Argentina,1,5,Confederations Cup,True
18678,1995-01-08,Saudi Arabia,Denmark,0,2,Confederations Cup,False


We will now add a home_result column which tells us if the home team won , draw or lost.

In [473]:
cnd = [trainDF['home_score'] == trainDF['away_score'],trainDF['home_score'] < trainDF['away_score'],trainDF['home_score'] > trainDF['away_score']]
choice = [0,-1,1]
trainDF['home_result'] = np.select(cnd,choice,default='Exception')
trainDF.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,neutral,home_result
18667,1995-01-06,Japan,Nigeria,0,3,Confederations Cup,True,-1
18668,1995-01-06,Saudi Arabia,Mexico,0,2,Confederations Cup,False,-1
18669,1995-01-07,Senegal,Tunisia,0,0,African Cup of Nations qualification,False,0
18677,1995-01-08,Japan,Argentina,1,5,Confederations Cup,True,-1
18678,1995-01-08,Saudi Arabia,Denmark,0,2,Confederations Cup,False,-1


In [474]:
trainDF['year'] = trainDF['date'].dt.year

In [475]:
trainDF = trainDF.iloc[:,[1,2,6,7,8]]
trainDF

Unnamed: 0,home_team,away_team,neutral,home_result,year
18667,Japan,Nigeria,True,-1,1995
18668,Saudi Arabia,Mexico,False,-1,1995
18669,Senegal,Tunisia,False,0,1995
18677,Japan,Argentina,True,-1,1995
18678,Saudi Arabia,Denmark,False,-1,1995
...,...,...,...,...,...
39458,Japan,North Korea,False,1,2017
39460,South Korea,China PR,True,0,2017
39469,Japan,China PR,False,1,2017
39470,North Korea,South Korea,True,-1,2017


In [476]:
mia = ["Martinique","Guadeloupe",'Burma','Sao Tome and Principe','Eswatini','East Timor','Brunei','French Guiana']
trainDF = trainDF[~trainDF['home_team'].isin(mia)]
trainDF = trainDF[~trainDF['away_team'].isin(mia)]

### Adding Fifa ranking feature to our dataset

In [477]:
rank = pd.read_csv('fifa_ranking.csv')

In [478]:
tmp = rank.iloc[:,[0,1,15]]
tmp

Unnamed: 0,rank,country_full,rank_date
0,1,Germany,1993-08-08
1,2,Italy,1993-08-08
2,3,Switzerland,1993-08-08
3,4,Sweden,1993-08-08
4,5,Argentina,1993-08-08
...,...,...,...
57788,206,Anguilla,2018-06-07
57789,206,Bahamas,2018-06-07
57790,206,Eritrea,2018-06-07
57791,206,Somalia,2018-06-07


In [479]:
tmp['year'] = tmp['rank_date'].str.split('-').str[0]
tmp = tmp.iloc[:,[0,1,3]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [480]:
tmp['year'] = tmp['year'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [481]:
tmp = tmp.replace('IR Iran','Iran',)
tmp = tmp.replace('Curaçao','Curacao')
tmp = tmp.replace(to_replace='St ',value="Saint ",regex = True)
tmp = tmp.replace('Korea Republic','South Korea')
tmp = tmp.replace('Côte d\'Ivoire','Ivory Coast')
tmp = tmp.replace('USA','United States')
tmp = tmp.replace('Korea DPR','North Korea')
tmp = tmp.replace('Congo DR','DR Congo')
tmp = tmp.replace('FYR Macedonia','North Macedonia')
tmp = tmp.replace('Kyrgyz Republic','Kyrgyzstan')
tmp = tmp.replace('Kyrgyz Republic','Kyrgyzstan')
tmp = tmp.replace('Cape Verde Islands','Cape Verde')

In [482]:
genRank = tmp.groupby(['year','country_full']).agg(lambda x : x.value_counts().index[0])
genRank

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
year,country_full,Unnamed: 2_level_1
1993,Albania,92
1993,Algeria,31
1993,Angola,97
1993,Antigua and Barbuda,127
1993,Argentina,9
...,...,...
2018,Vietnam,102
2018,Wales,21
2018,Yemen,133
2018,Zambia,78


We will add a rank difference column to our main data set

In [483]:
genRank

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
year,country_full,Unnamed: 2_level_1
1993,Albania,92
1993,Algeria,31
1993,Angola,97
1993,Antigua and Barbuda,127
1993,Argentina,9
...,...,...
2018,Vietnam,102
2018,Wales,21
2018,Yemen,133
2018,Zambia,78


In [484]:
t = genRank[genRank.index.get_level_values('country_full') == "Germany"]
t.iloc[0].loc['rank']

1

In [485]:
def rank(r,col):
    try:
        return genRank.loc[(r['year'],r[col])].loc['rank']
        
    except: 
        filt = genRank[genRank.index.get_level_values('country_full') == r[col]]
        y = r['year']
        for x in filt.iterrows():
            if abs(x[0][0] - y) <= 3 :
                return x[1]['rank']
        return filt.iloc[0].loc['rank']
        

In [486]:
trainDF['rank_home'] = trainDF.apply(lambda r: rank(r,'home_team'), axis=1 )
trainDF['rank_away'] = trainDF.apply(lambda r: rank(r,'away_team'), axis=1 )

47
47
47
47
47
47
205
205
59
186
47
47
47
47
47
47
47
47
205
205


In [487]:
trainDF['rank_difference'] = trainDF['rank_home'] - trainDF['rank_away']

In [488]:
trainDF = trainDF.drop(columns=['rank_home','rank_away','year'])
trainDF.head()

Unnamed: 0,home_team,away_team,neutral,home_result,rank_difference
18667,Japan,Nigeria,True,-1,20
18668,Saudi Arabia,Mexico,False,-1,21
18669,Senegal,Tunisia,False,0,22
18677,Japan,Argentina,True,-1,26
18678,Saudi Arabia,Denmark,False,-1,15


In [489]:
trainDF['neutral'] = np.where(trainDF['neutral'] == True , 1 , 0)
trainDF.head()

Unnamed: 0,home_team,away_team,neutral,home_result,rank_difference
18667,Japan,Nigeria,1,-1,20
18668,Saudi Arabia,Mexico,0,-1,21
18669,Senegal,Tunisia,0,0,22
18677,Japan,Argentina,1,-1,26
18678,Saudi Arabia,Denmark,0,-1,15


In [490]:
trainDF = pd.get_dummies(trainDF,columns=['home_team','away_team'])

## Model 

In [491]:
X = trainDF.drop(columns=['home_result'])
Y = trainDF['home_result']

In [492]:
from sklearn.preprocessing import StandardScaler
cols = []
for x in X:
    cols.append(x)
col_arr = np.array(cols)

X = X.to_numpy()
Y = Y.to_numpy()
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [493]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25)

In [658]:
reg = LogisticRegressionCV(cv=3,max_iter=1000,multi_class='multinomial')
reg.fit(X_train,Y_train)

pred = reg.predict(X_test)
score = reg.score(X_train,Y_train)
print("Training accuracy = {} %".format(round(score*100,2)))

test_score = reg.score(X_test,Y_test)
print("Test accuracy = {} %".format(round(test_score*100,2)))

Training accuracy = 63.53 %
Test accuracy = 61.63 %


## Simulation

In [619]:
# Groups
A = ['Russia', 'Saudi Arabia', 'Egypt', 'Uruguay']
B = ['Portugal', 'Spain', 'Morocco', 'Iran']
C = ['France', 'Australia', 'Peru', 'Denmark']
D = ['Argentina', 'Iceland', 'Croatia', 'Nigeria']
E = ['Brazil', 'Switzerland', 'Costa Rica', 'Serbia']
F = ['Germany', 'Mexico', 'Sweden', 'South Korea']
G = ['Belgium', 'Panama', 'Tunisia', 'England']
H = ['Poland', 'Senegal', 'Colombia', 'Japan']

In [594]:
def rank_wc(team):
    try:
        return genRank.loc[(2018,li[x])].loc['rank']

    except: 
        filt = genRank[genRank.index.get_level_values('country_full') == team]
        y = 2018
        return filt.iloc[0].loc['rank']

def sim_group(li):
    results = [0,0,0,0]
    for x in range(4):
        for y in range(x+1,4):
            home_team = "home_team_" +  li[x]
            away_team = "away_team_"+ li[y]
            neutral = True
            if home_team == "Russia":
                neutral = False
            
            home_rank = rank_wc(li[x])
            away_rank = rank_wc(li[y])
            diff = home_rank-away_rank
            
            conv_row = trainDF.drop(columns=['home_result']).iloc[1].copy()
            
            for i in conv_row.iteritems():
                if (i[0] == 'neutral'):
                    if (neutral == True):
                        conv_row['neutral'] = 1
                    else:
                        conv_row['neutral'] = 0
                elif (i[0] == 'rank_difference'):
                    conv_row[i[0]] = diff
                elif (i[0] == home_team):
                    conv_row[i[0]] = 1
                elif (i[0] == away_team):
                    conv_row[i[0]] = 1
                else:
                    conv_row[i[0]] = 0
                
            res = reg.predict(np.array(conv_row).reshape(1,-1))
            str_res = ""
            if res[0] == '-1' : 
                str_res = li[y]
                results[y] += 3
            elif res[0] == '1':
                str_res = li[x]
                results[x] += 3
            else:
                str_res = "draw"
                results[x] += 1
                results[y] += 1
                
            if str_res != "draw":
                print("{} vs {} : {} wins ".format(li[x],li[y],str_res))
            else:
                print("{} vs {} : draw".format(li[x],li[y]))
                
    return results
        
            

### Simulate Group A

In [605]:
g1 = sim_group(A)

Russia vs Saudi Arabia : Russia wins 
Russia vs Egypt : Russia wins 
Russia vs Uruguay : Russia wins 
Saudi Arabia vs Egypt : Egypt wins 
Saudi Arabia vs Uruguay : Uruguay wins 
Egypt vs Uruguay : Uruguay wins 


In [606]:
df = pd.DataFrame({
    'Teams': A,
    'Points': g1
})
df.sort_values(by=['Points'],ascending=False).set_index('Teams')

Unnamed: 0_level_0,Points
Teams,Unnamed: 1_level_1
Russia,9
Uruguay,6
Egypt,3
Saudi Arabia,0


### Simulate Group B

In [607]:
g2 = sim_group(B)

Portugal vs Spain : Spain wins 
Portugal vs Morocco : Portugal wins 
Portugal vs Iran : Portugal wins 
Spain vs Morocco : Spain wins 
Spain vs Iran : Spain wins 
Morocco vs Iran : Morocco wins 


In [609]:
df2 = pd.DataFrame({
    'Teams': B,
    'Points': g2
})
df2.sort_values(by=['Points'],ascending=False).set_index('Teams')

Unnamed: 0_level_0,Points
Teams,Unnamed: 1_level_1
Spain,9
Portugal,6
Morocco,3
Iran,0


### Simulate Group C

In [612]:
g3 = sim_group(C)

France vs Australia : France wins 
France vs Peru : France wins 
France vs Denmark : Denmark wins 
Australia vs Peru : Australia wins 
Australia vs Denmark : Denmark wins 
Peru vs Denmark : Denmark wins 


In [613]:
df3 = pd.DataFrame({
    'Teams': C,
    'Points': g3
})
df3.sort_values(by=['Points'],ascending=False).set_index('Teams')

Unnamed: 0_level_0,Points
Teams,Unnamed: 1_level_1
Denmark,9
France,6
Australia,3
Peru,0


### Simulate Group D

In [614]:
g2 = sim_group(D)

Argentina vs Iceland : Argentina wins 
Argentina vs Croatia : Argentina wins 
Argentina vs Nigeria : Argentina wins 
Iceland vs Croatia : Iceland wins 
Iceland vs Nigeria : Nigeria wins 
Croatia vs Nigeria : Nigeria wins 


In [615]:
df2 = pd.DataFrame({
    'Teams': D,
    'Points': g2
})
df2.sort_values(by=['Points'],ascending=False).set_index('Teams')

Unnamed: 0_level_0,Points
Teams,Unnamed: 1_level_1
Argentina,9
Nigeria,6
Iceland,3
Croatia,0


### Simulate Group E

In [616]:
g2 = sim_group(E)

Brazil vs Switzerland : Brazil wins 
Brazil vs Costa Rica : Brazil wins 
Brazil vs Serbia : Brazil wins 
Switzerland vs Costa Rica : Switzerland wins 
Switzerland vs Serbia : Switzerland wins 
Costa Rica vs Serbia : Costa Rica wins 


In [617]:
df2 = pd.DataFrame({
    'Teams': E,
    'Points': g2
})
df2.sort_values(by=['Points'],ascending=False).set_index('Teams')

Unnamed: 0_level_0,Points
Teams,Unnamed: 1_level_1
Brazil,9
Switzerland,6
Costa Rica,3
Serbia,0


### Simulate Group F

In [620]:
g2 = sim_group(F)

Germany vs Mexico : Germany wins 
Germany vs Sweden : Germany wins 
Germany vs South Korea : Germany wins 
Mexico vs Sweden : Sweden wins 
Mexico vs South Korea : Mexico wins 
Sweden vs South Korea : Sweden wins 


In [622]:
df2 = pd.DataFrame({
    'Teams': F,
    'Points': g2
})
df2.sort_values(by=['Points'],ascending=False).set_index('Teams')

Unnamed: 0_level_0,Points
Teams,Unnamed: 1_level_1
Germany,9
Sweden,6
Mexico,3
South Korea,0


### Simulate Group G

In [643]:
g2 = sim_group(G)

Belgium vs Panama : Belgium wins 
Belgium vs Tunisia : Belgium wins 
Belgium vs England : England wins 
Panama vs Tunisia : Tunisia wins 
Panama vs England : England wins 
Tunisia vs England : England wins 


In [644]:
df2 = pd.DataFrame({
    'Teams': G,
    'Points': g2
})
df2.sort_values(by=['Points'],ascending=False).set_index('Teams')

Unnamed: 0_level_0,Points
Teams,Unnamed: 1_level_1
England,9
Belgium,6
Tunisia,3
Panama,0


### Simulate Group H

In [623]:
g2 = sim_group(H)

Poland vs Senegal : Poland wins 
Poland vs Colombia : Colombia wins 
Poland vs Japan : Poland wins 
Senegal vs Colombia : Colombia wins 
Senegal vs Japan : Japan wins 
Colombia vs Japan : Colombia wins 


In [624]:
df2 = pd.DataFrame({
    'Teams': H,
    'Points': g2
})
df2.sort_values(by=['Points'],ascending=False).set_index('Teams')

Unnamed: 0_level_0,Points
Teams,Unnamed: 1_level_1
Colombia,9
Poland,6
Japan,3
Senegal,0


### Round of 16

In [640]:
def sim_ko(li):
    results = [0,0]
    home_team = "home_team_" +  li[0]
    away_team = "away_team_"+ li[1]
    neutral = True
    if li[0] == "Russia":
        neutral = False

    home_rank = rank_wc(li[0])
    away_rank = rank_wc(li[1])
    diff = home_rank-away_rank

    conv_row = trainDF.drop(columns=['home_result']).iloc[1].copy()

    for i in conv_row.iteritems():
        if (i[0] == 'neutral'):
            if (neutral == True):
                conv_row['neutral'] = 1
            else:
                conv_row['neutral'] = 0
        elif (i[0] == 'rank_difference'):
            conv_row[i[0]] = diff
        elif (i[0] == home_team):
            conv_row[i[0]] = 1
        elif (i[0] == away_team):
            conv_row[i[0]] = 1
        else:
            conv_row[i[0]] = 0

    res = reg.predict_proba(np.array(conv_row).reshape(1,-1))
    res = res[0]
    str_res = ""
    if res[0] > res[2]:
        str_res = li[1]
    else:
        str_res = li[0]
    
    print("{} vs {} : {} wins ".format(li[0],li[1],str_res))
                
        

In [647]:
k1 = ['Russia','Colombia']
k2 = ['Uruguay','Belgium']
k3 = ['Spain','Sweden']
k4 = ['Portugal','Germany']
k5 = ['Denmark','Switzerland']
k6 = ['France','Brazil']
k7 = ['Argentina','Nigeria']
k8 = ['England','Poland']
knockouts = [k1,k2,k3,k4,k5,k6,k7,k8]

In [648]:
for x in knockouts:
    sim_ko(x)

Russia vs Colombia : Russia wins 
Uruguay vs Belgium : Uruguay wins 
Spain vs Sweden : Spain wins 
Portugal vs Germany : Germany wins 
Denmark vs Switzerland : Denmark wins 
France vs Brazil : Brazil wins 
Argentina vs Nigeria : Argentina wins 
England vs Poland : England wins 


### Quarter Finals

In [651]:
q1 = ['Russia','Argentina']
q2 = ['England','Uruguay']
q3 = ['Spain','Brazil']
q4 = ['Germany','Denmark']
quart = [q1,q2,q3,q4]

In [652]:
for x in quart:
    sim_ko(x)

Russia vs Argentina : Argentina wins 
England vs Uruguay : England wins 
Spain vs Brazil : Brazil wins 
Germany vs Denmark : Germany wins 


### Semi Finals

In [653]:
s1 = ['Argentina','Germany']
s2 = ['England','Brazil']
semi = [s1,s2]    

In [655]:
sim_ko(['Germany','Brazil'])for x in semi: 
    sim_ko(x)

Argentina vs Germany : Germany wins 
England vs Brazil : Brazil wins 


### 3rd / 4th Place

In [659]:
sim_ko(['Argentina','England'])

Argentina vs England : Argentina wins 


### Finals

In [656]:
sim_ko(['Germany','Brazil'])

Germany vs Brazil : Germany wins 
