In [1]:
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize  
import matplotlib.pyplot as plt

# Import Matches

In [2]:
dataset = pd.read_json('/home/yves/Desktop/Data_Science/ADM1/adm_hw2/data/matches/matches_England.json')

#  Is there a home-field advantage?

In [3]:
# In order to work easily, I select only the most useful columns for this task. 
sub_dataset = dataset[['status','winner', 'label']] 

In [4]:
# Now I will extract all the information contained in the label column and rearrange it!
label = pd.DataFrame(sub_dataset['label'])

label.head(3)

Unnamed: 0,label
0,"Burnley - AFC Bournemouth, 1 - 2"
1,"Crystal Palace - West Bromwich Albion, 2 - 0"
2,"Huddersfield Town - Arsenal, 0 - 1"


In [5]:
new_dataset = label["label"].str.split(",",n = 1, expand = True) # split the column label

In [6]:
new_dataset.columns = ("teams", "score") # rename the columns

In [7]:
new_dataset.head(3)

Unnamed: 0,teams,score
0,Burnley - AFC Bournemouth,1 - 2
1,Crystal Palace - West Bromwich Albion,2 - 0
2,Huddersfield Town - Arsenal,0 - 1


In [8]:
teams = new_dataset["teams"].str.split("-",n = 1, expand = True) # split the teams column

In [9]:
# merge teams to the sub_dataset
dataset = pd.merge(sub_dataset, teams, left_index=True, right_index=True)

dataset.head(5)

Unnamed: 0,status,winner,label,0,1
0,Played,1659,"Burnley - AFC Bournemouth, 1 - 2",Burnley,AFC Bournemouth
1,Played,1628,"Crystal Palace - West Bromwich Albion, 2 - 0",Crystal Palace,West Bromwich Albion
2,Played,1609,"Huddersfield Town - Arsenal, 0 - 1",Huddersfield Town,Arsenal
3,Played,1612,"Liverpool - Brighton & Hove Albion, 4 - 0",Liverpool,Brighton & Hove Albion
4,Played,1611,"Manchester United - Watford, 1 - 0",Manchester United,Watford


In [10]:
# check il all the matches had been played
dataset['status'].unique()

array(['Played'], dtype=object)

In [11]:
dataset.drop(columns = ['label','status'], inplace=True)

In [12]:
# rename columns
dataset.columns = ("winner", "home", "away")
dataset.head(2)

Unnamed: 0,winner,home,away
0,1659,Burnley,AFC Bournemouth
1,1628,Crystal Palace,West Bromwich Albion


In [13]:
# adjust the home and away strings. Because they contain extra spaces
home = []
for i in dataset['home']:
    j = " ".join(i.split())
    home.append(j)
away = []
for i in dataset['away']:
    j = " ".join(i.split())
    away.append(j)

dataset['home'] = home
dataset['away'] = away

In [14]:
dataset.head(5)

Unnamed: 0,winner,home,away
0,1659,Burnley,AFC Bournemouth
1,1628,Crystal Palace,West Bromwich Albion
2,1609,Huddersfield Town,Arsenal
3,1612,Liverpool,Brighton & Hove Albion
4,1611,Manchester United,Watford


Now I convert the club's names into their Id

# Import Teams

In [15]:
teams = pd.read_json('/home/yves/Desktop/Data_Science/ADM1/adm_hw2/data/teams.json')

In [16]:
teams.columns

Index(['city', 'name', 'wyId', 'officialName', 'area', 'type'], dtype='object')

Now I extract the name of the country of each club in order to focus on the premier league

In [17]:
# transform the area column into the nountry name
L = []
for i in teams['area']:
    L.append(i['name'])
teams['area'] = L

In [18]:
# extract only the teams that are in England or Wales
keep = ['England', 'Wales']
teams = teams.loc[teams['area'].isin(keep)]

In [19]:
teams.head(5)

Unnamed: 0,city,name,wyId,officialName,area,type
0,Newcastle upon Tyne,Newcastle United,1613,Newcastle United FC,England,club
8,"Huddersfield, West Yorkshire",Huddersfield Town,1673,Huddersfield Town FC,England,club
19,Swansea,Swansea City,10531,Swansea City AFC,Wales,club
26,"Bournemouth, Dorset",AFC Bournemouth,1659,AFC Bournemouth,England,club
27,"Brighton, East Sussex",Brighton & Hove Albion,1651,Brighton & Hove Albion FC,England,club


In [20]:
 # select only clubs from the Premier League. Delete National teams and keep clubs
teams = teams[teams.type != 'national'] 

In [21]:
teams.shape

(20, 6)

In [22]:
dataset

Unnamed: 0,winner,home,away
0,1659,Burnley,AFC Bournemouth
1,1628,Crystal Palace,West Bromwich Albion
2,1609,Huddersfield Town,Arsenal
3,1612,Liverpool,Brighton & Hove Albion
4,1611,Manchester United,Watford
...,...,...,...
375,1623,Everton,Stoke City
376,0,Southampton,Swansea City
377,1627,West Bromwich Albion,AFC Bournemouth
378,0,Watford,Liverpool


In [23]:
# get the team's ID 
home_Id = []
for i in dataset['home']:
    j = teams.name[teams.name == i].index.tolist()[0]
    home_Id.append(teams['wyId'][j])

In [24]:
# get the team's ID 
away_Id = []
for i in dataset['away']:
    j = teams.name[teams.name == i].index.tolist()[0]
    away_Id.append(teams['wyId'][j])


In [25]:
dataset['home'] = home_Id
dataset['away'] = away_Id

In [26]:
dataset

Unnamed: 0,winner,home,away
0,1659,1646,1659
1,1628,1628,1627
2,1609,1673,1609
3,1612,1612,1651
4,1611,1611,1644
...,...,...,...
375,1623,1623,1639
376,0,1619,10531
377,1627,1627,1659
378,0,1644,1612


Ramdon selection of 5 teams for the Chi Squared test

In [27]:
small_sample = dataset['home'].unique()
small_sample = np.random.choice(small_sample, 5, replace=False)
small_sample

array([ 1609,  1633,  1651,  1646, 10531])

Remove that match played among the teams we randomly selected

In [28]:
import itertools
matches_to_remove = list(itertools.permutations(small_sample, 2))


In [29]:
rows_to_remove = []
for i in matches_to_remove:
    lst_h = dataset.index[dataset['home'] == i[0]].tolist() # all postions of the first team when playing at home
    lst_a = dataset.index[dataset['away'] == i[1]].tolist() # all positions of the second team when playing away
    rows_to_remove.append(list(set(lst_h).intersection(lst_a))) # the only position that is in common correspond 
                                                                # to the match to delete


In [30]:
# Delete row with index label 'b' 
for j in rows_to_remove:
    for p in j:
        dataset = dataset.drop(p)
dataset.shape

(360, 3)

### Contingency table

In [31]:
print('contingency table of 5 random teams', "\n")

draw_h_all = 0    # total number of draw when playing at home
draw_a_all = 0    # total number of draw when playing away
win_h_all = 0    # total number of win when playing at home
lose_h_all = 0    # total number of lose when playing at home
win_a_all = 0    # total number of win when playing away
lose_a_all = 0   # total number of lose when playing away 

for team in small_sample:
    
    draw_h = 0   # number of draw when playing at home
    draw_a = 0   # number of draw when playing away
    win_h = 0    # number of win when playing at home
    lose_h = 0    # number of lose when playing at home
    win_a = 0     #number of win when playing away
    lose_a = 0   # number of lose when playing away
    
    lst_h = dataset.index[dataset['home'] == team].tolist() # get the index of the team when playing at home
    lst_a = dataset.index[dataset['away'] == team].tolist() # get the index of the team when playing at home
    
    for pos in lst_h:
        if dataset['winner'][pos] == 0:   # when there is a zero, we have a draw
            draw_h += 1
        else:
            if dataset['winner'][pos] == dataset['home'][pos]:  # when playing at home count the number of wins
                win_h += 1
            else:
                lose_h += 1
    for pos in lst_a:
        if dataset['winner'][pos] == 0:      # when there is a zero, we have a draw
            draw_a += 1
        else:
            if dataset['winner'][pos] == dataset['away'][pos]:     # when playing away count the number of wins
                win_a += 1
            else:
                lose_a += 1
                
    draw_h_all += draw_h
    draw_a_all += draw_a
    win_h_all += win_h
    lose_h_all += lose_h
    win_a_all += win_a
    lose_a_all += lose_a
    
                
                # create the contingency table
    df = pd.DataFrame({'win':[win_h, win_a, (win_h + win_a)], 
                       'draw': [draw_h, draw_a, (draw_h + draw_a)], 
                       'lose' : [lose_h, lose_a, (lose_h + lose_a)],
                      'all': [(win_h + draw_h + lose_h), (win_a + draw_a + lose_a), 
                              (win_h + draw_h + lose_h + win_a + draw_a + lose_a)]})
    df.index = ('home', 'away', 'all')
    
    j = teams.name[teams.wyId == team].index.tolist()[0]
    team_name = teams['name'][j]
    
    print('team: '+ team_name)
    print(df, "\n")
    print('***********************************', "\n")

contingency table of 5 random teams 

team: Arsenal
      win  draw  lose  all
home   11     2     2   15
away    3     3     9   15
all    14     5    11   30 

*********************************** 

team: West Ham United
      win  draw  lose  all
home    6     5     4   15
away    3     5     7   15
all     9    10    11   30 

*********************************** 

team: Brighton & Hove Albion
      win  draw  lose  all
home    4     7     4   15
away    0     5    10   15
all     4    12    14   30 

*********************************** 

team: Burnley
      win  draw  lose  all
home    6     3     6   15
away    6     6     3   15
all    12     9     9   30 

*********************************** 

team: Swansea City
      win  draw  lose  all
home    3     3     9   15
away    2     6     7   15
all     5     9    16   30 

*********************************** 



In [32]:
print('overall contingency table')

df = pd.DataFrame({'win':[win_h_all, win_a_all, (win_h_all + win_a_all)], 
                   'draw': [draw_h_all, draw_a_all, (draw_h_all + draw_a_all)], 
                   'lose' : [lose_h_all, lose_a_all, (lose_h_all + lose_a_all)],
                  'all': [(win_h_all + draw_h_all + lose_h_all), (win_a_all + draw_a_all + lose_a_all), 
                          (win_h_all + draw_h_all + lose_h_all + win_a_all + draw_a_all + lose_a_all)]})
df.index = ('home', 'away', 'all')
print(df)

overall contingency table
      win  draw  lose  all
home   30    20    25   75
away   14    25    36   75
all    44    45    61  150


In [33]:
from scipy.stats import chi2_contingency
from scipy.stats import chi2

# contingency table

table = df
print(table, "\n")
print('Our NULL Hypothesis is that there is home field advantage in sport', "\n")

stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof, "\n")
print(expected, "\n")



# interpret test-statistic
prob = 0.95
critical = chi2.ppf(prob, dof)

print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat), "\n")

if abs(stat) >= critical:
    print('Dependent (reject H0)', "\n")
else:
    print('Independent (fail to reject H0)', "\n")
    
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p), "\n")
if p <= alpha:
    print('Dependent (reject H0)', "\n")
else:
    print('Independent (fail to reject H0)', "\n")



      win  draw  lose  all
home   30    20    25   75
away   14    25    36   75
all    44    45    61  150 

Our NULL Hypothesis is that there is home field advantage in sport 

dof=6 

[[ 22.   22.5  30.5  75. ]
 [ 22.   22.5  30.5  75. ]
 [ 44.   45.   61.  150. ]] 

probability=0.950, critical=12.592, stat=8.357 

Independent (fail to reject H0) 

significance=0.050, p=0.213 

Independent (fail to reject H0) 



# Conclusion

After our building our overall contingency table and performing our Chi Squared test. We argue that there is not a relationship between that field (home, away) and the result of the match (win, draw and lose). WE arrived to this conclusion because we fail to reject our Null Hypotesis (stating the correlation between the feild and the match's result). 