# Import and Clean Matches data

In [1]:
import pandas as pd
matches = pd.read_json("DataSet\\matches\\matches_England.json")
matches = matches[["status", "gameweek", "winner", "label" ]] # fetch necessary columns
matches = matches[matches.status =="Played"] # be sure that all matches are played
del matches["status"] # we don't need status anymore
matches = matches.sort_values(by=["gameweek"]) #sort the rows according to game week
matches.index = range(len(matches.index)) #fix the indices
matches

Unnamed: 0,gameweek,winner,label
0,1,1609,"Arsenal - Leicester City, 4 - 3"
1,1,1611,"Manchester United - West Ham United, 4 - 0"
2,1,1624,"Newcastle United - Tottenham Hotspur, 0 - 2"
3,1,1625,"Brighton & Hove Albion - Manchester City, 0 - 2"
4,1,1646,"Chelsea - Burnley, 2 - 3"
5,1,0,"Watford - Liverpool, 3 - 3"
6,1,1623,"Everton - Stoke City, 1 - 0"
7,1,0,"Southampton - Swansea City, 0 - 0"
8,1,1627,"West Bromwich Albion - AFC Bournemouth, 1 - 0"
9,1,1673,"Crystal Palace - Huddersfield Town, 0 - 3"


# Import and Clean Teams data

In [2]:
teams = pd.read_json("DataSet\\teams.json")
teams = teams[teams.type =="club"] # we don't want national teams
teams = teams[["name", "area", "wyId"]] # we don't need the other colums
teams

Unnamed: 0,name,area,wyId
0,Newcastle United,"{'name': 'England', 'id': '0', 'alpha3code': '...",1613
1,Celta de Vigo,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",692
2,Espanyol,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",691
3,Deportivo Alav\u00e9s,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",696
4,Levante,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",695
5,Troyes,"{'name': 'France', 'id': '250', 'alpha3code': ...",3795
6,Getafe,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",698
7,Borussia M'gladbach,"{'name': 'Germany', 'id': '276', 'alpha3code':...",2454
8,Huddersfield Town,"{'name': 'England', 'id': '0', 'alpha3code': '...",1673
9,Athletic Club,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",678


## Create a DataFrame called premier league and add only the teams in England and Wales

In [3]:
premier_league = pd.DataFrame(columns=["name", "area", "wyId"])
for i in range(len(teams)):
    country = teams.iloc[i,:][1]["name"]
    if country == "England" or country == "Wales":
        premier_league = premier_league.append(teams.iloc[i,:])

premier_league.index = range(len(premier_league.index)) #fix the indices
del premier_league["area"] # we don't need area anymore
premier_league

Unnamed: 0,name,wyId
0,Newcastle United,1613
1,Huddersfield Town,1673
2,Swansea City,10531
3,AFC Bournemouth,1659
4,Brighton & Hove Albion,1651
5,Burnley,1646
6,Leicester City,1631
7,West Ham United,1633
8,Stoke City,1639
9,Watford,1644


### create new columns called home and away to split the label string


In [4]:
home = []
away = []
for i in range(len(matches)):
    string = matches.iloc[i,:][2]
    game = string.split(",")[0].split("-")
    home.append(game[0])
    away.append(game[1])
matches["home"] = home
matches["away"] = away
del matches["label"] # we don't need label anymore
matches

Unnamed: 0,gameweek,winner,home,away
0,1,1609,Arsenal,Leicester City
1,1,1611,Manchester United,West Ham United
2,1,1624,Newcastle United,Tottenham Hotspur
3,1,1625,Brighton & Hove Albion,Manchester City
4,1,1646,Chelsea,Burnley
5,1,0,Watford,Liverpool
6,1,1623,Everton,Stoke City
7,1,0,Southampton,Swansea City
8,1,1627,West Bromwich Albion,AFC Bournemouth
9,1,1673,Crystal Palace,Huddersfield Town


for each team in premier league, a 2D array should be created to perform the chi^2 test.
in this manner the 2*3 sized array rows would be home and away matches. On the other hand
the columns would be the number of  win, draw and loose respectively matches. The for loop bellow forms such
2D arrays for each team in premier league.

In [5]:
all_contingency_tables = []
for i in range(len(premier_league)):
    team = premier_league.iloc[i,0]
    wyıd = premier_league.iloc[i,1]
    home_games = matches[matches["home"] == team + " "]
    away_games = matches[matches["away"] == " " + team]

    home_wins = len(home_games[home_games.winner == wyıd])
    home_draws = len(home_games[home_games.winner == 0])
    home_loses = len(home_games[home_games.winner != wyıd]) - home_draws

    away_wins = len(away_games[away_games.winner == wyıd])
    away_draws = len(away_games[away_games.winner == 0])
    away_loses = len(away_games[away_games.winner != wyıd]) - away_draws

    all_contingency_tables.append([[home_wins, home_draws, home_loses],
                                   [away_wins, away_draws, away_loses]])

premier_league["contingency_tables"] = all_contingency_tables

premier_league

Unnamed: 0,name,wyId,contingency_tables
0,Newcastle United,1613,"[[8, 4, 7], [4, 4, 11]]"
1,Huddersfield Town,1673,"[[6, 5, 8], [3, 5, 11]]"
2,Swansea City,10531,"[[6, 3, 10], [2, 6, 11]]"
3,AFC Bournemouth,1659,"[[7, 5, 7], [4, 6, 9]]"
4,Brighton & Hove Albion,1651,"[[7, 8, 4], [2, 6, 11]]"
5,Burnley,1646,"[[7, 5, 7], [7, 7, 5]]"
6,Leicester City,1631,"[[7, 6, 6], [5, 5, 9]]"
7,West Ham United,1633,"[[7, 6, 6], [3, 6, 10]]"
8,Stoke City,1639,"[[5, 5, 9], [2, 7, 10]]"
9,Watford,1644,"[[6, 7, 6], [4, 2, 13]]"


# CHI^2 TEST 

For premier league teams, does the result of the match (win, draw or loose) depends on pitch (home or away)?

H0 = The result of the match dosn't depend on playing at home or away. The variables are independent.

The Chi^2 test is applied to all teams in Premier league and their p values are added to premier_league
dataframe.

In [6]:
from scipy.stats import chi2_contingency
p_values =[]
for i in range(len(premier_league)):
    # contingency table
    table = all_contingency_tables[i]
    stat, p, dof, expected = chi2_contingency(table)
    p_values.append(p)

premier_league["P-Values"] = p_values

premier_league

Unnamed: 0,name,wyId,contingency_tables,P-Values
0,Newcastle United,1613,"[[8, 4, 7], [4, 4, 11]]",0.329193
1,Huddersfield Town,1673,"[[6, 5, 8], [3, 5, 11]]",0.478623
2,Swansea City,10531,"[[6, 3, 10], [2, 6, 11]]",0.21788
3,AFC Bournemouth,1659,"[[7, 5, 7], [4, 6, 9]]",0.560153
4,Brighton & Hove Albion,1651,"[[7, 8, 4], [2, 6, 11]]",0.042211
5,Burnley,1646,"[[7, 5, 7], [7, 7, 5]]",0.716531
6,Leicester City,1631,"[[7, 6, 6], [5, 5, 9]]",0.599223
7,West Ham United,1633,"[[7, 6, 6], [3, 6, 10]]",0.272532
8,Stoke City,1639,"[[5, 5, 9], [2, 7, 10]]",0.43351
9,Watford,1644,"[[6, 7, 6], [4, 2, 13]]",0.056227


In [7]:
# The teams having home field advantage when confidence interval is 95%
confidence95 = premier_league[premier_league["P-Values"] <= (1-0.95)]
confidence95

Unnamed: 0,name,wyId,contingency_tables,P-Values
4,Brighton & Hove Albion,1651,"[[7, 8, 4], [2, 6, 11]]",0.042211
19,Arsenal,1609,"[[15, 2, 2], [4, 4, 11]]",0.001316


In [8]:
# The teams having home field advantage when confidence interval is 90%
confidence90 = premier_league[premier_league["P-Values"] <= (1-0.90)]
confidence90

Unnamed: 0,name,wyId,contingency_tables,P-Values
4,Brighton & Hove Albion,1651,"[[7, 8, 4], [2, 6, 11]]",0.042211
9,Watford,1644,"[[6, 7, 6], [4, 2, 13]]",0.056227
16,Liverpool,1612,"[[12, 7, 0], [9, 5, 5]]",0.056081
19,Arsenal,1609,"[[15, 2, 2], [4, 4, 11]]",0.001316


In [9]:
# The teams having home field advantage when confidence interval is 80%
confidence80 = premier_league[premier_league["P-Values"] <= (1-0.80)]
confidence80

Unnamed: 0,name,wyId,contingency_tables,P-Values
4,Brighton & Hove Albion,1651,"[[7, 8, 4], [2, 6, 11]]",0.042211
9,Watford,1644,"[[6, 7, 6], [4, 2, 13]]",0.056227
10,Everton,1623,"[[9, 5, 5], [3, 7, 9]]",0.106661
11,West Bromwich Albion,1627,"[[3, 9, 7], [3, 4, 12]]",0.198011
16,Liverpool,1612,"[[12, 7, 0], [9, 5, 5]]",0.056081
19,Arsenal,1609,"[[15, 2, 2], [4, 4, 11]]",0.001316


Here we can claim that Arsenal is the team having the most Home field advantage.
The performans of the Arsenal team really depends on playing home field.
On the other hand, there is absolutely no home field advantage for Manchester City.
They are already unbeatable so playing in home field or not doesn't effect their performance.

Now lets apply the chi^2 test to first 5 contingency tables to see if there is a home field advantage.

In [10]:
import numpy as np
total = np.zeros((2,3))
for i in all_contingency_tables[:5]:
    total=np.sum([i, total], axis=0)
stat, p, dof, expected = chi2_contingency(total)
total
print("stat = "+ str(stat), "p-value ="+ str(p))

stat = 10.691461026934542 p-value =0.004768466503684661


as seen when we increase the size of the data, the chi^2 test becomes more precise and returns lower p-values. Which shows that on there is home field advantage overall in premier league and the variables are dependent.

Finally, Lets apply chi^2 test to 5 random teams 50 times to see number of times the H0 is rejected for different cases.

In [11]:
fail =0
reject =0
import random
for k in range(50):
    random.seed(k)
    total = np.zeros((2,3))
    for i in np.random.choice(20, 5):
        total=np.sum([all_contingency_tables[i], total], axis=0)
    stat, p, dof, expected = chi2_contingency(total)
    print("stat = "+ str(stat), "p-value ="+ str(p))
    if p>(1-0.95):
        print("Fail to reject H0")
        fail+=1
    else:
        print("reject H0")
        reject+=1
print("reject = "+str(reject), "fail= "+str(fail))

stat = 7.136612021857924 p-value =0.028203589809545595
reject H0
stat = 9.526723410781383 p-value =0.008536862688267987
reject H0
stat = 12.44789859677447 p-value =0.0019814045870218196
reject H0
stat = 8.48063973063973 p-value =0.014402984091108297
reject H0
stat = 12.147550326316757 p-value =0.0023024646031945713
reject H0
stat = 2.88336728943604 p-value =0.2365291921290503
Fail to reject H0
stat = 2.2271287747000312 p-value =0.3283863762978818
Fail to reject H0
stat = 3.0912911431172208 p-value =0.21317420758638506
Fail to reject H0
stat = 6.543859649122807 p-value =0.03793315206541126
reject H0
stat = 3.7723242584649834 p-value =0.1516527169273607
Fail to reject H0
stat = 8.904950605570129 p-value =0.011649694729559122
reject H0
stat = 1.2213064891125907 p-value =0.5429960439835159
Fail to reject H0
stat = 3.9109848484848486 p-value =0.1414947840745197
Fail to reject H0
stat = 3.323136123136123 p-value =0.1898410641105843
Fail to reject H0
stat = 4.869281045751634 p-value =0.087629