In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw match results and show the column dtypes and the
# (rows, columns) shape before previewing the first rows.
data = pd.read_csv("football.csv")
print(data.dtypes, data.shape, sep="\n")
data.head()

date          object
home_team     object
away_team     object
home_score     int64
away_score     int64
tournament    object
city          object
country       object
neutral         bool
dtype: object
(41586, 9)


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [3]:
# Change date to year.
# pd.to_datetime interprets integers as nanoseconds since the Unix epoch, so
# converting a column that already holds integer years would turn every value
# into 1970. Convert only while the column has not been converted yet, which
# keeps this cell safe to re-run.
if not pd.api.types.is_integer_dtype(data['date']):
    data['date'] = pd.to_datetime(data['date']).dt.year
print(data.dtypes)
data.head()

date           int64
home_team     object
away_team     object
home_score     int64
away_score     int64
tournament    object
city          object
country       object
neutral         bool
dtype: object


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873,England,Scotland,4,2,Friendly,London,England,False
2,1874,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875,England,Scotland,2,2,Friendly,London,England,False
4,1876,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


# Number of Games

In [4]:
# Count the number of games per year.
# size() counts the rows in each group, so the tally does not depend on any
# particular column being free of missing values (count() reports non-null
# cells per column instead). groupby does not modify `data`, so no defensive
# copy is needed.
yearlyGames = (
    data.groupby('date')
    .size()
    .reset_index(name='count')
    .rename(columns={'date': 'year'})
)
yearlyGames

Unnamed: 0,year,count
0,1872,1
1,1873,1
2,1874,1
3,1875,1
4,1876,2
...,...,...
144,2016,926
145,2017,958
146,2018,905
147,2019,1155


In [5]:
# Write the per-year game counts to disk; the row index is left out of the file.
yearlyGames.to_csv(path_or_buf="football_q1.csv", index=False)

# Win Percentage

In [6]:
# Keep only the matches played in 1970 or later.
# The selection is copied so that `winner` is an independent frame: columns
# added to it afterwards do not touch `data`.
winner = data.loc[data['date'] >= 1970].copy()
print(winner.shape)
winner.head()

(34238, 9)


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
7348,1970,Malta,Luxembourg,1,1,Friendly,Gżira,Malta,False
7349,1970,England,Netherlands,0,0,Friendly,London,England,False
7350,1970,Israel,Netherlands,0,1,Friendly,Jaffa,Israel,False
7351,1970,Peru,Czechoslovakia,0,2,Friendly,Lima,Peru,False
7352,1970,Cameroon,Ivory Coast,3,2,African Cup of Nations,Khartoum,Sudan,True


In [7]:
# Determine the winning team of every match.
# The two conditions are mutually exclusive; a draw matches neither and is
# recorded as None.
home_won = winner['home_score'] > winner['away_score']
away_won = winner['home_score'] < winner['away_score']
winner['winner'] = np.select(
    [home_won, away_won],
    [winner['home_team'], winner['away_team']],
    default=None,
)
winner.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winner
7348,1970,Malta,Luxembourg,1,1,Friendly,Gżira,Malta,False,
7349,1970,England,Netherlands,0,0,Friendly,London,England,False,
7350,1970,Israel,Netherlands,0,1,Friendly,Jaffa,Israel,False,Netherlands
7351,1970,Peru,Czechoslovakia,0,2,Friendly,Lima,Peru,False,Czechoslovakia
7352,1970,Cameroon,Ivory Coast,3,2,African Cup of Nations,Khartoum,Sudan,True,Cameroon


In [8]:
# Count the number of games played per team (home and away appearances).
# fill_value=0 keeps teams that appear on only one side of the fixture list.
home_games = winner['home_team'].value_counts()
away_games = winner['away_team'].value_counts()
# to_frame("played") names the column explicitly. Building the frame with
# pd.DataFrame(series, columns=[...]) selects by the Series' own name, which
# yields an all-NaN column whenever value_counts() returns a named Series.
# rename_axis(None) keeps the index unnamed, which the later
# reset_index()/rename({"index": "country"}) step relies on.
win_pct = (
    away_games.add(home_games, fill_value=0)
    .to_frame("played")
    .rename_axis(None)
)
# Count the number of games won; draws are None in `winner` and are not counted.
win_pct['won'] = winner['winner'].value_counts()
# Teams that never won are missing from the win counts: fill with 0, then
# restore integer dtypes (the additions/alignment above produce floats).
win_pct['won'] = win_pct['won'].fillna(0)
win_pct = win_pct.astype({'played': 'int64', 'won': 'int64'})
print(win_pct.shape)
win_pct.head()

(308, 2)


Unnamed: 0,played,won
Abkhazia,28,12
Afghanistan,109,30
Albania,297,76
Alderney,19,3
Algeria,480,206


In [9]:
# Load the country -> code lookup table, indexed by country name so it can be
# joined on the index.
world_code = pd.read_csv("world_code.csv", index_col="country")
print(world_code.shape)
world_code.head()

(246, 1)


Unnamed: 0_level_0,code
country,Unnamed: 1_level_1
Antigua and Barbuda,ATG
Algeria,DZA
Azerbaijan,AZE
Albania,ALB
Armenia,ARM


In [10]:
# Attach the country code to each team's game stats (left join on the index,
# so teams without a matching code keep a missing value in `code`).
# The index is named "country" before it is turned into a column, so the
# resulting column name does not depend on the index being unnamed.
win_pct = (
    win_pct.join(world_code)
    .rename_axis("country")
    .reset_index()
)
win_pct.head()

Unnamed: 0,country,played,won,code
0,Abkhazia,28,12,
1,Afghanistan,109,30,AFG
2,Albania,297,76,ALB
3,Alderney,19,3,
4,Algeria,480,206,DZA


In [11]:
# Win percentage = games won / games played, stored as a fraction.
win_pct = win_pct.assign(win_pct=win_pct['won'].div(win_pct['played']))
print(win_pct.shape)
win_pct.head()

(308, 5)


Unnamed: 0,country,played,won,code,win_pct
0,Abkhazia,28,12,,0.428571
1,Afghanistan,109,30,AFG,0.275229
2,Albania,297,76,ALB,0.255892
3,Alderney,19,3,,0.157895
4,Algeria,480,206,DZA,0.429167


In [12]:
# Keep the teams whose number of games played is strictly above the 75th
# percentile, then drop every row that still has a missing value (after the
# join with the code table, that removes teams without a country code).
# NOTE: the threshold is computed from the current contents of `win_pct`, so
# re-running this cell on the already-filtered frame filters it again.
top_quartile = win_pct['played'].quantile(0.75)
print(top_quartile)
win_pct = win_pct.loc[win_pct['played'].gt(top_quartile)].dropna()
win_pct.head()

377.25


Unnamed: 0,country,played,won,code,win_pct
4,Algeria,480,206,DZA,0.429167
12,Argentina,579,301,ARG,0.519862
16,Australia,444,221,AUS,0.497748
17,Austria,425,167,AUT,0.392941
20,Bahrain,485,164,BHR,0.338144


In [13]:
# Order teams from highest to lowest win percentage and renumber the rows
# 0..n-1 so the index reflects the sorted position.
win_pct = (
    win_pct
    .sort_values(by="win_pct", ascending=False)
    .reset_index(drop=True)
)
win_pct.head(10)

Unnamed: 0,country,played,won,code,win_pct
0,Brazil,669,434,BRA,0.648729
1,Spain,521,319,ESP,0.612284
2,Germany,617,377,DEU,0.611021
3,France,528,306,FRA,0.579545
4,Netherlands,498,283,NLD,0.568273
5,England,564,308,GBR,0.546099
6,Iran,478,260,IRN,0.543933
7,Portugal,460,245,PRT,0.532609
8,Italy,539,283,ITA,0.525046
9,South Korea,761,396,KOR,0.520368


In [16]:
# Rank is the row label plus one, so it is 1-based only if the frame was
# sorted and re-indexed from 0 beforehand.
win_pct = win_pct.assign(rank=win_pct.index + 1)
# Export the ten highest-ranked teams; the row index is left out of the file.
win_pct.head(10).to_csv(path_or_buf="football_q2.csv", index=False)