# Import Data

In [1]:
import pandas as pd
df = pd.read_csv('all_leagues_data.csv', index_col=0)
df

Unnamed: 0,Home_Team,Away_Team,Result,Link,Season,Round,League
0,Millwall,Southend United,3-1,https://www.besoccer.com/match/millwall-fc/sou...,1995,1,championship
1,Portsmouth,Notts County,2-1,https://www.besoccer.com/match/portsmouth/nott...,1995,1,championship
2,Stoke City,Tranmere Rovers,1-0,https://www.besoccer.com/match/stoke-city/tran...,1995,1,championship
3,Barnsley,Derby County,2-1,https://www.besoccer.com/match/barnsley-fc/der...,1995,1,championship
4,Middlesbrough,Burnley,2-0,https://www.besoccer.com/match/middlesbrough-f...,1995,1,championship
...,...,...,...,...,...,...,...
146493,Frosinone,Pro Vercelli,2-1,https://www.besoccer.com/match/frosinone-calci...,2017,42,serie_b
146494,Perugia,Salernitana,3-2,https://www.besoccer.com/match/perugia/salerni...,2017,42,serie_b
146495,Vicenza,Spezia,0-1,https://www.besoccer.com/match/vicenza-calcio/...,2017,42,serie_b
146496,Ascoli,Ternana Calcio,1-2,https://www.besoccer.com/match/ascoli/ternana-...,2017,42,serie_b


# Data Cleaning and Exploratory Data Analysis

### General Info

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146498 entries, 0 to 146497
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Home_Team  146498 non-null  object
 1   Away_Team  146498 non-null  object
 2   Result     146498 non-null  object
 3   Link       146498 non-null  object
 4   Season     146498 non-null  int64 
 5   Round      146498 non-null  int64 
 6   League     146498 non-null  object
dtypes: int64(2), object(5)
memory usage: 8.9+ MB


Findings:
- No null values
- Dtypes as expected
- Only 7 features

Need to go deeper into each feature...

### Data Cleaning

In [3]:
# Check if all links appear to be valid
df['Link'].str.contains('https://www.besoccer.com/match/').value_counts()

True    146498
Name: Link, dtype: int64

In [4]:
# Check results are valid
print(df['Result'].str.len().value_counts())
print(df[df['Result'].str.len() != 3]['Result'])

# Drop 'Results' that do not add value to analysis
df.drop(df[df['Result'].str.len() != 3].index, inplace=True)

3    146487
4         5
9         4
6         2
Name: Result, dtype: int64
10615          10-2
33098        17 JAN
35163     1 (0-0) 1
42888          10-0
53431        18 MAR
57909          0-13
58699          10-0
60698     0 (0-0) 0
62800          10-0
140590    3 (3-2) 2
140592    0 (0-1) 1
Name: Result, dtype: object


In [5]:
# Check same No. home teams vs away teams in database
ht_count = df['Home_Team'].value_counts()
at_count = df['Away_Team'].value_counts()
diff_count = pd.concat([ht_count, at_count])
print(len(ht_count), len(at_count), len(diff_count))

538 545 1083


In [6]:
import difflib

# Create dictionary of team names to standardise
home_teams = ht_count.keys().tolist()
away_teams = at_count.keys().tolist()
all_teams = home_teams + away_teams
print(len(home_teams), len(away_teams), len(all_teams))

teams_dict = {}

for team in all_teams:
    teams_dict[team] = difflib.get_close_matches(team, away_teams)[0]

df['Home_Team_New'] = df['Home_Team'].apply(lambda x: teams_dict[x])
df['Away_Team_New'] = df['Away_Team'].apply(lambda x: teams_dict[x])

print((df['Home_Team'] == df['Home_Team_New']).value_counts())
print((df['Away_Team'] == df['Away_Team_New']).value_counts())


538 545 1083
True     146373
False       114
dtype: int64
True    146487
dtype: int64


In [7]:
# Convert results string to Home Team and Away Team scores as Integars
df['Home_Team_Score'] = df['Result'].apply(lambda x: int(x.split('-')[0]))
df['Away_Team_Score'] = df['Result'].apply(lambda x: int(x.split('-')[1]))
df.drop(columns=['Result'], inplace=True)
df

Unnamed: 0,Home_Team,Away_Team,Link,Season,Round,League,Home_Team_New,Away_Team_New,Home_Team_Score,Away_Team_Score
0,Millwall,Southend United,https://www.besoccer.com/match/millwall-fc/sou...,1995,1,championship,Millwall,Southend United,3,1
1,Portsmouth,Notts County,https://www.besoccer.com/match/portsmouth/nott...,1995,1,championship,Portsmouth,Notts County,2,1
2,Stoke City,Tranmere Rovers,https://www.besoccer.com/match/stoke-city/tran...,1995,1,championship,Stoke City,Tranmere Rovers,1,0
3,Barnsley,Derby County,https://www.besoccer.com/match/barnsley-fc/der...,1995,1,championship,Barnsley,Derby County,2,1
4,Middlesbrough,Burnley,https://www.besoccer.com/match/middlesbrough-f...,1995,1,championship,Middlesbrough,Burnley,2,0
...,...,...,...,...,...,...,...,...,...,...
146493,Frosinone,Pro Vercelli,https://www.besoccer.com/match/frosinone-calci...,2017,42,serie_b,Frosinone,Pro Vercelli,2,1
146494,Perugia,Salernitana,https://www.besoccer.com/match/perugia/salerni...,2017,42,serie_b,Perugia,Salernitana,3,2
146495,Vicenza,Spezia,https://www.besoccer.com/match/vicenza-calcio/...,2017,42,serie_b,Vicenza,Spezia,0,1
146496,Ascoli,Ternana Calcio,https://www.besoccer.com/match/ascoli/ternana-...,2017,42,serie_b,Ascoli,Ternana Calcio,1,2


In [8]:
df['Region'] = df['League']
values_to_update ={
    'Region': {
        'segunda_division': 'Spain',
        'primera_division': 'Spain',
        'serie_b': 'Italy',
        'serie_a': 'Italy',
        'premier_league': 'England',
        'championship': 'England',
        'ligue_1': 'France',
        'ligue_2': 'France',
        '2_liga': 'Germany',
        'bundesliga': 'Germany',
        'eredivisie': 'Netherlands',
        'eerste_divisie': 'Netherlands',
        'primeira_liga': 'Portugal',
        'segunda_liga': 'Portugal'        
        }
}

df = df.replace(values_to_update)
df

Unnamed: 0,Home_Team,Away_Team,Link,Season,Round,League,Home_Team_New,Away_Team_New,Home_Team_Score,Away_Team_Score,Region
0,Millwall,Southend United,https://www.besoccer.com/match/millwall-fc/sou...,1995,1,championship,Millwall,Southend United,3,1,England
1,Portsmouth,Notts County,https://www.besoccer.com/match/portsmouth/nott...,1995,1,championship,Portsmouth,Notts County,2,1,England
2,Stoke City,Tranmere Rovers,https://www.besoccer.com/match/stoke-city/tran...,1995,1,championship,Stoke City,Tranmere Rovers,1,0,England
3,Barnsley,Derby County,https://www.besoccer.com/match/barnsley-fc/der...,1995,1,championship,Barnsley,Derby County,2,1,England
4,Middlesbrough,Burnley,https://www.besoccer.com/match/middlesbrough-f...,1995,1,championship,Middlesbrough,Burnley,2,0,England
...,...,...,...,...,...,...,...,...,...,...,...
146493,Frosinone,Pro Vercelli,https://www.besoccer.com/match/frosinone-calci...,2017,42,serie_b,Frosinone,Pro Vercelli,2,1,Italy
146494,Perugia,Salernitana,https://www.besoccer.com/match/perugia/salerni...,2017,42,serie_b,Perugia,Salernitana,3,2,Italy
146495,Vicenza,Spezia,https://www.besoccer.com/match/vicenza-calcio/...,2017,42,serie_b,Vicenza,Spezia,0,1,Italy
146496,Ascoli,Ternana Calcio,https://www.besoccer.com/match/ascoli/ternana-...,2017,42,serie_b,Ascoli,Ternana Calcio,1,2,Italy


In [9]:
df = df.loc[:, ['Region', 'League', 'Season', 'Round', 'Home_Team_New', 'Home_Team_Score', 'Away_Team_Score', 'Away_Team_New', 'Link']]
df.rename(columns={'Home_Team_New': 'Home_Team', 'Away_Team_New': 'Away_Team', 'Season': 'Year'}, inplace=True)
df


Unnamed: 0,Region,League,Year,Round,Home_Team,Home_Team_Score,Away_Team_Score,Away_Team,Link
0,England,championship,1995,1,Millwall,3,1,Southend United,https://www.besoccer.com/match/millwall-fc/sou...
1,England,championship,1995,1,Portsmouth,2,1,Notts County,https://www.besoccer.com/match/portsmouth/nott...
2,England,championship,1995,1,Stoke City,1,0,Tranmere Rovers,https://www.besoccer.com/match/stoke-city/tran...
3,England,championship,1995,1,Barnsley,2,1,Derby County,https://www.besoccer.com/match/barnsley-fc/der...
4,England,championship,1995,1,Middlesbrough,2,0,Burnley,https://www.besoccer.com/match/middlesbrough-f...
...,...,...,...,...,...,...,...,...,...
146493,Italy,serie_b,2017,42,Frosinone,2,1,Pro Vercelli,https://www.besoccer.com/match/frosinone-calci...
146494,Italy,serie_b,2017,42,Perugia,3,2,Salernitana,https://www.besoccer.com/match/perugia/salerni...
146495,Italy,serie_b,2017,42,Vicenza,0,1,Spezia,https://www.besoccer.com/match/vicenza-calcio/...
146496,Italy,serie_b,2017,42,Ascoli,1,2,Ternana Calcio,https://www.besoccer.com/match/ascoli/ternana-...


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146487 entries, 0 to 146497
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Region           146487 non-null  object
 1   League           146487 non-null  object
 2   Year             146487 non-null  int64 
 3   Round            146487 non-null  int64 
 4   Home_Team        146487 non-null  object
 5   Home_Team_Score  146487 non-null  int64 
 6   Away_Team_Score  146487 non-null  int64 
 7   Away_Team        146487 non-null  object
 8   Link             146487 non-null  object
dtypes: int64(4), object(5)
memory usage: 11.2+ MB


In [11]:
df.describe()

Unnamed: 0,Year,Round,Home_Team_Score,Away_Team_Score
count,146487.0,146487.0,146487.0,146487.0
mean,2005.518688,18.878078,1.500577,1.067433
std,9.034836,11.088508,1.260164,1.081524
min,1990.0,1.0,0.0,0.0
25%,1998.0,9.0,1.0,0.0
50%,2005.0,19.0,1.0,1.0
75%,2013.0,28.0,2.0,2.0
max,2021.0,46.0,9.0,9.0


### Exploratory Data Analysis

What do we want to find out:
- How many teams are in each league?
- Which league did each team play in?
- How long was a season in each league?
- How many goals are scored by home and away teams in each league?
- Who is the most (and least) successful team in each league?
- How have these results changed over time?

In [12]:
# How many teams were in each league every year?
teams_year_table = pd.pivot_table(df, values='Home_Team', index='Year', columns='League', aggfunc=lambda x: len(x.unique()))
teams_year_table

League,2_liga,bundesliga,championship,eerste_divisie,eredivisie,ligue_1,ligue_2,premier_league,primeira_liga,primera_division,segunda_division,segunda_liga,serie_a,serie_b
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1990,20.0,18.0,24.0,,18.0,10.0,,20.0,18.0,20.0,20.0,,18.0,20.0
1991,20.0,18.0,24.0,,18.0,20.0,,20.0,20.0,20.0,20.0,,18.0,20.0
1992,12.0,20.0,24.0,,18.0,20.0,,22.0,18.0,20.0,20.0,,18.0,20.0
1993,24.0,18.0,24.0,,18.0,20.0,,22.0,18.0,20.0,20.0,,18.0,20.0
1994,20.0,18.0,24.0,,18.0,20.0,22.0,22.0,18.0,20.0,20.0,,18.0,20.0
1995,18.0,18.0,24.0,,18.0,20.0,22.0,22.0,18.0,20.0,20.0,,18.0,20.0
1996,18.0,18.0,24.0,,18.0,20.0,22.0,20.0,18.0,22.0,20.0,,18.0,20.0
1997,18.0,18.0,24.0,9.0,18.0,20.0,22.0,20.0,18.0,22.0,20.0,,18.0,20.0
1998,18.0,18.0,24.0,9.0,18.0,18.0,22.0,20.0,18.0,20.0,22.0,9.0,18.0,20.0
1999,18.0,18.0,24.0,9.0,18.0,18.0,20.0,20.0,18.0,20.0,22.0,9.0,,20.0


In [None]:
# Which league has each team appeared in?
df_teams = df.groupby(by=['Home_Team', 'Year'])['League'].min()
df_teams = df_teams.to_frame().reset_index()
df_teams = df_teams.groupby(by=['Home_Team', 'League'])['Year'].count()
df_teams = df_teams.unstack(level=1)
df_teams.fillna(value=0, inplace=True)
df_teams


In [None]:
# How many rounds occur in each league every year?
rounds_year_table = pd.pivot_table(df, values='Round', index='Year', columns='League', aggfunc=max)
rounds_year_table

In [None]:
df.columns

In [None]:
# How many goals were scored by home and away teams in each league?

df_home_goals = df.groupby(by=['Region', 'League', 'Home_Team', 'Year'])['Home_Team_Score'].sum()
df_home_goals = df_home_goals.to_frame().reset_index()
df_home_goals

In [None]:
df_away_goals = df.groupby(by=['Region', 'League', 'Away_Team', 'Year'])['Away_Team_Score'].sum()
df_away_goals = df_away_goals.to_frame().reset_index()
df_away_goals

In [None]:
# Who is the most (and least) successful team in each league?
df_home = df.loc[:,['Home_Team', 'Home_Team_Score', 'Away_Team_Score', 'League', 'Region', 'Year', 'Round']]
df_home.rename(columns={'Home_Team': 'Team', 'Home_Team_Score': 'GS', 'Away_Team_Score': 'GC'}, inplace=True)
df_away = df.loc[:,['Away_Team', 'Away_Team_Score', 'Home_Team_Score', 'League', 'Region', 'Year', 'Round']]
df_away.rename(columns={'Away_Team': 'Team', 'Away_Team_Score': 'GS', 'Home_Team_Score': 'GC'}, inplace=True)
df_points = pd.concat([df_home, df_away]).reset_index(drop=True)

def win_func(x):
    if x['GS'] > x['GC']:
        return 3
    elif x['GS'] == x['GC']:
        return 1
    else:
        return 0

df_points['Result'] = df_points.apply(lambda row: win_func(row), axis=1)
df_points

In [None]:
df_result = df_points.groupby(by=['Region', 'League', 'Team', 'Year'])['Result'].sum()
df_result

# Feature Engineering

In [None]:
import pickle
pickle.load(open('elo_dict.pkl', 'rb'))