<p><strong>Objective:</strong> To build a model that can classify the total number of goals scored in a soccer match, given publicly available data</p>
<p><strong>Performance criteria:</strong> F1 score</p>

In [1]:
import os

# Folder that holds the competition CSV files. The original Windows location is kept as
# the default; set the FOOTBALL_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
path = os.environ.get("FOOTBALL_DATA_DIR", r"C:\Machine_learning_datafiles\footballPrediction")

In [2]:
import os, gc, warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings raised by later cells (e.g. dtype or
# chained-assignment warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Make the data folder the working directory so the relative file names used below resolve
os.chdir(path)

In [4]:
# Show the current working directory
os.getcwd()

'C:\\Machine_learning_datafiles\\footballPrediction'

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [6]:
# Render matplotlib figures inside the notebook
%matplotlib inline
# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.set_option('display.max_columns', None)

In [7]:
# Load the three semicolon-separated splits from the working directory
train, valid, test = (
    pd.read_csv(csv_name, sep=';')
    for csv_name in ('training_data.csv', 'validation_data.csv', 'testing_data.csv')
)

In [8]:
# (rows, columns) of each split — note the order: train, test, valid
train.shape, test.shape, valid.shape

((100000, 102), (500, 101), (50000, 102))

In [9]:
# Every team id that appears in the training set, whether as home or as away side
home_ids = set(train.teams_home_team_id)
away_ids = set(train.teams_away_team_id)
teams = pd.Series(list(home_ids | away_ids))

In [10]:
team_total = len(teams)
print(f'There are a total of {team_total:,} teams in the dataset.')

There are a total of 10,885 teams in the dataset.


In [11]:
league_total = train.league_name.nunique()
print(f'Total number of leagues are {league_total:,}')

Total number of leagues are 562


<b>Data Cleaning</b>

In [12]:
# Keep only the first occurrence of each fully identical row
is_repeat = train.duplicated()
train = train[~is_repeat]

In [13]:
rows_left = len(train)
print(f'Train set after removing duplicates is {rows_left:,}')

Train set after removing duplicates is 99,536


In [14]:
# Strip the '%' sign from the percentage/form columns and convert them to numbers.
for data_df in [train, test, valid]:
    # Every column from 'winning_percent_home' through 'teams_away_last_5_matches_def'
    # (label slices include both end points).
    pct_cols = data_df.loc[
        :, 'winning_percent_home':'teams_away_last_5_matches_def'].columns

    # Assign whole columns (df[cols] = ...) rather than df.loc[:, cols] = ...:
    # from pandas 2.0 the .loc form writes into the existing columns in place, so
    # text columns would keep dtype `object` even though their values are numbers.
    #
    # '[%nan]' is a character class: it deletes every '%', 'n' and 'a' character, so
    # '45%' becomes '45' and a literal 'nan' becomes '', which to_numeric reads as missing.
    data_df[pct_cols] = data_df[pct_cols].apply(
        lambda col: pd.to_numeric(col.replace('[%nan]', '', regex=True)))

In [15]:
# replacing -NAN.5
# NOTE(review): the pattern '.[0-9]' is unanchored and its dot is unescaped, so it matches
# ANY character followed by a digit (it would match '-2' in the string '-2.5' as well as
# '.5' in '-NAN.5'). With a non-string replacement pandas appears to swap the whole
# matching cell for NaN, so every string cell holding such a pair would be blanked, not
# only the '-NAN.5' placeholders — confirm against the raw data. Numeric cells are not
# affected by a regex replace.
for data_df in [train, test, valid]:
    data_df.loc[:, ['goals_home','goals_away']] = data_df.loc[
        :, ['goals_home','goals_away']].apply(lambda x: x.replace('.[0-9]', np.nan, regex=True))

In [16]:
# Parse the fixture date and derive calendar features from it
for data_df in [train, test, valid]:
    # NOTE(review): the parsed values shown later carry a time and a UTC offset, so the raw
    # strings hold more than '%Y-%m-%d' — newer pandas may reject them with this format; confirm.
    data_df['fixture_date'] = pd.to_datetime(data_df.fixture_date, format='%Y-%m-%d')

    when = data_df['fixture_date'].dt
    iso = when.isocalendar()  # ISO year / week / weekday, computed once

    data_df['fixture_year'] = iso.year
    data_df['fixture_week'] = iso.week
    # ISO weekday (1 = Monday ... 7 = Sunday), i.e. NOT the day of the month
    data_df['fixture_day'] = iso.day
    data_df['fixture_month'] = when.month
    # 0 = Monday ... 6 = Sunday
    data_df['fixture_weekday'] = when.weekday

Checking if home and away teams are the same

In [17]:
# Empty frame that will hold the 0/1 flags built in the next cell
ishomeaway = pd.DataFrame()

In [18]:
# NOTE(review): 'ishome' compares the home-team id column with itself, so it is 1 for every
# row with a non-missing id and carries no information — confirm what was intended.
ishomeaway['ishome'] = (train.teams_home_team_id == train.teams_home_team_id) * 1
# 1 when the same team id is listed as both the home and the away side of a fixture
ishomeaway['isAway'] = (train.teams_home_team_id == train.teams_away_team_id) * 1

In [19]:
# Tally each flag; the home-equals-away check is printed first
for flag in ('isAway', 'ishome'):
    print(ishomeaway[flag].value_counts())

0    99531
1        5
Name: isAway, dtype: int64
1    99536
Name: ishome, dtype: int64


In [20]:
same_team = ishomeaway['isAway'].eq(1)
ishomeaway[same_team]

Unnamed: 0,ishome,isAway
3827,1,1
6579,1,1
12643,1,1
14276,1,1
78053,1,1


In [21]:
# let's confirm: show the full fixtures whose home and away team ids coincide.
# Selecting by condition (rather than by hard-coded row labels) keeps this cell valid
# if the input data or the earlier de-duplication ever changes.
train.loc[train.teams_home_team_id == train.teams_away_team_id, :]

Unnamed: 0,fixture_id,teams_home_team_id,teams_away_team_id,fixture_date,fixture_venue,league_id,league_name,league_country,league_season,league_round,home_team_venue_name,home_team_venue_city,home_team_venue_capacity,home_team_country,home_team_founded,away_team_venue_name,away_team_venue_city,away_team_venue_capacity,away_team_country,away_team_founded,match_winner,under_over,goals_home,goals_away,winning_percent_home,winning_percent_draws,winning_percent_away,comparison_forme_home,comparison_forme_away,comparison_att_home,comparison_att_away,comparison_def_home,comparison_def_away,comparison_fish_law_home,comparison_fish_law_away,comparison_h2h_home,comparison_h2h_away,comparison_goals_h2h_home,comparison_goals_h2h_away,teams_home_last_5_matches_forme,teams_home_last_5_matches_att,teams_home_last_5_matches_def,teams_away_last_5_matches_forme,teams_away_last_5_matches_att,teams_away_last_5_matches_def,teams_home_last_5_matches_goals,teams_home_last_5_matches_goals_avg,teams_home_last_5_matches_goals_against,teams_home_last_5_matches_goals_against_avg,teams_home_all_last_matches_matchs_matchsPlayed_home,teams_home_all_last_matches_matchs_matchsPlayed_away,teams_home_all_last_matches_matchs_wins_home,teams_home_all_last_matches_matchs_wins_away,teams_home_all_last_matches_matchs_draws_home,teams_home_all_last_matches_matchs_draws_away,teams_home_all_last_matches_matchs_loses_home,teams_home_all_last_matches_matchs_loses_away,teams_home_all_last_matches_goals_goalsFor_home,teams_home_all_last_matches_goals_goalsFor_away,teams_home_all_last_matches_goals_goalsAgainst_home,teams_home_all_last_matches_goals_goalsAgainst_away,teams_home_all_last_matches_goalsAvg_goalsFor_home,teams_home_all_last_matches_goalsAvg_goalsFor_away,teams_home_all_last_matches_goalsAvg_goalsAgainst_home,teams_home_all_last_matches_goalsAvg_goalsAgainst_away,teams_home_last_h2h_played_home,teams_home_last_h2h_played_away,teams_home_last_h2h_wins_home,teams_home_last_h2h_wins_away,teams_home_last
_h2h_draws_home,teams_home_last_h2h_draws_away,teams_home_last_h2h_loses_home,teams_home_last_h2h_loses_away,teams_away_last_5_matches_goals,teams_away_last_5_matches_goals_avg,teams_away_last_5_matches_goals_against,teams_away_last_5_matches_goals_against_avg,teams_away_all_last_matches_matchs_matchsPlayed_home,teams_away_all_last_matches_matchs_matchsPlayed_away,teams_away_all_last_matches_matchs_wins_home,teams_away_all_last_matches_matchs_wins_away,teams_away_all_last_matches_matchs_draws_home,teams_away_all_last_matches_matchs_draws_away,teams_away_all_last_matches_matchs_loses_home,teams_away_all_last_matches_matchs_loses_away,teams_away_all_last_matches_goals_goalsFor_home,teams_away_all_last_matches_goals_goalsFor_away,teams_away_all_last_matches_goals_goalsAgainst_home,teams_away_all_last_matches_goals_goalsAgainst_away,teams_away_all_last_matches_goalsAvg_goalsFor_home,teams_away_all_last_matches_goalsAvg_goalsFor_away,teams_away_all_last_matches_goalsAvg_goalsAgainst_home,teams_away_all_last_matches_goalsAvg_goalsAgainst_away,teams_away_last_h2h_played_home,teams_away_last_h2h_played_away,teams_away_last_h2h_wins_home,teams_away_last_h2h_wins_away,teams_away_last_h2h_draws_home,teams_away_last_h2h_draws_away,teams_away_last_h2h_loses_home,teams_away_last_h2h_loses_away,outcome,fixture_year,fixture_week,fixture_day,fixture_month,fixture_weekday
3827,618759,1405,1405,2020-10-16 17:09:00+00:00,Sportplatz KAC,2909,Landesliga - Karnten,Austria,2020,Kärnten - 12,Sportplatz KAC,Klagenfurt,2000.0,Austria,,Sportplatz KAC,Klagenfurt,2000.0,Austria,,N 2,,-2.5,-2.5,10,45,45,50,50,50,50,50,50,25,75,50,50,50,50,47,39.0,67.0,47,39.0,67.0,7,1.4,6,1.2,11,11,3,8,2,2,6,1,18,20,19,9,1.6,1.8,1.7,0.8,9,9,3,4,2,2,4,3,7,1.4,6,1.2,11,11,3,8,2,2,6,1,18,20,19,9,1.6,1.8,1.7,0.8,9,9,3,4,2,2,4,3,Under,2020,42,5,10,4
6579,618759,1405,1405,2020-10-16 17:09:00+00:00,Sportplatz KAC,2909,Landesliga - Karnten,Austria,2020,Kärnten - 12,Sportplatz KAC,Klagenfurt,2000.0,Austria,,Sportplatz KAC,Klagenfurt,2000.0,Austria,,N 2,,-2.5,-2.5,10,45,45,50,50,50,50,50,50,24,76,50,50,50,50,47,35.0,70.0,47,35.0,70.0,7,1.4,6,1.2,11,11,3,8,2,2,6,1,18,20,19,9,1.6,1.8,1.7,0.8,9,9,3,4,2,2,4,3,7,1.4,6,1.2,11,11,3,8,2,2,6,1,18,20,19,9,1.6,1.8,1.7,0.8,9,9,3,4,2,2,4,3,Under,2020,42,5,10,4
12643,636846,9810,9810,2020-11-22 00:00:00+00:00,,2991,Tercera Division - Group 13,Spain,2020,Group 13 - 6,Estadio Municipal Cartagonova,Cartagena,15105.0,Spain,,Estadio Municipal Cartagonova,Cartagena,15105.0,Spain,,1 N,-3.5,-2.5,-1.5,45,45,10,50,50,50,50,50,50,90,10,0,0,0,0,53,55.0,73.0,53,55.0,73.0,6,1.2,3,0.6,3,2,2,0,1,1,0,1,5,1,1,2,1.7,0.5,0.3,1.0,0,0,0,0,0,0,0,0,6,1.2,3,0.6,3,2,2,0,1,1,0,1,5,1,1,2,1.7,0.5,0.3,1.0,0,0,0,0,0,0,0,0,Over,2020,47,7,11,6
14276,259490,1405,1405,2019-08-23 17:00:00+00:00,Sportplatz Welzenegg,1088,Landesliga - Karnten,Austria,2019,Kärnten - 5,Sportplatz KAC,Klagenfurt,2000.0,Austria,,Sportplatz KAC,Klagenfurt,2000.0,Austria,,,,,,33,33,33,50,50,50,50,50,50,0,0,50,50,50,50,33,64.0,27.0,33,64.0,27.0,7,1.4,8,1.6,4,4,0,1,2,1,2,2,4,7,8,9,1.0,1.8,2.0,2.3,9,9,3,4,2,2,4,3,7,1.4,8,1.6,4,4,0,1,2,1,2,2,4,7,8,9,1.0,1.8,2.0,2.3,9,9,3,4,2,2,4,3,Over,2019,34,5,8,4
78053,177710,4459,4459,2019-05-12 07:30:00+00:00,Kowloon Tsai Park - Field 1 (Kowloon),630,HKFA 1st Division,Hong-Kong,2018,Regular Season - 26,Tuen Mun Tang Shiu Kin Sports Ground,Tuen Mun,2500.0,Hong-Kong,1960.0,Tuen Mun Tang Shiu Kin Sports Ground,Tuen Mun,2500.0,Hong-Kong,1960.0,,,,,33,33,33,50,50,50,50,50,50,42,58,50,50,50,50,53,54.0,62.0,53,54.0,62.0,7,1.4,5,1.0,25,25,9,9,6,9,10,7,40,49,48,41,1.6,2.0,1.9,1.6,10,10,3,3,4,4,3,3,7,1.4,5,1.0,25,25,9,9,6,9,10,7,40,49,48,41,1.6,2.0,1.9,1.6,10,10,3,3,4,4,3,3,Over,2019,19,7,5,6


<b>A team cannot play against itself, so these rows are invalid and we drop them.</b>

In [22]:
# Drop fixtures where a team is listed against itself. Filtering by condition instead of
# by hard-coded row labels makes the cell safe to re-run (dropping labels that are already
# gone raises KeyError) and keeps it correct if the data changes.
train = train.loc[train.teams_home_team_id != train.teams_away_team_id].copy()

<h3> Feature Engineering</h3>

Since the objective is to predict the total goals scored by a team in a match, categorical variables that merely identify a particular team (such as the stadium of the home or away team, or the city where it is located) may not be useful. Hence, we will drop them and instead create some features that can be defined for any team, such as:

1. Whether the match is played at a home, away, or neutral ground.
2. The season in which the match was played: spring, summer, fall (autumn), or winter.

In [23]:
def _is_missing_venue(name):
    """Return True when a cleaned venue name is null, empty, or the literal text 'nan'."""
    return pd.isna(name) or str(name).strip() in ('', 'nan')


def _classify_venue(fixture, home, away):
    """Label one match 'Unknown', 'Away', 'Home' or 'Neutral' from its cleaned venue names."""
    # An empty name is a substring of every string, so it must count as missing rather
    # than as a match (this happens when a name consists only of stripped characters).
    if _is_missing_venue(fixture) or _is_missing_venue(home) or _is_missing_venue(away):
        return 'Unknown'

    fixture, home, away = str(fixture).lower(), str(home).lower(), str(away).lower()

    # The away ground is tested first (unchanged precedence): when both clubs list the
    # same ground, the match is tagged 'Away'.
    if away in fixture:
        return 'Away'
    if home in fixture:
        return 'Home'
    return 'Neutral'


def venue_status(df):
    """Add a 'venue_status' column saying where each fixture was played.

    The three venue-name columns ('fixture_venue', 'home_team_venue_name',
    'away_team_venue_name') are cleaned in place: every character other than ASCII
    letters, digits, apostrophes and whitespace is removed and the result is stripped.
    Each row is then labelled by case-insensitive substring matching:

    * 'Unknown' - any of the three names is missing or empty after cleaning
    * 'Away'    - the away team's venue name occurs in the fixture venue
    * 'Home'    - the home team's venue name occurs in the fixture venue
    * 'Neutral' - neither team's venue name occurs in the fixture venue

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three venue-name columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with cleaned venue columns and the new column.
    """
    venue_cols = ['fixture_venue', 'home_team_venue_name', 'away_team_venue_name']

    # Raw string so that \s reaches the regex engine without an invalid-escape warning.
    df[venue_cols] = df[venue_cols].apply(
        lambda col: col.replace(r"[^a-zA-Z0-9'\s]+", '', regex=True).str.strip())

    # Build the whole column in one pass; assigning cell by cell with df.loc inside
    # iterrows is slow on large frames and writes to several rows when labels repeat.
    df['venue_status'] = [
        _classify_venue(fixture, home, away)
        for fixture, home, away in zip(
            df['fixture_venue'], df['home_team_venue_name'], df['away_team_venue_name'])
    ]
    return df

In [24]:
# Add the venue_status column to the test split
test = venue_status(test)

In [26]:
# Add the venue_status column to the validation split
valid = venue_status(valid)

In [27]:
# Add the venue_status column to the training split
train = venue_status(train)

In [28]:
def weather(x):
    """Map a month number to a season name.

    Months 3-5 give 'spring', 6-8 'summer' and 9-11 'fall'; any other value
    (including 12, 1, 2 and anything unrecognised) gives 'winter'.
    """
    season_months = (
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
        ('fall', (9, 10, 11)),
    )
    for season, months in season_months:
        if x in months:
            return season
    return 'winter'

In [29]:
# Derive the season label from the fixture month for every split
for data_df in [train, valid, test]:
    data_df['weather_season'] = data_df['fixture_month'].map(weather)

In [30]:
# Columns removed before saving: the raw date, team venue details, founding years and the fixture id
drop_cols = [
    'fixture_date',
    'home_team_venue_name',
    'home_team_venue_city',
    'away_team_venue_city',
    'away_team_venue_name',
    'home_team_venue_capacity',
    'away_team_founded',
    'home_team_founded',
    'fixture_id',
    'away_team_venue_capacity',
]

In [31]:
# Preview the first two rows of the training split
train.head(2)

Unnamed: 0,fixture_id,teams_home_team_id,teams_away_team_id,fixture_date,fixture_venue,league_id,league_name,league_country,league_season,league_round,home_team_venue_name,home_team_venue_city,home_team_venue_capacity,home_team_country,home_team_founded,away_team_venue_name,away_team_venue_city,away_team_venue_capacity,away_team_country,away_team_founded,match_winner,under_over,goals_home,goals_away,winning_percent_home,winning_percent_draws,winning_percent_away,comparison_forme_home,comparison_forme_away,comparison_att_home,comparison_att_away,comparison_def_home,comparison_def_away,comparison_fish_law_home,comparison_fish_law_away,comparison_h2h_home,comparison_h2h_away,comparison_goals_h2h_home,comparison_goals_h2h_away,teams_home_last_5_matches_forme,teams_home_last_5_matches_att,teams_home_last_5_matches_def,teams_away_last_5_matches_forme,teams_away_last_5_matches_att,teams_away_last_5_matches_def,teams_home_last_5_matches_goals,teams_home_last_5_matches_goals_avg,teams_home_last_5_matches_goals_against,teams_home_last_5_matches_goals_against_avg,teams_home_all_last_matches_matchs_matchsPlayed_home,teams_home_all_last_matches_matchs_matchsPlayed_away,teams_home_all_last_matches_matchs_wins_home,teams_home_all_last_matches_matchs_wins_away,teams_home_all_last_matches_matchs_draws_home,teams_home_all_last_matches_matchs_draws_away,teams_home_all_last_matches_matchs_loses_home,teams_home_all_last_matches_matchs_loses_away,teams_home_all_last_matches_goals_goalsFor_home,teams_home_all_last_matches_goals_goalsFor_away,teams_home_all_last_matches_goals_goalsAgainst_home,teams_home_all_last_matches_goals_goalsAgainst_away,teams_home_all_last_matches_goalsAvg_goalsFor_home,teams_home_all_last_matches_goalsAvg_goalsFor_away,teams_home_all_last_matches_goalsAvg_goalsAgainst_home,teams_home_all_last_matches_goalsAvg_goalsAgainst_away,teams_home_last_h2h_played_home,teams_home_last_h2h_played_away,teams_home_last_h2h_wins_home,teams_home_last_h2h_wins_away,teams_home_last
_h2h_draws_home,teams_home_last_h2h_draws_away,teams_home_last_h2h_loses_home,teams_home_last_h2h_loses_away,teams_away_last_5_matches_goals,teams_away_last_5_matches_goals_avg,teams_away_last_5_matches_goals_against,teams_away_last_5_matches_goals_against_avg,teams_away_all_last_matches_matchs_matchsPlayed_home,teams_away_all_last_matches_matchs_matchsPlayed_away,teams_away_all_last_matches_matchs_wins_home,teams_away_all_last_matches_matchs_wins_away,teams_away_all_last_matches_matchs_draws_home,teams_away_all_last_matches_matchs_draws_away,teams_away_all_last_matches_matchs_loses_home,teams_away_all_last_matches_matchs_loses_away,teams_away_all_last_matches_goals_goalsFor_home,teams_away_all_last_matches_goals_goalsFor_away,teams_away_all_last_matches_goals_goalsAgainst_home,teams_away_all_last_matches_goals_goalsAgainst_away,teams_away_all_last_matches_goalsAvg_goalsFor_home,teams_away_all_last_matches_goalsAvg_goalsFor_away,teams_away_all_last_matches_goalsAvg_goalsAgainst_home,teams_away_all_last_matches_goalsAvg_goalsAgainst_away,teams_away_last_h2h_played_home,teams_away_last_h2h_played_away,teams_away_last_h2h_wins_home,teams_away_last_h2h_wins_away,teams_away_last_h2h_draws_home,teams_away_last_h2h_draws_away,teams_away_last_h2h_loses_home,teams_away_last_h2h_loses_away,outcome,fixture_year,fixture_week,fixture_day,fixture_month,fixture_weekday,venue_status,weather_season
0,570479,3396,3400,2020-10-31 00:00:00+00:00,Tsirion Athltiko Kentro,2658,1. Division,Cyprus,2020,Regular Season - 9,Tsirion Athltiko Kentro,Lemesós,13331.0,Cyprus,1930.0,Dimotiko Stadio Aradippou,Aradippou,2000.0,Cyprus,1958.0,1 N,,-2.5,-1.5,45,45,10,56,44,83,17,27,73,61,39,100,0,73,27,60,100.0,11.0,47,22.0,67.0,10,2.0,8,1.6,3,5,3,2,0,0,0,3,9,5,3,8,3.0,1.0,1.0,1.6,2,2,2,2,0,0,0,0,2,0.4,3,0.6,4,4,0,3,1,0,3,1,0,5,9,3,0.0,1.3,2.3,0.8,2,2,0,0,0,0,2,2,Over,2020,44,6,10,5,Home,fall
1,102820,2788,2777,2017-11-18 11:00:00+00:00,UMT Stadium Ubon Ratchathani,390,Thai Premier League,Thailand,2017,Regular Season - 34,Udon Thani Institute of PE Stadium,Udon Thani,3500.0,Thailand,2005.0,Sri Nakhon Lamduan Stadium,Sisaket,12000.0,Thailand,2012.0,1 N,1.5,-2.5,-2.5,45,45,10,100,0,50,50,73,27,0,0,50,50,67,33,47,50.0,50.0,0,50.0,0.0,8,1.6,8,1.6,16,17,6,5,7,4,3,8,29,24,24,29,1.8,1.4,1.5,1.7,2,3,1,1,0,0,1,2,8,1.6,22,4.4,17,16,4,2,5,0,8,14,26,16,41,47,1.5,1.0,2.4,2.9,3,2,2,1,0,0,1,1,Over,2017,46,6,11,5,Neutral,fall


In [32]:
# Remove the columns listed in drop_cols from every split, in place
for data_df in (train, valid, test):
    data_df.drop(columns=drop_cols, inplace=True)

In [33]:
# (rows, columns) of each split after cleaning — note the order: train, test, valid
train.shape, test.shape, valid.shape

((99531, 99), (500, 98), (50000, 99))

In [34]:
# Persist each cleaned split as a gzip-compressed pickle in the working directory
cleaned_outputs = (
    (train, 'train_cleaned.pkl'),
    (valid, 'valid_cleaned.pkl'),
    (test, 'test_cleaned.pkl'),
)
for frame, target in cleaned_outputs:
    frame.to_pickle(target, compression='gzip')