#### Part iii - Feature Engineering
Create new features that may be useful for running the model:
- Date, day of week, time
- Categorise wins (Home/Draw/Loss)
- Home/Away/Total win streak
- Home/Away/Total form (last 5 matches)
- Goals scored that season (as per game ratio)
- Red/Yellow/Total cards that season (as per game ratio)

Note some data will need to be normalised (i.e. divided as per game ratio) to account for inconsistent/incomplete dataset.

In [199]:
import pandas as pd
import os
import datetime as dt
import numpy as np

In [200]:
project_dir = os.path.dirname(os.path.abspath(''))
df = pd.read_json(os.path.join(project_dir, 'cleaned_dataset.json'))
df.head()

Unnamed: 0,Home_Team,Away_Team,Result,Link,Season,Round,League,Date_New,Referee,Home_Yellow,Home_Red,Away_Yellow,Away_Red,City,Country,Stadium,Capacity,Pitch,Elo_home,Elo_away
0,Perugia,Bologna,2-1,https://www.besoccer.com/match/perugia/bologna...,2015,1,serie_b,"Friday, 29 August 2014, 20:30",Referee: Claudio Gavillucci,4,0,4,2,Perugia,Italy,Stadio Renato Curi,28000,Natural,46,67
1,Avellino,Pro Vercelli,1-0,https://www.besoccer.com/match/us-avellino/us-...,2015,1,serie_b,"Saturday, 30 August 2014, 20:30",Referee: Ivano Pezzuto,2,0,4,1,Avellino,Italy,Stadio Partenio,10215,Natural,56,54
2,Catania,Virtus Lanciano,3-3,https://www.besoccer.com/match/catania/ss-virt...,2015,1,serie_b,"Saturday, 30 August 2014, 20:30",Referee: Daniele Minelli,6,1,3,1,Catania,Italy,Stadio Angelo Massimino,23420,Natural,67,48
3,Crotone,Ternana Calcio,0-2,https://www.besoccer.com/match/fc-crotone/tern...,2015,1,serie_b,"Saturday, 30 August 2014, 20:30",Referee: Maurizio Mariani,5,0,4,0,Crotone,Italy,Ezio Scida,16640,Natural,60,53
4,Virtus Entella,SSC Bari,0-2,https://www.besoccer.com/match/virtus-entella/...,2015,1,serie_b,"Saturday, 30 August 2014, 20:30",Referee: Leonardo Baracani,2,0,2,0,Chiavari,Italy,Stadio Comunale Chiavari,4154,Artificial,40,60


Remove unneeded columns.

In [201]:
df = df.drop(['Link', 'Round', 'Referee', 'City', 'Stadium', 'Pitch'], axis=1)

Convert non numerical data that will be used in the model, to numerical data.

In [202]:
df['Capacity'] = df['Capacity'].str.replace(',', '')
df['Capacity'] = df['Capacity'].astype('int64')

Create date and time features.

In [203]:
df['Date'] = df['Date_New'].apply(lambda x: x[x.find(',') + 2:x.rfind(',')])
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
#print(set(df['Date']))

df['Day'] = df['Date_New'].apply(lambda x: (x[:x.find(',')]))
df['Day'] = df['Date'].dt.day_of_week
print(set(df['Day']))

df['Time'] = df['Date_New'].apply(lambda x: x[x.rfind(',') + 2:])
print(set(df['Time']))
df.head()

df = df.drop(['Date_New'], axis=1)

{0, 1, 2, 3, 4, 5, 6}
{'14:55', '12:15', '21:00', '22:00', '14:37', '17:25', '19:25', '22:30', '21:30', '15:45', '13:50', '17:00', '16:30', '21:03', '17:07', '12:30', '18:15', '17:05', '20:35', '17:40', '14:00', '21:05', '19:10', '13:15', '17:45', '20:20', '16:45', '13:30', '17:15', '18:20', '19:30', '05:35', '17:50', '20:45', '20:40', '15:05', '22:05', '15:20', '20:50', '15:30', '18:25', '21:10', '23:30', '16:00', '19:39', '18:50', '19:55', '16:35', '18:55', '21:35', '15:40', '14:30', '19:15', '14:05', '17:30', '14:35', '23:00', '19:45', '21:55', '14:10', '18:00', '20:55', '15:15', '21:15', '19:50', '14:15', '12:45', '13:00', '20:15', '13:05', '16:15', '22:45', '14:50', '12:00', '01:00', '23:15', '13:45', '00:05', '21:45', '22:15', '21:08', '19:00', '20:30', '18:10', '18:35', '16:07', '16:10', '18:45', '18:30', '13:10', '14:45', '16:05', '15:10', '19:05', '11:30', '00:00', '18:40', '20:00', '17:10', '15:00'}


Put dataframe in chronological order and re-index.

In [204]:
df = df.sort_values(by='Date').reset_index(drop=True)
df

Unnamed: 0,Home_Team,Away_Team,Result,Season,League,Home_Yellow,Home_Red,Away_Yellow,Away_Red,Country,Capacity,Elo_home,Elo_away,Date,Day,Time
0,Olympique Lyonnais,Olympique,1-4,1990,ligue_1,2,0,2,0,France,59168,72,81,1989-07-21,4,00:00
1,Montpellier,Cannes,4-1,1990,ligue_1,1,0,0,0,France,32950,74,73,1989-07-22,5,00:00
2,Nantes,Auxerre,2-1,1990,ligue_1,1,0,2,0,France,38285,79,79,1989-07-22,5,00:00
3,Lille,Caen,1-0,1990,ligue_1,3,0,2,0,France,49834,76,72,1989-07-22,5,00:00
4,Sochaux,Stade Brestois,1-0,1990,ligue_1,1,0,2,0,France,20005,78,73,1989-07-22,5,00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105523,Olympique,Olympique Lyonnais,1-1,2021,ligue_1,4,0,4,1,France,67394,79,87,2021-02-28,6,21:00
105524,Lille,Strasbourg,1-1,2021,ligue_1,0,0,2,0,France,49834,79,69,2021-02-28,6,17:05
105525,Stade de Reims,Montpellier,0-0,2021,ligue_1,1,0,1,0,France,21628,70,74,2021-02-28,6,15:00
105526,Crystal Palace,Fulham,0-0,2021,premier_league,1,0,1,0,England,26309,75,69,2021-02-28,6,13:00


Create a result column where +ve goals equate to home wins, -ve to away wins and 0 to draws.

In [205]:
def get_result(result):
    result_list = result.split('-')
    home = int(result_list[0])
    away = int(result_list[1])
    outcome = np.sign(home - away)
    return home, away, outcome

df[['Home_Goals', 'Away_Goals', 'Outcome']] = df.apply(lambda df: get_result(df['Result']), axis=1, result_type='expand')
df.head()

Unnamed: 0,Home_Team,Away_Team,Result,Season,League,Home_Yellow,Home_Red,Away_Yellow,Away_Red,Country,Capacity,Elo_home,Elo_away,Date,Day,Time,Home_Goals,Away_Goals,Outcome
0,Olympique Lyonnais,Olympique,1-4,1990,ligue_1,2,0,2,0,France,59168,72,81,1989-07-21,4,00:00,1,4,-1
1,Montpellier,Cannes,4-1,1990,ligue_1,1,0,0,0,France,32950,74,73,1989-07-22,5,00:00,4,1,1
2,Nantes,Auxerre,2-1,1990,ligue_1,1,0,2,0,France,38285,79,79,1989-07-22,5,00:00,2,1,1
3,Lille,Caen,1-0,1990,ligue_1,3,0,2,0,France,49834,76,72,1989-07-22,5,00:00,1,0,1
4,Sochaux,Stade Brestois,1-0,1990,ligue_1,1,0,2,0,France,20005,78,73,1989-07-22,5,00:00,1,0,1


Calculate winning streak for home, away and both.

In [206]:
def adjust_series(df, team, inc_away=True):
    if inc_away:
        series = df['Outcome'] * (df['Away_Team'] == team).astype(int).apply(lambda x: (x * -2) + 1)
    else:
        series = df['Outcome']
    return series

def get_details(home, away, team):
    if home and away:
        mask = (df['Home_Team'] == team) | (df['Away_Team'] == team)
        col_name = 'Streak'
    elif home:
        mask = (df['Home_Team'] == team)
        col_name = 'Home_Streak'
    else:
        mask = (df['Away_Team'] == team)
        col_name = 'Away_Streak'
    return col_name, mask

perms = [[True, False], [False, True], [True, True]]
for p in perms:
    home, away = p
    for team in set(df['Home_Team'].to_list()):
        col_name, mask = get_details(home, away, team)
        series = adjust_series(df[mask], team, inc_away=away)
        streak_series = series.groupby((series != series.shift()).cumsum()).cumsum()
        df.loc[mask, col_name] = streak_series.where(streak_series > 0, 0)

In [212]:
df[~df['Away_Team'].isin(df['Home_Team'])]

Unnamed: 0,Home_Team,Away_Team,Result,Season,League,Home_Yellow,Home_Red,Away_Yellow,Away_Red,Country,...,Elo_away,Date,Day,Time,Home_Goals,Away_Goals,Outcome,Home_Streak,Away_Streak,Streak
44210,Gimnàstic,Pontevedra,0-0,2005,segunda_division,4,0,5,0,Spain,...,47,2004-11-21,6,00:00,0,0,0,0.0,,0.0
49452,Caen,FC Libourne Saint Seurin,2-1,2007,ligue_2,4,0,3,1,France,...,44,2006-07-28,4,20:00,2,1,1,6.0,,2.0
49455,Stade Brestois,FC Libourne Saint Seurin,2-2,2007,ligue_2,2,0,4,0,France,...,44,2006-08-04,4,20:00,2,2,0,0.0,,0.0
49501,Metz,FC Libourne Saint Seurin,1-0,2007,ligue_2,4,0,3,0,France,...,44,2006-08-11,4,20:00,1,0,1,3.0,,1.0
49603,Montpellier,FC Libourne Saint Seurin,0-1,2007,ligue_2,5,0,4,0,France,...,44,2006-08-25,4,20:00,0,1,-1,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66472,Padova,Calcio Portogruaro-Summaga,3-1,2011,serie_b,2,0,4,0,Italy,...,44,2011-04-09,5,15:00,3,1,1,1.0,,1.0
66642,Siena,Calcio Portogruaro-Summaga,1-2,2011,serie_b,2,0,4,0,Italy,...,44,2011-04-22,4,19:00,1,2,-1,0.0,,0.0
66846,Atalanta,Calcio Portogruaro-Summaga,4-1,2011,serie_b,3,0,2,0,Italy,...,44,2011-05-07,5,15:00,4,1,1,1.0,,1.0
67066,Modena,Calcio Portogruaro-Summaga,3-1,2011,serie_b,1,0,1,0,Italy,...,44,2011-05-21,5,15:00,3,1,1,1.0,,1.0


In [207]:
a = set(df['Away_Team'])
b = set(df['Home_Team'])
a == b

False

# STILL TO DO

Calculate form (i.e. W/D/L in last five games)
- Home/Away/Total form (last 5 matches)
- Goals scored that season (as per game ratio)
- Red/Yellow/Total cards that season (as per game ratio)

In [None]:
df[df['Home_Team']=='Watford'].groupby(by='Outcome').cumsum()#['Home_Goals'].count()
####TO BE UPDATED

Export dataset

In [None]:
df.to_json(os.path.join(project_dir, 'cleaned_dataset.json'))