In [1]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import datetime

DATA_PATH = 'data/'

In [2]:
# TODO: Make scraper module so that we can get data from inside Jupyter Notebook
# Get league data and league_standings

Once we have the data downloaded we can start to create our dataset. We will begin by using data for the last 12 seasons. This should give us enough data to make good predictions, going any further back and the data might not as relevant. We will use the current season 19/20 for test data.

Due to the nature of football data being time-series data (ie: matches occur over the course of a season) we will be using full seasons (or two seasons) for our test data. We can also use a repeated K-fold to check our accuracy. 

In [3]:
df1 = pd.read_csv(os.path.join(DATA_PATH, 'season0708.csv'))
df2 = pd.read_csv(os.path.join(DATA_PATH, 'season0809.csv'))
df3 = pd.read_csv(os.path.join(DATA_PATH, 'season0910.csv'))
df4 = pd.read_csv(os.path.join(DATA_PATH, 'season1011.csv'))
df5 = pd.read_csv(os.path.join(DATA_PATH, 'season1112.csv'))
df6 = pd.read_csv(os.path.join(DATA_PATH, 'season1213.csv'))
df7 = pd.read_csv(os.path.join(DATA_PATH, 'season1314.csv'))
df8 = pd.read_csv(os.path.join(DATA_PATH, 'season1415.csv'))
df9 = pd.read_csv(os.path.join(DATA_PATH, 'season1516.csv'))
df10 = pd.read_csv(os.path.join(DATA_PATH, 'season1617.csv'))
df11 = pd.read_csv(os.path.join(DATA_PATH, 'season1718.csv'))
df12 = pd.read_csv(os.path.join(DATA_PATH, 'season1819.csv'))
df13 = pd.read_csv(os.path.join(DATA_PATH, 'season1920.csv'))

Some of the dfs have missing data on the last row. We will remove these missing rows first. 

In [4]:
all_dfs = (df1, df2, df3, df4, df5, df6, df7, 
           df8, df9, df10, df11, df12, df13)
for df in all_dfs:
    df.dropna(subset=['Date'], axis=0, how='all', inplace=True)

In [5]:
def parse_date(date):
    """
    Converts date from string to datetime object.
    """
    return datetime.strptime(date, '%d/%m/%y').date()

def parse_date_other(date):
    if date == '':
        return None
    else:
        return datetime.strptime(date, '%d/%m/%Y').date()

In [6]:
df1.Date = df1.Date.apply(parse_date)
df2.Date = df2.Date.apply(parse_date)
df3.Date = df3.Date.apply(parse_date)
df4.Date = df4.Date.apply(parse_date)
df5.Date = df5.Date.apply(parse_date)
df6.Date = df6.Date.apply(parse_date)
df7.Date = df7.Date.apply(parse_date)
df8.Date = df8.Date.apply(parse_date)
df9.Date = df9.Date.apply(parse_date)
df10.Date = df10.Date.apply(parse_date)
df11.Date = df11.Date.apply(parse_date_other)
df12.Date = df12.Date.apply(parse_date_other)
df13.Date = df13.Date.apply(parse_date_other)

In [7]:
# Get only the columns we need to get stats
cols = ['Date', 'HomeTeam', 'AwayTeam', 'HS', 'AS', 
        'FTHG','FTAG', 'FTR', 'B365H', 'B365D', 'B365A', 'season']

playing_stats1 = df1[cols]
playing_stats2 = df2[cols]
playing_stats3 = df3[cols]
playing_stats4 = df4[cols]
playing_stats5 = df5[cols]
playing_stats6 = df6[cols]
playing_stats7 = df7[cols]
playing_stats8 = df8[cols]
playing_stats9 = df9[cols]
playing_stats10 = df10[cols]
playing_stats11 = df11[cols]
playing_stats12 = df12[cols]
playing_stats13 = df13[cols]

In [8]:
def get_matchweek(playing_stat):
    j = 1
    MatchWeek = []
    for i in range(len(playing_stat)):
        MatchWeek.append(j)
        if ((i + 1)% 10) == 0:
            j = j + 1
    playing_stat['MW'] = MatchWeek
    return playing_stat

playing_stats1 = get_matchweek(playing_stats1)
playing_stats2 = get_matchweek(playing_stats2)
playing_stats3 = get_matchweek(playing_stats3)
playing_stats4 = get_matchweek(playing_stats4)
playing_stats5 = get_matchweek(playing_stats5)
playing_stats6 = get_matchweek(playing_stats6)
playing_stats7 = get_matchweek(playing_stats7)
playing_stats8 = get_matchweek(playing_stats8)
playing_stats9 = get_matchweek(playing_stats9)
playing_stats10 = get_matchweek(playing_stats10)
playing_stats11 = get_matchweek(playing_stats11)
playing_stats12 = get_matchweek(playing_stats12)
playing_stats13 = get_matchweek(playing_stats13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
# Gets the goals scored agg arranged by teams and matchweek
def get_goals_scored(playing_stat):
    # Get the number of matchweeks in the season
    mw = max(playing_stat['MW'])   
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
    # the value corresponding to keys is a list containing the match location.
    for i in range(len(playing_stat)):
        HTGS = playing_stat.iloc[i]['FTHG']
        ATGS = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGS)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGS)
    
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    GoalsScored = pd.DataFrame(data=teams, index = [i for i in range(mw)]).T
    GoalsScored[0] = 0
    # Aggregate to get uptil that point
    for i in range(2,mw):
        GoalsScored[i] = GoalsScored[i] + GoalsScored[i-1]
    return GoalsScored

# Gets the goals conceded agg arranged by teams and matchweek
def get_goals_conceded(playing_stat):
    # Get the number of matchweeks in the season
    mw = max(playing_stat['MW'])  
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
    # the value corresponding to keys is a list containing the match location.
    for i in range(len(playing_stat)):
        ATGC = playing_stat.iloc[i]['FTHG']
        HTGC = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGC)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGC)
    
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    GoalsConceded = pd.DataFrame(data=teams, index = [i for i in range(mw)]).T
    GoalsConceded[0] = 0
    # Aggregate to get uptil that point
    for i in range(1, mw):
        GoalsConceded[i] = GoalsConceded[i] + GoalsConceded[i-1]
    return GoalsConceded

def get_gss(playing_stat):
    GC = get_goals_conceded(playing_stat)
    GS = get_goals_scored(playing_stat)
   
    j = 0
    HTGS = []
    ATGS = []
    HTGC = []
    ATGC = []

    for i in range(len(playing_stat)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTGS.append(GS.loc[ht][j])
        ATGS.append(GS.loc[at][j])
        HTGC.append(GC.loc[ht][j])
        ATGC.append(GC.loc[at][j])
        
        if ((i + 1)% 10) == 0:
            j = j + 1
        
    playing_stat['HTGS'] = HTGS
    playing_stat['ATGS'] = ATGS
    playing_stat['HTGC'] = HTGC
    playing_stat['ATGC'] = ATGC
    
    return playing_stat

# Apply to each dataset
playing_stats1 = get_gss(playing_stats1)
playing_stats2 = get_gss(playing_stats2)
playing_stats3 = get_gss(playing_stats3)
playing_stats4 = get_gss(playing_stats4)
playing_stats5 = get_gss(playing_stats5)
playing_stats6 = get_gss(playing_stats6)
playing_stats7 = get_gss(playing_stats7)
playing_stats8 = get_gss(playing_stats8)
playing_stats9 = get_gss(playing_stats9)
playing_stats10 = get_gss(playing_stats10)
playing_stats11 = get_gss(playing_stats11)
playing_stats12 = get_gss(playing_stats12)
playing_stats13 = get_gss(playing_stats13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

In [10]:
# Retrospective points
def get_points(result):
    if result == 'W':
        return 3
    elif result == 'D':
        return 1
    else:
        return 0
    
def get_cumulative_points(matchres, mw):
    matchres_points = matchres.applymap(get_points)
    for i in range(2, mw+1):
        matchres_points[i] = matchres_points[i] + matchres_points[i-1]
        
    matchres_points.insert(column =0, loc = 0, value = [0*i for i in range(20)])
    return matchres_points

def get_match_result(playing_stat):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []

    # the value corresponding to keys is a list containing the match result
    for i in range(len(playing_stat)):
        if playing_stat.iloc[i].FTR == 'H':
            teams[playing_stat.iloc[i].HomeTeam].append('W')
            teams[playing_stat.iloc[i].AwayTeam].append('L')
        elif playing_stat.iloc[i].FTR == 'A':
            teams[playing_stat.iloc[i].AwayTeam].append('W')
            teams[playing_stat.iloc[i].HomeTeam].append('L')
        else:
            teams[playing_stat.iloc[i].AwayTeam].append('D')
            teams[playing_stat.iloc[i].HomeTeam].append('D')
            
    return pd.DataFrame(data=teams, index = [i for i in range(1, max(playing_stat['MW']+1))]).T

def get_agg_points(playing_stat):
    matchres = get_match_result(playing_stat)
    cum_pts = get_cumulative_points(matchres, max(playing_stat['MW']))
    HTP = []
    ATP = []
    j = 0
    for i in range(len(playing_stat)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTP.append(cum_pts.loc[ht][j])
        ATP.append(cum_pts.loc[at][j])

        if ((i + 1)% 10) == 0:
            j = j + 1
            
    playing_stat['HTP'] = HTP
    playing_stat['ATP'] = ATP
    return playing_stat

playing_stats1 = get_agg_points(playing_stats1)
playing_stats2 = get_agg_points(playing_stats2)
playing_stats3 = get_agg_points(playing_stats3)
playing_stats4 = get_agg_points(playing_stats4)
playing_stats5 = get_agg_points(playing_stats5)
playing_stats6 = get_agg_points(playing_stats6)
playing_stats7 = get_agg_points(playing_stats7)
playing_stats8 = get_agg_points(playing_stats8)
playing_stats9 = get_agg_points(playing_stats9)
playing_stats10 = get_agg_points(playing_stats10)
playing_stats11 = get_agg_points(playing_stats11)
playing_stats12 = get_agg_points(playing_stats12)
playing_stats13 = get_agg_points(playing_stats13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
def get_form(playing_stat,num):
    form = get_match_result(playing_stat)
    form_final = form.copy()
    for i in range(num, max(playing_stat['MW'])+1):
        form_final[i] = ''
        j = 0
        while j < num:
            form_final[i] += form[i-j]
            j += 1
    return form_final

def add_form(playing_stat,num):
    form = get_form(playing_stat,num)
    h = ['M' for i in range(num * 10)]  # since form is not available for n MW (n*10)
    a = ['M' for i in range(num * 10)]
    
    j = num
    for i in range((num*10),playing_stat.shape[0]):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        
        past = form.loc[ht][j]               # get past n results
        h.append(past[num-1])                    # 0 index is most recent
        
        past = form.loc[at][j]               # get past n results.
        a.append(past[num-1])                   # 0 index is 39most recent
        
        if ((i + 1)% 10) == 0:
            j = j + 1

    playing_stat['HM' + str(num)] = h                 
    playing_stat['AM' + str(num)] = a

    
    return playing_stat


def add_form_df(playing_statistics):
    playing_statistics = add_form(playing_statistics,1)
    playing_statistics = add_form(playing_statistics,2)
    playing_statistics = add_form(playing_statistics,3)
    playing_statistics = add_form(playing_statistics,4)
    playing_statistics = add_form(playing_statistics,5)
    return playing_statistics

playing_stats1 = add_form_df(playing_stats1)
playing_stats2 = add_form_df(playing_stats2)
playing_stats3 = add_form_df(playing_stats3)
playing_stats4 = add_form_df(playing_stats4)
playing_stats5 = add_form_df(playing_stats5)
playing_stats6 = add_form_df(playing_stats6)
playing_stats7 = add_form_df(playing_stats7)
playing_stats8 = add_form_df(playing_stats8)
playing_stats9 = add_form_df(playing_stats9)
playing_stats10 = add_form_df(playing_stats10)
playing_stats11 = add_form_df(playing_stats11)
playing_stats12 = add_form_df(playing_stats12)
playing_stats13 = add_form_df(playing_stats13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
# Rearranging columns
cols = ['Date', 'season', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTGS', 'ATGS', 
        'HTGC', 'ATGC', 'HTP', 'ATP', 'B365H', 'B365D', 
        'B365A', 'HM1', 'HM2', 'HM3', 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'MW']

playing_stats1 = playing_stats1[cols]
playing_stats2 = playing_stats2[cols]
playing_stats3 = playing_stats3[cols]
playing_stats4 = playing_stats4[cols]
playing_stats5 = playing_stats5[cols]
playing_stats6 = playing_stats6[cols]
playing_stats7 = playing_stats7[cols]
playing_stats8 = playing_stats8[cols]
playing_stats9 = playing_stats9[cols]
playing_stats10 = playing_stats10[cols]
playing_stats11 = playing_stats11[cols]
playing_stats12 = playing_stats12[cols]
playing_stats13 = playing_stats13[cols]

In [13]:
standings = pd.read_csv(DATA_PATH + 'league_standings.csv')

def get_league_pos(playing_stat, standings, year):
    HomeTeamLP = []
    AwayTeamLP = []
        
    for i in range(len(playing_stat)): 
        # TODO: Use regex to clean string
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        # If team was promoted set to default pos of 18th
        try:
            LP = standings.loc[(standings['Team'] == ht) & (standings['Season'] == year )].Position.item()
            HomeTeamLP.append(LP)
        except ValueError as e:
            HomeTeamLP.append(18)
        
        try:
            LP = standings.loc[(standings['Team'] == at) & (standings['Season'] == year )].Position.item()
            AwayTeamLP.append(LP)
        except ValueError as e:
            AwayTeamLP.append(18)
       
     
#         HomeTeamLP.append(standings.loc[(standings['Team'].str.contains(ht)) & (standings['Season'] == year )].Position.item())
#         AwayTeamLP.append(standings.loc[(standings['Team'].str.contains(at)) & (standings['Season'] == year )].Position.item())
                     
    playing_stat['HomeTeamLP'] = HomeTeamLP
    playing_stat['AwayTeamLP'] = AwayTeamLP

    return playing_stat

playing_stats1 = get_league_pos(playing_stats1, standings, 6)
playing_stats2 = get_league_pos(playing_stats2, standings, 7)
playing_stats3 = get_league_pos(playing_stats3, standings, 8)
playing_stats4 = get_league_pos(playing_stats4, standings, 9)
playing_stats5 = get_league_pos(playing_stats5, standings, 10)
playing_stats6 = get_league_pos(playing_stats6, standings, 11)
playing_stats7 = get_league_pos(playing_stats7, standings, 12)
playing_stats8 = get_league_pos(playing_stats8, standings, 13)
playing_stats9 = get_league_pos(playing_stats9, standings, 14)
playing_stats10 = get_league_pos(playing_stats10, standings, 15)
playing_stats11 = get_league_pos(playing_stats11, standings, 16)
playing_stats12 = get_league_pos(playing_stats12, standings, 17)
playing_stats13 = get_league_pos(playing_stats13, standings, 17)

  del sys.path[0]


In [14]:
# Concatenate seasons and add gameId so we can process data easier
playing_stats = pd.concat([playing_stats1,
                          playing_stats2,
                          playing_stats3,
                          playing_stats4,
                          playing_stats5,
                          playing_stats6,
                          playing_stats7,
                          playing_stats8,
                          playing_stats9,
                          playing_stats10,
                          playing_stats11,
                          playing_stats12,
                          playing_stats13], ignore_index=True).assign(gameId=lambda df: list(df.index + 1))

In [15]:
playing_stats.head()

Unnamed: 0,Date,season,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTGS,ATGS,HTGC,...,HM5,AM1,AM2,AM3,AM4,AM5,MW,HomeTeamLP,AwayTeamLP,gameId
0,2007-08-11,708,Aston Villa,Liverpool,1.0,2.0,A,0.0,0.0,0.0,...,M,M,M,M,M,M,1,18,3,1
1,2007-08-11,708,Bolton,Newcastle,1.0,3.0,A,0.0,0.0,0.0,...,M,M,M,M,M,M,1,7,13,2
2,2007-08-11,708,Derby,Portsmouth,2.0,2.0,D,0.0,0.0,0.0,...,M,M,M,M,M,M,1,18,9,3
3,2007-08-11,708,Everton,Wigan,2.0,1.0,H,0.0,0.0,0.0,...,M,M,M,M,M,M,1,6,17,4
4,2007-08-11,708,Middlesbrough,Blackburn,1.0,2.0,A,0.0,0.0,0.0,...,M,M,M,M,M,M,1,12,10,5


In [16]:
# Gets the form points.
def get_form_points(string):
    sum = 0
    for letter in string:
        sum += get_points(letter)
    return sum

playing_stats['HTFormPtsStr'] = playing_stats['HM1'] + playing_stats['HM2'] + playing_stats['HM3'] + playing_stats['HM4'] + playing_stats['HM5']
playing_stats['ATFormPtsStr'] = playing_stats['AM1'] + playing_stats['AM2'] + playing_stats['AM3'] + playing_stats['AM4'] + playing_stats['AM5']

playing_stats['HTFormPts'] = playing_stats['HTFormPtsStr'].apply(get_form_points)
playing_stats['ATFormPts'] = playing_stats['ATFormPtsStr'].apply(get_form_points)

In [17]:
# Get doal difference
playing_stats['HTGD'] = playing_stats['HTGS'] - playing_stats['HTGC']
playing_stats['ATGD'] = playing_stats['ATGS'] - playing_stats['ATGC']

# Diff in points
playing_stats['DiffPts'] = playing_stats['HTP'] - playing_stats['ATP']
playing_stats['DiffFormPts'] = playing_stats['HTFormPts'] - playing_stats['ATFormPts']

# Uncomment this line to create unscaled df for predictions
# this_season = playing_stats.loc[playing_stats['season'] == 1920]
# this_season.to_csv(os.path.join(DATA_PATH, 'season1920_data.csv'))

# Diff in last year positions
playing_stats['DiffLP'] = playing_stats['HomeTeamLP'] - playing_stats['AwayTeamLP']

In [19]:
# Scale DiffPts , DiffFormPts, HTGD, ATGD by Matchweek.
cols = ['HTGD','ATGD','DiffPts','DiffFormPts','HTP','ATP']
playing_stats.MW = playing_stats.MW.astype(float)

for col in cols:
    playing_stats[col] = playing_stats[col] / playing_stats.MW

In [20]:
playing_stats = playing_stats.drop(['FTHG', 'FTAG', 'HomeTeamLP', 'AwayTeamLP'], 1)
playing_stats.to_csv(os.path.join(DATA_PATH, 'league_data.csv'))