In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime as dt
import itertools
from pathlib import Path

%matplotlib inline

In [13]:
path='.\Datasets\\'
file = 'Ligue1_Championship.csv'
df = pd.read_csv(path + file, sep=',')
print(df)
num_seasons = df.Season.nunique()

mapping_s_to_i = {s:i for i, s in enumerate(df.Season.unique())}

df['Season'] = df.Season.map(mapping_s_to_i)

dfs = []

teams = {}
for i in df.groupby('HomeTeam').mean().T.columns:
        teams[i] = []

for idx, gb in df.groupby('Season'):
    numteams=0
    for i in gb.groupby('HomeTeam').mean().T.columns:
        numteams = numteams + 1
    nummatch = (numteams-1)*2
    inummatch = nummatch + 1
    totalmatch = int(nummatch*numteams/2)
    weekmatch = int(numteams/2)

   
    _df = gb.copy()
    _df = get_gss(_df, numteams, nummatch, inummatch, totalmatch)
    _df = get_agg_points(_df, numteams, nummatch, inummatch, totalmatch)
    _df = add_form_df(_df, numteams, nummatch, inummatch, totalmatch)

    dfs.append(_df)
   
df = pd.concat(dfs)

       Season   HomeTeam     AwayTeam  FTHG  FTAG  Score       Winner  \
0     1999/00   Bordeaux       Bastia     3     2      5     Bordeaux   
1     1999/00     Monaco   St Etienne     2     2      4         Draw   
2     1999/00    Auxerre        Nancy     2     1      3      Auxerre   
3     1999/00       Lyon  Montpellier     1     2      3  Montpellier   
4     1999/00  Marseille        Sedan     3     0      3    Marseille   
...       ...        ...          ...   ...   ...    ...          ...   
7373  2018/19     Nantes   Strasbourg     0     1      1   Strasbourg   
7374  2018/19       Nice       Monaco     2     0      2         Nice   
7375  2018/19      Nimes         Lyon     2     3      5         Lyon   
7376  2018/19      Reims     Paris SG     3     1      4        Reims   
7377  2018/19     Rennes        Lille     3     1      4       Rennes   

         Loser  
0       Bastia  
1         Draw  
2        Nancy  
3         Lyon  
4        Sedan  
...        ...  
7373

ValueError: Shape of passed values is (2, 18), indices imply (34, 18)

In [11]:
# Gets the goals scored agg arranged by teams and matchweek
def get_goals_scored(playing_stat, numteams, nummatch, inummatch, totalmatch):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
    # the value corresponding to keys is a list containing the match location.
    for i in range(numteams):
        HTGS = playing_stat.iloc[i]['FTHG']
        ATGS = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGS)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGS)
    
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    GoalsScored = pd.DataFrame(data=teams, index = [i for i in range(1,nummatch)]).T
    GoalsScored[0] = 0
    # Aggregate to get uptil that point
    for i in range(2,inummatch):
        GoalsScored[i] = GoalsScored[i] + GoalsScored[i-1]
    return GoalsScored



# Gets the goals conceded agg arranged by teams and matchweek
def get_goals_conceded(playing_stat, numteams, nummatch, inummatch, totalmatch):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
    # the value corresponding to keys is a list containing the match location.
    for i in range(numteams):
        ATGC = playing_stat.iloc[i]['FTHG']
        HTGC = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGC)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGC)
    
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    GoalsConceded = pd.DataFrame(data=teams, index = [i for i in range(1,inummatch)]).T
    GoalsConceded[0] = 0
    # Aggregate to get uptil that point
    for i in range(2,inummatch):
        GoalsConceded[i] = GoalsConceded[i] + GoalsConceded[i-1]
    return GoalsConceded

def get_gss(playing_stat, numteams, nummatch, inummatch, totalmatch):
    GC = get_goals_conceded(playing_stat, numteams, nummatch, inummatch, totalmatch)
    GS = get_goals_scored(playing_stat, numteams, nummatch, inummatch, totalmatch)
   
    j = 0
    HTGS = []
    ATGS = []
    HTGC = []
    ATGC = []

    for i in range(totalmatch):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTGS.append(GS.loc[ht][j])
        ATGS.append(GS.loc[at][j])
        HTGC.append(GC.loc[ht][j])
        ATGC.append(GC.loc[at][j])
        
        if ((i + 1)% 10) == 0:
            j = j + 1
        
    playing_stat['HTGS'] = HTGS
    playing_stat['ATGS'] = ATGS
    playing_stat['HTGC'] = HTGC
    playing_stat['ATGC'] = ATGC
    
    return playing_stat


# Apply to each dataset
#playing_statistics = get_gss(playing_statistics, numteams, nummatch, inummatch, totalmatch)

In [6]:
def get_points(result):
    if result == 'W':
        return 3
    elif result == 'D':
        return 1
    else:
        return 0
    

def get_cuml_points(matchres, numteams, nummatch, inummatch, totalmatch):  

    matchres_points = matchres.applymap(get_points)
    for i in range(2,inummatch):
        matchres_points[i] = matchres_points[i] + matchres_points[i-1]
        
    matchres_points.insert(column =0, loc = 0, value = [0*i for i in range(numteams)])
    return matchres_points


def get_matchres(playing_stat, numteams, nummatch, inummatch, totalmatch):
    # Create a dictionary with team names as keys
    teams = {}
    
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
   

    # the value corresponding to keys is a list containing the match result
    for i in range(len(playing_stat)):
        if playing_stat.iloc[i].FTR == 'H':
            teams[playing_stat.iloc[i].HomeTeam].append('W')
            teams[playing_stat.iloc[i].AwayTeam].append('L')
        elif playing_stat.iloc[i].FTR == 'A':
            teams[playing_stat.iloc[i].AwayTeam].append('W')
            teams[playing_stat.iloc[i].HomeTeam].append('L')
        else:
            teams[playing_stat.iloc[i].AwayTeam].append('D')
            teams[playing_stat.iloc[i].HomeTeam].append('D')
            
    return pd.DataFrame(data=teams, index = [i for i in range(1,inummatch)]).T

def get_agg_points(playing_stat, numteams, nummatch, inummatch, totalmatch):

    matchres = get_matchres(playing_stat)
    cum_pts = get_cuml_points(matchres)
    HTP = []
    ATP = []
    j = 0
    for i in range(totalmatch):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTP.append(cum_pts.loc[ht][j])
        ATP.append(cum_pts.loc[at][j])

        if ((i + 1)% 10) == 0:
            j = j + 1
            
    playing_stat['HTP'] = HTP
    playing_stat['ATP'] = ATP
    return playing_stat
    playing_statistics = get_agg_points(playing_statistics)
    print(playing_statistics, numteams, nummatch, inummatch, totalmatch)

In [7]:
def get_form(playing_stat,num):

    for i in range(num,inummatch):
        form_final[i] = ''
        j = 0
        while j < num:
            form_final[i] += form[i-j]
            j += 1           
    return form_final

def add_form(playing_stat,num, numteams, nummatch, inummatch, totalmatch):
    form = get_form(playing_stat,num)
    h = ['M' for i in range(num * 10)]  # since form is not available for n MW (n*10)
    a = ['M' for i in range(num * 10)]
    
    j = num
    for i in range((num*10),totalmatch):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        
        past = form.loc[ht][j]               # get past n results
        h.append(past[num-1])                    # 0 index is most recent
        
        past = form.loc[at][j]               # get past n results.
        a.append(past[num-1])                   # 0 index is most recent
        
        if ((i + 1)% 10) == 0:
            j = j + 1

    playing_stat['HM' + str(num)] = h                 
    playing_stat['AM' + str(num)] = a

    
    return playing_stat

def add_form_df(playing_statistics, numteams, nummatch, inummatch, totalmatch):
    playing_statistics = add_form(playing_statistics,1)
    playing_statistics = add_form(playing_statistics,2)
    playing_statistics = add_form(playing_statistics,3)
    playing_statistics = add_form(playing_statistics,4)
    playing_statistics = add_form(playing_statistics,5)
    return playing_statistics  
# Make changes to df
#playing_statistics =add_form_df(playing_statistics)
#print(playing_statistics, numteams, nummatch, inummatch, totalmatch)