In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime as dt
import itertools
from pathlib import Path

%matplotlib inline

In [14]:
path='.\Datasets\\'
file = 'Bundesliga_results.csv'
df = pd.read_csv(path + file, sep=',')
numteams=18
nummatch = 34
inummatch=35
totalmatch=int(nummatch*numteams/2)
weekmatch = 9

num_seasons = len(df) / (nummatch * weekmatch)
num_seasons = int(num_seasons)

df['Season'] = np.repeat([_ for _ in range(1, num_seasons+1)], weekmatch*nummatch)

dfs = []

teams = {}

cols= ['Date','HomeTeam','AwayTeam','FTHG','FTAG','FTR'] 
df = df[cols] 

for season in range(len(dfs)):
    df=dfs[Season]
    numteams=0
    for i in df.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
        numteams=numteams + 1
    nummatch = (numteams-1)*2
    inummatch= nummatch + 1
    totalmatch= int(nummatch*numteams/2)
    weekmatch=int(totalmatch/nummatch)
   

    df = get_gss(df, numteams, nummatch, inummatch, totalmatch)
    df = get_agg_points(df, numteams, nummatch, inummatch, totalmatch)
    df =add_form_df(df, numteams, nummatch, inummatch, totalmatch)
    dfs[season]=df


print(df)

           Date       HomeTeam        AwayTeam  FTHG  FTAG FTR
0      7/8/1993  Bayern Munich        Freiburg     3     1   H
1      7/8/1993       Dortmund       Karlsruhe     2     1   H
2      7/8/1993       Duisburg      Leverkusen     2     2   D
3      7/8/1993        FC Koln  Kaiserslautern     0     2   A
4      7/8/1993        Hamburg        Nurnberg     5     2   H
...         ...            ...             ...   ...   ...  ..
7645  12/5/2018     Hoffenheim        Dortmund     3     1   H
7646  12/5/2018     Leverkusen        Hannover     3     2   H
7647  12/5/2018          Mainz   Werder Bremen     1     2   A
7648  12/5/2018     Schalke 04   Ein Frankfurt     1     0   H
7649  12/5/2018      Wolfsburg         FC Koln     4     1   H

[7650 rows x 6 columns]


In [3]:
# Gets the goals scored agg arranged by teams and matchweek
def get_goals_scored(playing_stat, numteams, nummatch, inummatch, totalmatch):
    # Create a dictionary with team names as keys
    teams = {}
  
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []

    # the value corresponding to keys is a list containing the match location.
    for i in range(len(playing_stat)):
        HTGS = playing_stat.iloc[i]['FTHG']
        ATGS = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGS)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGS)  
    # Create a dataframe for goals scored where rows are teams and cols are mge(atchweek.
    GoalsScored = pd.DataFrame(data=teams, index = [i for i in range(1,inummatch )]).T
    GoalsScored[0] = 0
    # Aggregate to get uptil that point
    for i in range(1,inummatch ):
        GoalsScored[i] = GoalsScored[i] + GoalsScored[i-1]
    return GoalsScored



# Gets the goals conceded agg arranged by teams and matchweek
def get_goals_conceded(playing_stat, numteams, nummatch, inummatch, totalmatch):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []

    # the value corresponding to keys is a list containing the match location.
    for i in range(len(playing_stat)):
        ATGC = playing_stat.iloc[i]['FTHG']
        HTGC = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGC)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGC)
    
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    GoalsConceded = pd.DataFrame(data=teams, index = [i for i in range(1,inummatch)]).T
    GoalsConceded[0] = 0
    # Aggregate to get uptil that point
    for i in range(1,inummatch):
        GoalsConceded[i] = GoalsConceded[i] + GoalsConceded[i-1]
    return GoalsConceded

def get_gss(playing_stat, numteams, nummatch, inummatch, totalmatch):

    GC = get_goals_conceded(playing_stat, numteams, nummatch, inummatch, totalmatch)
    GS = get_goals_scored(playing_stat, numteams, nummatch, inummatch, totalmatch)
   
    j = 0
    HTGS = []
    ATGS = []
    HTGC = []
    ATGC = []

    for i in range(totalmatch):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTGS.append(GS.loc[ht][j])
        ATGS.append(GS.loc[at][j])
        HTGC.append(GC.loc[ht][j])
        ATGC.append(GC.loc[at][j])
        
        if ((i + 1)% 10) == 0:
            j = j + 1
        
    playing_stat['HTGS'] = HTGS
    playing_stat['ATGS'] = ATGS
    playing_stat['HTGC'] = HTGC
    playing_stat['ATGC'] = ATGC
    
    return playing_stat
    #playing_statistics = get_gss(playing_statistics, numteams, nummatch, inummatch, totalmatch)
   # print(playing_statistics)


In [4]:
def get_points(result):
    if result == 'W':
        return 3
    elif result == 'D':
        return 1
    else:
        return 0
    

def get_cuml_points(matchres, numteams, nummatch, inummatch, totalmatch):  

    matchres_points = matchres.applymap(get_points)
    for i in range(2,inummatch):
        matchres_points[i] = matchres_points[i] + matchres_points[i-1]
        
    matchres_points.insert(column =0, loc = 0, value = [0*i for i in range(numteams)])
    return matchres_points


def get_matchres(playing_stat, numteams, nummatch, inummatch, totalmatch):
    # Create a dictionary with team names as keys
    teams = {}
    
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
   

    # the value corresponding to keys is a list containing the match result
    for i in range(len(playing_stat)):
        if playing_stat.iloc[i].FTR == 'H':
            teams[playing_stat.iloc[i].HomeTeam].append('W')
            teams[playing_stat.iloc[i].AwayTeam].append('L')
        elif playing_stat.iloc[i].FTR == 'A':
            teams[playing_stat.iloc[i].AwayTeam].append('W')
            teams[playing_stat.iloc[i].HomeTeam].append('L')
        else:
            teams[playing_stat.iloc[i].AwayTeam].append('D')
            teams[playing_stat.iloc[i].HomeTeam].append('D')
            
    return pd.DataFrame(data=teams, index = [i for i in range(1,inummatch)]).T

def get_agg_points(playing_stat, numteams, nummatch, inummatch, totalmatch):

    matchres = get_matchres(playing_stat, numteams, nummatch, inummatch, totalmatch)
    cum_pts = get_cuml_points(matchres, numteams, nummatch, inummatch, totalmatch)
    HTP = []
    ATP = []
    j = 0
    for i in range(totalmatch):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTP.append(cum_pts.loc[ht][j])
        ATP.append(cum_pts.loc[at][j])

        if ((i + 1)% 10) == 0:
            j = j + 1
            
    playing_stat['HTP'] = HTP
    playing_stat['ATP'] = ATP
    return playing_stat
  

In [5]:
def get_form(playing_stat,num):
    form = get_matchres(playing_stat, numteams, nummatch, inummatch, totalmatch)
    form_final = form.copy()
    for i in range(num,inummatch):
        form_final[i] = ''
        j = 0
        while j < num:
            form_final[i] += form[i-j]
            j += 1           
    return form_final

def add_form(playing_stat,num, numteams, nummatch, inummatch, totalmatch):
    form = get_form(playing_stat,num)
    h = ['M' for i in range(num * 9)]  # since form is not available for n MW (n*10)
    a = ['M' for i in range(num * 90)]
    
    j = num
    for i in range((num*9),totalmatch):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        
        past = form.loc[ht][j]               # get past n results
        h.append(past[num-1])                    # 0 index is most recent
        
        past = form.loc[at][j]               # get past n results.
        a.append(past[num-1])                   # 0 index is most recent
        
        if ((i + 1)% 9) == 0:
            j = j + 1

    playing_stat['HM' + str(num)] = h                 
    playing_stat['AM' + str(num)] = a

    
    return playing_stat

def add_form_df(playing_statistics, numteams, nummatch, inummatch, totalmatch):
    playing_statistics = add_form(playing_statistics,1, numteams, nummatch, inummatch, totalmatch)
    playing_statistics = add_form(playing_statistics,2, numteams, nummatch, inummatch, totalmatch)
    playing_statistics = add_form(playing_statistics,3, numteams, nummatch, inummatch, totalmatch)
    playing_statistics = add_form(playing_statistics,4, numteams, nummatch, inummatch, totalmatch)
    playing_statistics = add_form(playing_statistics,5, numteams, nummatch, inummatch, totalmatch)
    return playing_statistics  
# Make changes to df

In [6]:
def get_mw(playing_stat, numteams, nummatch, inummatch, totalmatch):
    j = 1
    MatchWeek = []
    for i in range(totalmatch):
        MatchWeek.append(j)
        if ((i + 1)% 9) == 0:
            j = j + 1
    playing_stat['MW'] = MatchWeek
    return playing_stat
    playing_statistics = get_mw(playing_statistics)
  

In [7]:
# Rearranging columns
cols = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HTP', 'ATP', 'HM1', 'HM2', 'HM3',
        'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5' ]

playing_statistics = playing_statistics[cols]

NameError: name 'playing_statistics' is not defined

In [31]:
playing_stat=df
# Gets the form points.
def get_form_points(string):
    sum = 0
    for letter in string:
        sum += get_points(letter)
    return sum

playing_stat['HTFormPtsStr'] = playing_stat['HM1'] + playing_stat['HM2'] + playing_stat['HM3'] + playing_stat['HM4'] + playing_stat['HM5']
playing_stat['ATFormPtsStr'] = playing_stat['AM1'] + playing_stat['AM2'] + playing_stat['AM3'] + playing_stat['AM4'] + playing_stat['AM5']

playing_stat['HTFormPts'] = playing_stat['HTFormPtsStr'].apply(get_form_points)
playing_stat['ATFormPts'] = playing_stat['ATFormPtsStr'].apply(get_form_points)

# Identify Win/Loss Streaks if any.
def get_3game_ws(string):
    if string[-3:] == 'WWW':
        return 1
    else:
        return 0
    
def get_5game_ws(string):
    if string == 'WWWWW':
        return 1
    else:
        return 0
    
def get_3game_ls(string):
    if string[-3:] == 'LLL':
        return 1
    else:
        return 0
    
def get_5game_ls(string):
    if string == 'LLLLL':
        return 1
    else:
        return 0
    
playing_stat['HTWinStreak3'] = playing_stat['HTFormPtsStr'].apply(get_3game_ws)
playing_stat['HTWinStreak5'] = playing_stat['HTFormPtsStr'].apply(get_5game_ws)
playing_stat['HTLossStreak3'] = playing_stat['HTFormPtsStr'].apply(get_3game_ls)
playing_stat['HTLossStreak5'] = playing_stat['HTFormPtsStr'].apply(get_5game_ls)

playing_stat['ATWinStreak3'] = playing_stat['ATFormPtsStr'].apply(get_3game_ws)
playing_stat['ATWinStreak5'] = playing_stat['ATFormPtsStr'].apply(get_5game_ws)
playing_stat['ATLossStreak3'] = playing_stat['ATFormPtsStr'].apply(get_3game_ls)
playing_stat['ATLossStreak5'] = playing_stat['ATFormPtsStr'].apply(get_5game_ls)

playing_stat.keys()

KeyError: 'HM1'

In [18]:
# Get Goal Difference
playing_stat['HTGD'] = playing_stat['HTGS'] - playing_stat['HTGC']
playing_stat['ATGD'] = playing_stat['ATGS'] - playing_stat['ATGC']

# Diff in points
playing_stat['DiffPts'] = playing_stat['HTP'] - playing_stat['ATP']
playing_stat['DiffFormPts'] = playing_stat['HTFormPts'] - playing_stat['ATFormPts']

KeyError: 'HTGS'