In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read all the data files
data = []
for year in range(2003, 2020):
    file = pd.read_csv(f'Data/GameStats/PLGames{year}.csv', error_bad_lines = False, encoding = "ISO-8859-1")
    data.append(file)

In [3]:
features = {'HomeGoals': 0, 'AwayGoals': 0, 'Games': 0,
            'Wins': 0, 'Losses': 0, 'Draws': 0,
            'HalfTimeHomeGoals': 0, 'HalfTimeAwayGoals': 0,
            'HalfTimeWins' : 0, 'HalfTimeLosses' : 0, 
            'HalfTimeDraws': 0,
            'HalfTimeChoke': 0, # They were winning at halftime but lost the game
            'HalfTimeClutch': 0, # They were losing at halftime but won the game
            # 'AVGCrowd': 0, 
            'AVGHomeShots': 0, 'AVGAwayShots': 0,
            'AVGHomeShotsTarget': 0, 'AVGAwayShotsTarget': 0,
            #'AVGHomeWoodwork': 0, 'AVGAwayWoodwork': 0, 
            'AVGHomeCorners': 0, 'AVGAwayCorners': 0,
            'AVGHomeFouls': 0, 'AVGAwayFouls': 0,
            #'AVGHomeFreeKicks': 0, 'AVGAwayFreeKicks': 0,
            #'AVGHomeOffsides': 0, 'AVGAwayOffsides': 0,
            'AVGHomeYellow': 0, 'AVGAwayYellow': 0,
            'AVGHomeRed': 0, 'AVGAwayRed': 0,
            'WonAtLeast2': 0, 'WonAtLeast3': 0
           }

In [4]:
AVGFeatures = ['AVGHomeShots', 'AVGAwayShots',
               'AVGHomeShotsTarget', 'AVGAwayShotsTarget',
               #'AVGHomeWoodwork', 'AVGAwayWoodwork', 
               'AVGHomeCorners', 'AVGAwayCorners',
               'AVGHomeFouls', 'AVGAwayFouls',
               #'AVGHomeFreeKicks', 'AVGAwayFreeKicks',
               #'AVGHomeOffsides', 'AVGAwayOffsides',
               'AVGHomeYellow', 'AVGAwayYellow',
               'AVGHomeRed', 'AVGAwayRed']

In [5]:
def HomeTeamStats(Season:{'team':{'str': int}}, row):
    # Add Home Team Stats
    Season[row['HomeTeam']]['HomeGoals'] += row['FTHG'] + 0
    Season[row['HomeTeam']]['HalfTimeHomeGoals'] += row['HTHG']
    Season[row['HomeTeam']]['AVGHomeShots'] += row['HS']
    Season[row['HomeTeam']]['AVGHomeShotsTarget'] += row['HST']
    #Season[row['HomeTeam']]['AVGHomeWoodwork'] += row['HHW']
    Season[row['HomeTeam']]['AVGHomeCorners'] += row['HC']
    Season[row['HomeTeam']]['AVGHomeFouls'] += row['HF']
    #Season[row['HomeTeam']]['AVGHomeFreeKicks'] += row['HFKC']
    #Season[row['HomeTeam']]['AVGHomeOffsides'] += row['HO']
    Season[row['HomeTeam']]['AVGHomeYellow'] += row['HY']
    Season[row['HomeTeam']]['AVGHomeRed'] += row['HR']

In [6]:
def AwayTeamStats(Season: {'team':{'str': int}}, row):
    # Add Away Team Stats
    Season[row['AwayTeam']]['AwayGoals'] += row['FTAG']
    Season[row['AwayTeam']]['HalfTimeAwayGoals'] += row['HTAG']
    Season[row['AwayTeam']]['AVGAwayShots'] += row['AS']
    Season[row['AwayTeam']]['AVGAwayShotsTarget'] += row['AST']
    #Season[row['AwayTeam']]['AVGAwayWoodwork'] += row['AHW']
    Season[row['AwayTeam']]['AVGAwayCorners'] += row['AC']
    Season[row['AwayTeam']]['AVGAwayFouls'] += row['AF']
    #Season[row['AwayTeam']]['AVGAwayFreeKicks'] += row['AFKC']
    #Season[row['AwayTeam']]['AVGAwayOffsides'] += row['AO']
    Season[row['AwayTeam']]['AVGAwayYellow'] += row['AY']
    Season[row['AwayTeam']]['AVGAwayRed'] += row['AR']

In [7]:
def GameWinner(Season: {'team':{'str': int}}, row):
    # Determine Game Winner, Loser, and if there was a draw
    # Also see if a team won by at least 2 goals
    Season[row['HomeTeam']]['Games'] += 1
    Season[row['AwayTeam']]['Games'] += 1
    if row['FTR'] == 'H':
        Season[row['HomeTeam']]['Wins'] += 1
        Season[row['AwayTeam']]['Losses'] += 1
        if (row['FTHG'] - row['FTAG']) >= 2:
            Season[row['HomeTeam']]['WonAtLeast2'] += 1
        if (row['FTHG'] - row['FTAG']) >= 3:
            Season[row['HomeTeam']]['WonAtLeast3'] += 1
            
    elif row['FTR'] == 'A':
        Season[row['HomeTeam']]['Losses'] += 1
        Season[row['AwayTeam']]['Wins'] += 1
        if (row['FTAG'] - row['FTHG']) >= 2:
            Season[row['AwayTeam']]['WonAtLeast2'] += 1
        if (row['FTAG'] - row['FTHG']) >= 3:
            Season[row['AwayTeam']]['WonAtLeast3'] += 1
    else:
        # Tie
        Season[row['HomeTeam']]['Draws'] += 1
        Season[row['AwayTeam']]['Draws'] += 1

In [8]:
def HalfTimeWinner(Season: {'team':{'str': int}}, row):
    # Determine HalfTime Winner
    if row['HTR'] == 'H':
        Season[row['HomeTeam']]['HalfTimeWins'] += 1
        Season[row['AwayTeam']]['HalfTimeLosses'] += 1
    elif row['HTR'] == 'A':
        Season[row['HomeTeam']]['HalfTimeLosses'] += 1
        Season[row['AwayTeam']]['HalfTimeWins'] += 1
    else:
        # Draw
        Season[row['HomeTeam']]['HalfTimeDraws'] += 1
        Season[row['AwayTeam']]['HalfTimeDraws'] += 1

In [9]:
def HalfTimeChoke(Season: {'team':{'str': int}}, row):
    # Was the time winning by halftime and then lost the game
    # or did the reverse happen?
    if (row['HTR'] == 'H') and (row['FTR'] == 'A'):
        Season[row['HomeTeam']]['HalfTimeChoke'] += 1
        Season[row['HomeTeam']]['HalfTimeClutch'] += 1
    elif (row['HTR'] == 'A') and (row['FTR'] == 'H'):
        Season[row['HomeTeam']]['HalfTimeClutch'] += 1
        Season[row['HomeTeam']]['HalfTimeChoke'] += 1

In [10]:
def findAverage(Season: {'team':{'str': int}}, AVGFeatures: ['str']):
    # Find the average for each feature
    for AVGf in AVGFeatures:
        for team, v in Season.items():
            Season[team][AVGf] /= Season[team]['Games']

In [11]:
def appendDataFrame(Season: {'team':{'str': int}}, AllSeasonsdf, year: int):
    for team, v in Season.items():
        v['Season'] = f'{year}-{year + 1}'
        v['Team'] = team
        AllSeasonsdf = AllSeasonsdf.append(v, ignore_index = True)
    return AllSeasonsdf

In [12]:
AllSeasonsdf = pd.DataFrame()
year = 2003
for dfSeason in data:
    Season = dict()
    for index, row in dfSeason.iterrows():
        # New Team for that season
        if row['HomeTeam'] not in Season:
            Season[row['HomeTeam']] = dict(features)
        if row['AwayTeam'] not in Season:
            Season[row['AwayTeam']] = dict(features)
        
        HomeTeamStats(Season, row)
        AwayTeamStats(Season, row)
        GameWinner(Season, row)
        HalfTimeWinner(Season, row)
        HalfTimeChoke(Season, row)
    
    findAverage(Season, AVGFeatures)
    AllSeasonsdf = appendDataFrame(Season, AllSeasonsdf, year)
    year += 1

In [13]:
print(AllSeasonsdf[:4])

   AVGAwayCorners  AVGAwayFouls  AVGAwayRed  AVGAwayShots  AVGAwayShotsTarget  \
0        2.210526      6.921053    0.052632      5.552632            3.210526   
1        2.894737      6.210526    0.052632      4.421053            2.052632   
2        1.947368      6.684211    0.052632      3.763158            1.842105   
3        2.500000      8.184211    0.026316      4.500000            2.421053   

   AVGAwayYellow  AVGHomeCorners  AVGHomeFouls  AVGHomeRed  AVGHomeShots  ...  \
0       0.763158        3.105263      6.210526    0.026316      6.842105  ...   
1       0.684211        3.473684      5.842105    0.026316      6.894737  ...   
2       0.552632        2.921053      6.605263    0.078947      4.868421  ...   
3       1.026316        2.710526      6.657895    0.000000      6.578947  ...   

   HalfTimeHomeGoals  HalfTimeLosses  HalfTimeWins  HomeGoals  Losses  \
0               19.0             5.0          18.0       40.0     0.0   
1               12.0            15.0      

In [14]:
AllSeasonsdf = AllSeasonsdf[['Season', 'Team', 
                             'Wins', 'Losses', 'Draws', 'Games',
                             'HomeGoals', 'AwayGoals',
                             'WonAtLeast2', 'WonAtLeast3',
                             'HalfTimeWins', 'HalfTimeLosses', 'HalfTimeDraws',
                             'HalfTimeHomeGoals', 'HalfTimeAwayGoals', 
                             'HalfTimeClutch', 'HalfTimeChoke',
                             'AVGHomeShots', 'AVGAwayShots',
                             'AVGHomeShotsTarget', 'AVGAwayShotsTarget',
                             'AVGHomeYellow', 'AVGAwayYellow',
                             'AVGHomeRed', 'AVGAwayRed', 
                             'AVGHomeFouls', 'AVGAwayFouls',
                             'AVGHomeCorners', 'AVGAwayCorners']]

In [15]:
AllSeasonsdf.to_csv(f'Data/CombinedTeamData.csv', index=False)