In [91]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os
import shutil
import unicodedata
import mlbgame
import glob

In [92]:
df = pd.read_csv('GL2017.csv')

In [93]:
columns = ['Date', 'Day', 'AwayTeam', 'AwayTeamGameNumber', 'HomeTeam', 'HomeTeamGameNumber', 'AwayScore', 'HomeScore',
          'AwayAtBats', 'AwayHits', 'AwayDoubles', 'AwayTriples', 'AwayHomeRuns', 'AwayRBI', 'AwayWalks', 
          'AwayIntentionalWalks', 'AwayStrikeouts', 'AwayStolenBases', 'AwayCaughtStealing', 'AwayLeftOnBase',
          'AwayIndividualERA', 'AwayTeamERA', 'AwayErrors', 'AwayStartingPitcher',
          'HomeAtBats', 'HomeHits', 'HomeDoubles', 'HomeTriples', 'HomeHomeRuns', 'HomeRBI', 'HomeWalks', 
          'HomeIntentionalWalks', 'HomeStrikeouts', 'HomeStolenBases', 'HomeCaughtStealing', 'HomeLeftOnBase',
          'HomeIndividualERA', 'HomeTeamERA', 'HomeErrors', 'HomeStartingPitcher']

In [94]:
df = df[columns]
df.head()

Unnamed: 0,Date,Day,AwayTeam,AwayTeamGameNumber,HomeTeam,HomeTeamGameNumber,AwayScore,HomeScore,AwayAtBats,AwayHits,...,HomeWalks,HomeIntentionalWalks,HomeStrikeouts,HomeStolenBases,HomeCaughtStealing,HomeLeftOnBase,HomeIndividualERA,HomeTeamERA,HomeErrors,HomeStartingPitcher
0,20170402,Sun,SFN,1,ARI,1,5,6,34,11,...,0,0,12,0,0,6,5,5,0,Zack Greinke
1,20170402,Sun,CHN,1,SLN,1,3,4,34,8,...,8,1,8,2,0,14,3,3,0,Carlos Martinez
2,20170402,Sun,NYA,1,TBA,1,3,7,36,9,...,3,0,10,1,0,9,2,2,1,Chris Archer
3,20170403,Mon,PHI,1,CIN,1,4,3,35,10,...,2,0,6,0,0,7,4,4,1,Scott Feldman
4,20170403,Mon,SDN,1,LAN,1,3,14,33,5,...,5,2,5,0,0,6,2,2,1,Clayton Kershaw


In [95]:
#removing the first 20 games of the season for each team
training_set = df.loc[df.AwayTeamGameNumber >20]
training_set = training_set.loc[df.HomeTeamGameNumber >20]

In [96]:
training_set.head()

Unnamed: 0,Date,Day,AwayTeam,AwayTeamGameNumber,HomeTeam,HomeTeamGameNumber,AwayScore,HomeScore,AwayAtBats,AwayHits,...,HomeWalks,HomeIntentionalWalks,HomeStrikeouts,HomeStolenBases,HomeCaughtStealing,HomeLeftOnBase,HomeIndividualERA,HomeTeamERA,HomeErrors,HomeStartingPitcher
280,20170424,Mon,SDN,21,ARI,21,6,7,37,10,...,2,0,8,4,0,6,5,5,1,Zack Greinke
289,20170425,Tue,SDN,22,ARI,22,3,9,37,10,...,3,0,8,2,1,9,3,3,0,Patrick Corbin
291,20170425,Tue,CIN,21,MIL,22,1,9,35,8,...,7,1,6,3,1,8,1,1,1,Zach Davies
293,20170425,Tue,LAN,21,SFN,21,2,1,32,7,...,1,0,9,1,0,8,2,2,0,Ty Blach
301,20170426,Wed,SDN,23,ARI,23,8,5,37,12,...,5,0,13,3,1,10,8,8,1,Zack Godley


In [97]:
"""X is the dataset. 
Returns a CSV containing last numOfGames games of team teamName (doesn't whether teamName is home or away).
Doesn't include currentGameNumber
"""
def get_last_n_games(teamName, numOfGames, currentGameNumber, X):
    away_games = df.loc[df.AwayTeam==teamName]
    away_games = away_games.loc[df.AwayTeamGameNumber < currentGameNumber]
    
    home_games = df.loc[df.HomeTeam==teamName]
    home_games = home_games.loc[df.HomeTeamGameNumber < currentGameNumber]
    return pd.concat([home_games, away_games])

In [98]:
"""
Possible values of column:
BATTING: BA, Runs, Walks, Strikeouts, LOB, StolenBases
"""
def return_rolling_average_of_column(df, team, column):
    numer = 0.0
    denom = 0
    away_games = df.loc[df.AwayTeam==team]
    home_games = df.loc[df.HomeTeam==team]
    
    if (column=="BA"): #we want the average batting average for this team
        awayhits = np.sum(away_games['AwayHits'])
        awayatbats = np.sum(away_games['AwayAtBats'])
        homehits = np.sum(home_games['HomeHits'])
        homeatbats = np.sum(home_games['HomeAtBats'])
        return (homehits + awayhits) / (homeatbats + awayatbats)
    
    elif (column=="Runs"):
        awayRuns = np.sum(away_games['AwayScore'])
        homeRuns = np.sum(home_games['HomeScore'])
        return (awayRuns + homeRuns) / df.shape[0]
    
    elif (column=="Walks"):
        awayWalks = np.sum(away_games['AwayWalks'])
        homeWalks = np.sum(home_games['HomeWalks'])
        return (awayWalks + homeWalks) / df.shape[0]
    
    elif (column=="Strikeouts"):
        away = np.sum(away_games['AwayStrikeouts'])
        home = np.sum(home_games['HomeStrikeouts'])
        return (away + home) / df.shape[0]
    
    elif (column=="StolenBases"):
        away = np.sum(away_games['AwayStolenBases'])
        home = np.sum(home_games['HomeStolenBases'])
        return (away + home) / df.shape[0]
    
    elif (column=="LOB"):
        away = np.sum(away_games['AwayLeftOnBase'])
        home = np.sum(home_games['HomeLeftOnBase'])
        return (away + home) / df.shape[0]

In [74]:
h = get_last_n_games('TEX', 5, 10, training_set)
x = return_rolling_average_of_column(h, 'TEX', "LOB")
x

5.444444444444445

In [99]:
rows_list = []
# for i in range(training_set.shape[0]):
for i in range(1):
    current_game = training_set.iloc[i]
    current_home = current_game['HomeTeam']
    current_away = current_game['AwayTeam']
    home_game_number = current_game['HomeTeamGameNumber']
    away_game_number = current_game['AwayTeamGameNumber']
    
    #last 20 games of home team
    last20home = get_last_n_games(current_home, 20, home_game_number, training_set)
    
    #last 20 games of away team
    last20away = get_last_n_games(current_away, 20, away_game_number, training_set)

    rows_list.append({
            'home_team': current_home,
            'away_team': current_away,
            'home_20GameRollingAvgBattingAvg': return_rolling_average_of_column(last20home, current_home, "BA"),
            'away_20GameRollingAvgBattingAvg': return_rolling_average_of_column(last20away, current_away, "BA"),
            'home_20GameRollingAvgRunsPerGame': return_rolling_average_of_column(last20home, current_home, "Runs"),
            'away_20GameRollingAvgRunsPerGame': return_rolling_average_of_column(last20away, current_away, "Runs"),
            'home_20GameRollingAvgWalksPerGame': return_rolling_average_of_column(last20home, current_home, "Walks"),
            'away_20GameRollingAvgWalksPerGame': return_rolling_average_of_column(last20away, current_away, "Walks"),
            'home_20GameRollingAvgStrikeoutsPerGame': return_rolling_average_of_column(last20home, current_home, "Strikeouts"),
            'away_20GameRollingAvgStrikeoutsPerGame': return_rolling_average_of_column(last20away, current_away, "Strikeouts"),
            'home_20GameRollingAvgSBPerGame': return_rolling_average_of_column(last20home, current_home, "StolenBases"),
            'away_20GameRollingAvgSBPerGame': return_rolling_average_of_column(last20away, current_away, "StolenBases"),
            'home_20GameRollingAvgLOBPerGame': return_rolling_average_of_column(last20home, current_home, "LOB"),
            'away_20GameRollingAvgLOBPerGame': return_rolling_average_of_column(last20away, current_away, "LOB"),
        })
    

In [100]:
f = pd.DataFrame(rows_list)
f

Unnamed: 0,away_20GameRollingAvgBattingAvg,away_20GameRollingAvgLOBPerGame,away_20GameRollingAvgRunsPerGame,away_20GameRollingAvgSBPerGame,away_20GameRollingAvgStrikeoutsPerGame,away_20GameRollingAvgWalksPerGame,away_team,home_20GameRollingAvgBattingAvg,home_20GameRollingAvgLOBPerGame,home_20GameRollingAvgRunsPerGame,home_20GameRollingAvgSBPerGame,home_20GameRollingAvgStrikeoutsPerGame,home_20GameRollingAvgWalksPerGame,home_team
0,0.21169,5.65,3.15,0.35,8.35,2.8,SDN,0.265217,6.8,5.25,0.8,9.4,3.1,ARI
