In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os
import shutil
import unicodedata
import mlbgame
import glob

In [None]:
# Raw game log. The cells below expect it to carry named columns
# such as 'Date', 'HomeTeam' and 'AwayTeamGameNumber'.
GAME_LOG_CSV = 'GL2017.csv'
df = pd.read_csv(GAME_LOG_CSV)

In [None]:
# Columns describing the game itself.
game_info_columns = [
    'Date', 'Day',
    'AwayTeam', 'AwayTeamGameNumber',
    'HomeTeam', 'HomeTeamGameNumber',
    'AwayScore', 'HomeScore',
]

# Per-team box-score fields; each one exists once with an 'Away' prefix
# and once with a 'Home' prefix.
team_stat_suffixes = [
    'AtBats', 'Hits', 'Doubles', 'Triples', 'HomeRuns', 'RBI', 'Walks',
    'IntentionalWalks', 'Strikeouts', 'StolenBases', 'CaughtStealing', 'LeftOnBase',
    'IndividualERA', 'TeamERA', 'Errors', 'StartingPitcher',
]

# Final selection: game info, then all away-team stats, then all home-team stats.
columns = (
    game_info_columns
    + ['Away' + suffix for suffix in team_stat_suffixes]
    + ['Home' + suffix for suffix in team_stat_suffixes]
)

In [None]:
# Keep only the columns of interest, in the order given by `columns`.
df = df.loc[:, columns]
df.head()

In [None]:
# Drop early-season games: keep a game only when it is past game 20
# of the season for BOTH the away team and the home team.
past_early_season = (df.AwayTeamGameNumber > 20) & (df.HomeTeamGameNumber > 20)
training_set = df.loc[past_early_season]

In [None]:
# Sanity check: (rows, columns) left after filtering, then a preview of the first rows.
print(training_set.shape)
training_set.head()

In [None]:
def get_last_n_games(teamName, numOfGames, currentGameNumber, X):
    """Return the games `teamName` played in the window just before `currentGameNumber`.

    A game is selected when `teamName` is either the home or the away team
    (it doesn't matter which) and the team's own game number lies in
    [currentGameNumber - numOfGames, currentGameNumber). The game numbered
    `currentGameNumber` itself is never included.

    Parameters
    ----------
    teamName : value compared against the 'HomeTeam' / 'AwayTeam' columns
    numOfGames : int
        Size of the look-back window, in games.
    currentGameNumber : int
        The team's game number for the game being predicted.
    X : pandas.DataFrame
        The dataset to search. It must still contain the earlier games the
        window should reach; rows filtered out of `X` cannot be returned.
        Needs the columns 'HomeTeam', 'AwayTeam', 'HomeTeamGameNumber' and
        'AwayTeamGameNumber'.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `X`: home games first, then away games.
        Empty when no game falls inside the window.
    """
    first_game_number = currentGameNumber - numOfGames

    def games_in_window(team_column, game_number_column):
        # Build the mask from X itself so the function depends only on its
        # arguments, never on a notebook-level global.
        game_numbers = X[game_number_column]
        in_window = (
            (X[team_column] == teamName)
            & (game_numbers >= first_game_number)
            & (game_numbers < currentGameNumber)
        )
        return X.loc[in_window]

    home_games = games_in_window('HomeTeam', 'HomeTeamGameNumber')
    away_games = games_in_window('AwayTeam', 'AwayTeamGameNumber')

    return pd.concat([home_games, away_games])

In [None]:
def return_rolling_average_of_column(df, team, column):
    """Average one batting statistic for `team` over the games in `df`.

    Only the team's own side of each game is counted: the 'Away...' columns
    for games where it is the away team and the 'Home...' columns for games
    where it is the home team. Rows where `team` is neither side are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Games to average over, e.g. the frame returned by get_last_n_games.
    team : value compared against the 'AwayTeam' / 'HomeTeam' columns
    column : str
        Possible values (BATTING):
        "BA", "Runs", "Walks", "Strikeouts", "LOB", "StolenBases", "OPS".

    Returns
    -------
    float
        * "BA": total hits / total at-bats.
        * "Runs", "Walks", "Strikeouts", "StolenBases", "LOB": team total
          divided by the number of games the team played in `df`.
        * "OPS": on-base percentage + slugging percentage. On-base is
          approximated as (hits + walks) / (at-bats + walks), which uses only
          the hits, walks and at-bats columns.
          NOTE(review): hit-by-pitch and sacrifice flies are left out; add
          them if the frame carries those columns.
        NaN when the team has no games (or no at-bats) in `df`.

    Raises
    ------
    ValueError
        If `column` is not one of the supported statistic names.
    """
    # Statistic name -> column suffix (after the 'Home'/'Away' prefix) for the
    # stats that are reported as a per-game average.
    per_game_suffixes = {
        "Runs": "Score",
        "Walks": "Walks",
        "Strikeouts": "Strikeouts",
        "StolenBases": "StolenBases",
        "LOB": "LeftOnBase",
    }

    away_games = df.loc[df.AwayTeam == team]
    home_games = df.loc[df.HomeTeam == team]

    def team_total(suffix):
        # Sum the team's own numbers across its away and home games.
        return away_games['Away' + suffix].sum() + home_games['Home' + suffix].sum()

    def safe_ratio(numerator, denominator):
        # An empty window has nothing to average; report NaN instead of
        # dividing by zero.
        return numerator / denominator if denominator else np.nan

    if column in per_game_suffixes:
        # Divide by the games this team actually played, not by every row in
        # `df`, so frames containing other teams' games still give a correct
        # per-game figure.
        games_played = len(away_games) + len(home_games)
        return safe_ratio(team_total(per_game_suffixes[column]), games_played)

    if column == "BA":
        return safe_ratio(team_total('Hits'), team_total('AtBats'))

    if column == "OPS":
        hits = team_total('Hits')
        at_bats = team_total('AtBats')
        walks = team_total('Walks')
        doubles = team_total('Doubles')
        triples = team_total('Triples')
        home_runs = team_total('HomeRuns')

        # Slugging: total bases per at-bat.
        singles = hits - (doubles + triples + home_runs)
        total_bases = singles + 2 * doubles + 3 * triples + 4 * home_runs
        slugging = safe_ratio(total_bases, at_bats)

        # On-base percentage, approximated (see docstring).
        on_base = safe_ratio(hits + walks, at_bats + walks)

        return on_base + slugging

    raise ValueError(
        "Unknown column {!r}; expected one of: BA, Runs, Walks, Strikeouts, "
        "LOB, StolenBases, OPS".format(column)
    )

In [None]:
# Spot check: TEX's average runners left on base over the 5 games before its
# 10th game. Those games (numbers 5-9) are among the early-season games that
# were filtered out of `training_set`, so look them up in the full game log `df`.
h = get_last_n_games('TEX', 5, 10, df)
x = return_rolling_average_of_column(h, 'TEX', "LOB")
x

In [None]:
# Look-back window sizes (in games), in the order they appear in the CSV.
WINDOW_SIZES = (20, 10, 5)

# (statistic name passed to return_rolling_average_of_column,
#  suffix used in the output column name)
BATTING_STATS = [
    ("BA", "BattingAvg"),
    ("Runs", "RunsPerGame"),
    ("Walks", "WalksPerGame"),
    ("Strikeouts", "StrikeoutsPerGame"),
    ("StolenBases", "SBPerGame"),
    ("LOB", "LOBPerGame"),
]

# (output prefix, team column, game-number column) for each side of a game.
SIDES = [
    ("home", "HomeTeam", "HomeTeamGameNumber"),
    ("away", "AwayTeam", "AwayTeamGameNumber"),
]


def feature_name(side, window, label):
    # e.g. ('home', 20, 'BattingAvg') -> 'home_20GameAvgBattingAvg'
    return '{}_{}GameAvg{}'.format(side, window, label)


rows_list = []
for i in range(training_set.shape[0]):
    if (i%25==0):
        print (i)  # coarse progress indicator
    current_game = training_set.iloc[i]

    row = {
        'home_team': current_game['HomeTeam'],
        'away_team': current_game['AwayTeam'],
    }

    for side, team_column, game_number_column in SIDES:
        team = current_game[team_column]
        game_number = current_game[game_number_column]

        for window in WINDOW_SIZES:
            # Look back in the full game log `df`: `training_set` has had the
            # early-season games removed, so it cannot supply the history
            # needed for the earliest training rows.
            recent_games = get_last_n_games(team, window, game_number, df)

            for stat, label in BATTING_STATS:
                row[feature_name(side, window, label)] = return_rolling_average_of_column(
                    recent_games, team, stat)

    rows_list.append(row)

# Output column order: team names, then every home feature, then every away
# feature; within a side, grouped by statistic with windows 20, 10, 5.
# Generating the list keeps it in sync with the keys built above (a hand-typed
# list is easy to break with a single missing comma).
feature_columns = ['home_team', 'away_team'] + [
    feature_name(side, window, label)
    for side, _, _ in SIDES
    for _, label in BATTING_STATS
    for window in WINDOW_SIZES
]

f = pd.DataFrame(rows_list)
f.to_csv('test.csv',
         index=False,
         columns=feature_columns)