# Baseball Analytics
Some questions I want to answer:
1. Does handedness really matter when it comes to pitching/batting? Do lefties really do better against right-handed batters?
2. Can you predict a batters OBP?
3. Does the number of outs affect a batters performance? 

In [26]:
import pandas as pd
import numpy as np
import re
import os

## What to do with raw data?
I think the best way to go about this is to create a games, players and play by play dataset. The *games* dataset will have
1. a unique ID
2. Team info, away, home, etc.
3. Weather info
4. All fields with `info` tag in the txt file.

The *players* dataset will have
1. player ID's
2. team ID's (coupled with years for trades?)
3. position
4. handedness
5. any other demos I can find

The *play-by-play* data will have all information pertaining to individual plays in games!
1. GameID
2. PlayID
3. PlayerID's (all players who were involved)
4. pitch sequence
5. other events?

Check out retrosheet's [detailed descriptions](https://www.retrosheet.org/eventfile.htm) of the play-by-play files. 


In [27]:
# Here we're just going to read in our data line by line. For now we are just dealing with 2010 data
folderpath = r'Data/Retrosheet/2010-2019/2010'
filepaths = [os.path.join(folderpath, name) for name in os.listdir(folderpath) if "ROS" not in name]
data = []

for path in filepaths:
    with open(path, 'r') as f:
        file = f.readlines()
        data = data + file

In [28]:
# here I'm just grabbing all of the info classes so I don't have to type them out myself
# NOTE: There could be more classes than this; we can improve this later if we want
infoClassList = ['gameID']

for line in data:
    flag = re.search(r'([a-z]*),', line).group(1)
    if (flag == 'info'):
        infoObj = re.search(r'info,([A-Za-z\d]*),(.*)\n', line)
        infoClass = infoObj.group(1)
        if infoClass not in infoClassList:
            infoClassList.append(infoClass)
            
infoClassList.append('starters')

# we need a dictionary to append to the games dataframe
appDict = {}
for el in infoClassList:
    appDict[el]=None

In [29]:
startersList = []
gameID = ''
homePitcher = ''
awayPitcher = ''
switch = False # this will be used to denote when batters switch hit
# I deal with dictionaries here because it is WAY faster than appending to pandas dataframes. 
# Seriously, orders of magnitude faster
playsDict = {}
gamesDict = {}
gIdx = 0
pIdx = 0
# let's just parse through each line and get what we need.
for line in data:
    flag = re.search(r'([a-z]*),(.*)\n', line).group(1)
    if flag == 'id':
        if appDict['gameID'] is not None:
            # this prevents us from writing Null on the first pass through
            appDict['starters'] = startersList
            gamesDict[gIdx] = appDict
            # need to reset these quantities now that we have a new game
            startersList = []
            gameID = ''
            homePitcher = ''
            awayPitcher = ''
            
            gIdx += 1
            
        gameID = re.search(r'id,([A-Z\d]*)\n', line).group(1)
        appDict['gameID'] = gameID
        
    elif flag == 'info': 
        # this is information about the game: time, weather, etc.
        infoObj = re.search(r'info,([\w]*),(.*)\n', line)
        infoClass = infoObj.group(1)
        infoVal = infoObj.group(2)
        appDict[infoClass] = infoVal
        
    elif flag == 'start':
        # This is information about the starters of the game. 
        startObj = re.search(r'start,([\w-]*),"(.*)",([\d]*),([\d]*),([\d]*)\n', line)
        playerID = startObj.group(1)
        homeAway = int(startObj.group(3))
        position = int(startObj.group(5))
        #print('ID', playerID,'homeAway', homeAway,'pos', position)
        if position == 1:
            if homeAway == 0:
                homePitcher = playerID
            else:
                awayPitcher = playerID
        # not using these for now
        # playerName = startObj.group(2)
        # battingPosition = startObj.grostartersList = group(4)
        startersList.append([playerID, int(homeAway), int(position)])
        
    elif flag == 'play':
        playObj = re.search(r'play,([\d]*),([\d]*),([\w-]*),(.*),(.*),(.*)\n', line)
        inning = int(playObj.group(1))
        homeAway = int(playObj.group(2))
        playerID = playObj.group(3)
        count = playObj.group(4)
        pitches = playObj.group(5)
        playString = playObj.group(6)
        # print(homePitcher,awayPitcher)
        if homeAway == 0:
            tempPitch = homePitcher
        else:
            tempPitch = awayPitcher
        temp = {
            'gameID':gameID,
            'inning':inning,
            'h/a':homeAway,
            'playerID':playerID,
            'pitcherID':tempPitch,
            'eventCount':count,
            'pitches':pitches,
            'playString':playString
        }
        playsDict[pIdx] = temp
        pIdx += 1
        
    elif flag == 'sub':
        # This string is the same as the start string. The only info I care
        # about right now is if the pitcher changes. I may want more later!
        subObj = re.search(r'sub,([\w-]*),"(.*)",([\d]*),([\d]*),([\d]*)\n', line)
        playerID = subObj.group(1)
        homeAway = int(subObj.group(3))
        position = int(subObj.group(5))
        if position == 1:
            #print(playerID)
            if homeAway == 0:
                homePitcher = playerID
            else:
                awayPitcher = playerID
        # not using these for now
        # playerName = subObj.group(2)
        # battingPosition = subObj.grostartersList = group(4)
    elif flag == 'badj':
        switch = True
        print('SWITCH!')
        # we need this to inform each subsequent play row. 
        # The hard part is turning it off when there is a new batter!
    elif (flag != 'data') and (flag != 'version') and (flag != 'com'): 
            print('uh-oh:', flag)
        
        

In [36]:
# now let's parse this data and get some dataframes out of it.
games = pd.DataFrame.from_dict(gamesDict, 'index', columns = infoClassList)

# this only has a few items so we can hand code these features. I haven't decided how I want to split up the pitches and play strings yet
# I may need to add a "switch" feature for when a batter switches hands
plays = pd.DataFrame.from_dict(playsDict, 'index', columns = ['gameID', 'inning', 'h/a', 'playerID', 'pitcherID', 'eventCount', 'pitches', 'playString']) 

In [37]:
li = plays.pitcherID.unique().tolist()
for p in li:
    if 'kers' in p:
        print(p)

kersc001


In [38]:
jverlander = plays[plays['pitcherID'] == 'verlj001']

In [39]:
ckershaw = plays[plays['pitcherID'] == 'kersc001']

In [40]:
ckershaw

Unnamed: 0,gameID,inning,h/a,playerID,pitcherID,eventCount,pitches,playString
1933,CHN201005250,1,0,furcr001,kersc001,32,BFBFBC,K
1934,CHN201005250,1,0,martr004,kersc001,00,X,S9/G
1935,CHN201005250,1,0,kempm001,kersc001,32,B1FBBS>S,K+SB2
1936,CHN201005250,1,0,ramim002,kersc001,30,BBBB,W
1937,CHN201005250,1,0,lonej001,kersc001,02,CCX,64(1)/FO/G
...,...,...,...,...,...,...,...,...
209180,SDN201009070,8,0,gibbj002,kersc001,11,.BFX,8!/F
209181,SDN201009070,8,0,podss001,kersc001,00,,NP
209182,SDN201009070,8,0,podss001,kersc001,12,.BCFFS,K
209183,SDN201009070,8,0,furcr001,kersc001,32,BBCSBB,W


In [41]:
plays.inning.unique().tolist()

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20']