# Baseball Analytics
Some questions I want to answer:
1. Does handedness really matter when it comes to pitching/batting? Do lefties really do better against right-handed batters?
2. Can you predict a batters OBP?
3. Does the number of outs affect a batters performance? 

In [1]:
import pandas as pd
import numpy as np
import re
import os

## What to do with raw data?
I think the best way to go about this is to create a games, players and play by play dataset. The *games* dataset will have
1. a unique ID
2. Team info, away, home, etc.
3. Weather info
4. All fields with `info` tag in the txt file.

The *players* dataset will have
1. player ID's
2. team ID's (coupled with years for trades?)
3. position
4. handedness
5. any other demos I can find

The *play-by-play* data will have all information pertaining to individual plays in games!
1. GameID
2. PlayID
3. PlayerID's (all players who were involved)
4. pitch sequence
5. other events?

Check out retrosheet's [detailed descriptions](https://www.retrosheet.org/eventfile.htm) of the play-by-play files. 


In [2]:
# Here we're just going to read in our data line by line. For now we are just dealing with 2010 data
folderpath = r'Data/Retrosheet/2010-2019/2010'
filepaths = [os.path.join(folderpath, name) for name in os.listdir(folderpath) if "ROS" not in name]
data = []

for path in filepaths:
    with open(path, 'r') as f:
        file = f.readlines()
        data = data + file

In [3]:
# here I'm just grabbing all of the info classes so I don't have to type them out myself
# NOTE: There could be more classes than this; we can improve this later if we want
infoClassList = ['gameID']

for line in data:
    flag = re.search(r'([a-z]*),', line).group(1)
    if (flag == 'info'):
        infoObj = re.search(r'info,([A-Za-z\d]*),(.*)\n', line)
        infoClass = infoObj.group(1)
        if infoClass not in infoClassList:
            infoClassList.append(infoClass)
            
infoClassList.append('starters')

# we need a dictionary to append to the games dataframe
appDict = {}
for el in infoClassList:
    appDict[el]=None

In [4]:
startersList = []
gameID = ''
homePitcher = ''
awayPitcher = ''
switchID = ''
switch = False # this will be used to denote when batters switch hit
# I deal with dictionaries here because it is WAY faster than appending to pandas dataframes. 
# Seriously, orders of magnitude faster
playsDict = {}
gamesDict = {}
gIdx = 0
pIdx = 0
# let's just parse through each line and get what we need.
for line in data:
    flag = re.search(r'([a-z]*),(.*)\n', line).group(1)
    if flag == 'id':
        if appDict['gameID'] is not None:
            # this prevents us from writing Null on the first pass through
            appDict['starters'] = startersList
            gamesDict[gIdx] = appDict
            # need to reset these quantities now that we have a new game
            startersList = []
            gameID = ''
            homePitcher = ''
            awayPitcher = ''
            
            gIdx += 1
            
        gameID = re.search(r'id,([A-Z\d]*)\n', line).group(1)
        appDict['gameID'] = gameID
        
    elif flag == 'info': 
        # this is information about the game: time, weather, etc.
        infoObj = re.search(r'info,([\w]*),(.*)\n', line)
        infoClass = infoObj.group(1)
        infoVal = infoObj.group(2)
        appDict[infoClass] = infoVal
        
    elif flag == 'start':
        # This is information about the starters of the game. 
        startObj = re.search(r'start,([\w-]*),"(.*)",([\d]*),([\d]*),([\d]*)\n', line)
        playerID = startObj.group(1)
        homeAway = int(startObj.group(3))
        position = int(startObj.group(5))
        #print('ID', playerID,'homeAway', homeAway,'pos', position)
        if position == 1:
            if homeAway == 0:
                homePitcher = playerID
            else:
                awayPitcher = playerID
        # not using these for now
        # playerName = startObj.group(2)
        # battingPosition = startObj.grostartersList = group(4)
        startersList.append([playerID, int(homeAway), int(position)])
        
    elif flag == 'play':
        playObj = re.search(r'play,([\d]*),([\d]*),([\w-]*),(.*),(.*),(.*)\n', line)
        inning = int(playObj.group(1))
        homeAway = int(playObj.group(2))
        playerID = playObj.group(3)
        count = playObj.group(4)
        pitches = playObj.group(5)
        playString = playObj.group(6)
        # print(homePitcher,awayPitcher)
        if homeAway == 0:
            tempPitch = homePitcher
        else:
            tempPitch = awayPitcher
            
        if switchID != playerID:
            switch = False
        temp = {
            'gameID':gameID,
            'inning':inning,
            'h/a':homeAway,
            'playerID':playerID,
            'switch':switch,
            'pitcherID':tempPitch,
            'eventCount':count,
            'pitches':pitches,
            'playString':playString
        }
        playsDict[pIdx] = temp
        pIdx += 1
        
    elif flag == 'sub':
        # This string is the same as the start string. The only info I care
        # about right now is if the pitcher changes. I may want more later!
        subObj = re.search(r'sub,([\w-]*),"(.*)",([\d]*),([\d]*),([\d]*)\n', line)
        playerID = subObj.group(1)
        homeAway = int(subObj.group(3))
        position = int(subObj.group(5))
        if position == 1:
            #print(playerID)
            if homeAway == 0:
                homePitcher = playerID
            else:
                awayPitcher = playerID
        # not using these for now
        # playerName = subObj.group(2)
        # battingPosition = subObj.grostartersList = group(4)
    elif flag == 'badj':
        # flip switch hitting to true
        switch = True
        subObj = re.search(r'badj,([\w-]*),([A-Z])\n', line)
        # we need to know when the batter switches to the next batter (some at bats contain more than one row)
        switchID = subObj.group(1)
        
    elif flag == 'padj':
        # a siwtch pitching flag! rarely happens.
        print('pitching adjustment: ', line)
        
    elif (flag != 'data') and (flag != 'version') and (flag != 'com'):
        # this is just a check to make sure nothing weird is happening!
        # data rows indicate errors at the end of the game
        # version rows are not in use anymore
        # com rows are just notes on strange or uncertain plays
        print('uh-oh: ', line)
        
        

In [5]:
# we're just going to put these dictionaries into dataframes for ease of analysis.
games = pd.DataFrame.from_dict(gamesDict, 'index', columns = infoClassList)

# this only has a few items so we can hand code these features. I haven't decided how I want to split up the pitches and play strings yet
# I may need to add a "switch" feature for when a batter switches hands
plays = pd.DataFrame.from_dict(playsDict, 'index', columns = ['gameID', 'inning', 'h/a', 'playerID', 'switch', 'pitcherID', 'eventCount', 'pitches', 'playString']) 

In [12]:
# let's get the rosters for 2010 as well. 
rosterpaths = [os.path.join(folderpath, name) for name in os.listdir(folderpath) if "ROS" in name]
rdata = []

for path in rosterpaths:
    with open(path, 'r') as f:
        file = f.readlines()
        rdata = rdata + file

In [18]:
playersDict = {}
i = 0
for line in rdata:
    playerObj = re.search(r'(.*),(.*),(.*),([A-Z]),([A-Z]),([A-Z]*),([\w]*)\n', line)
    playerID = playerObj.group(1)
    last = playerObj.group(2)
    first = playerObj.group(3)
    batHand = playerObj.group(4)
    throwHand = playerObj.group(5)
    team = playerObj.group(6)
    position = playerObj.group(7)
    temp = {
        'playerID' : playerID,
        'last' : last,
        'first' : first,
        'batHand' : batHand,
        'throwHand' : throwHand,
        'team' : team,
        'position' : position
    }
    playersDict[i] = temp
    i += 1

In [19]:
# making a dataframe out of the players dictionary, just like before. 
players = pd.DataFrame.from_dict(playersDict, 'index', columns = ['playerID','last','first','batHand','throwHand','team','position'])

In [20]:
players.head(20)

Unnamed: 0,playerID,last,first,batHand,throwHand,team,position
0,adamm001,Adams,Mike,R,R,SDN,P
1,baxtm001,Baxter,Mike,L,R,SDN,1B
2,bellh001,Bell,Heath,R,R,SDN,P
3,blank002,Blanks,Kyle,R,R,SDN,OF
4,cabre001,Cabrera,Everth,B,R,SDN,SS
5,corrk001,Correia,Kevin,R,R,SDN,P
6,cunna001,Cunningham,Aaron,R,R,SDN,OF
7,denoc001,Denorfia,Chris,R,R,SDN,OF
8,dural001,Durango,Luis,B,R,SDN,OF
9,ecksd001,Eckstein,David,R,R,SDN,2B


In [9]:
ckershaw[ckershaw.playString.str.contains("K")]

Unnamed: 0,gameID,inning,h/a,playerID,switch,pitcherID,eventCount,pitches,playString
1933,CHN201005250,1,0,furcr001,False,kersc001,32,BFBFBC,K
1935,CHN201005250,1,0,kempm001,False,kersc001,32,B1FBBS>S,K+SB2
1948,CHN201005250,3,0,kersc001,False,kersc001,32,CFBBBFC,K
1958,CHN201005250,4,0,paulx001,False,kersc001,02,SFS,K
1965,CHN201005250,5,0,kersc001,False,kersc001,12,BCFS,K
...,...,...,...,...,...,...,...,...,...
209152,SDN201009070,5,0,ethia001,False,kersc001,12,BCSS,K23
209153,SDN201009070,5,0,kempm001,False,kersc001,02,SSC,K
209158,SDN201009070,6,0,barar001,False,kersc001,22,CBBCFS,K
209182,SDN201009070,8,0,podss001,False,kersc001,12,.BCFFS,K


In [10]:
a = 1
c = 0
if a == 1:
    b = 2
if c == 0:
    print(b)
    b = 3
    print(b)


2
3
