In [None]:
# Constructing the data set for Strokes Gained analysis - Evan Callaghan

# In the data collection process, we are asking for as little shot-level data as possible from our participants. In order to get every golfer
# on board for collection, it needs to be simple. Therefore, we are only collecting the following: golf course name, set of tees played, date played,
# hole number, shot number, distance to hole after shot, shot lie type, and penalties incurred. With the steps provided in this program, we will be
# able to calculate all necessary variables for a strokes gained analysis of each golfer.

# The output of this program will be a shot-level data set that is fully prepared for Strokes Gained analysis.

# Ordering:
# 1. Reading in all data (main)
# 2. Creating/initializing all variables
# 3. Merging golf course data with shot level data
# 4. Re-structuring the data to look nicer
# 5. Calculating shot location
# 6. Calculating location lie type
# 7. Calculating shot category
# 8. Calculating penalty stroke value
# 9. Calculating location baseline
# 10. Calculating next location baseline
# 11. Calculating Strokes Gained
# 12. Calculating gross strokes
# 13. Defining a shot as a putt or not
# 14. Defining a tee-shot as a fairway or not
# 15. Defining an approach as a GIR or not
# 16. Exporting the data frame as a csv

In [15]:
import pandas as pd
import numpy as np
## Show every row and every column whenever a data frame is displayed
pd.set_option('display.max_rows', None, 'display.max_columns', None)


## Reading the golfer data set
strokes = pd.read_csv('Building_Algorithm.csv')

## Reading in all data frames associated with golf courses
## (these module-level names are read directly by Hole_Par_and_Yardage)
otter = pd.read_csv('Otter_Creek_Data.csv')
ankeny = pd.read_csv('Ankeny_Golf_Data.csv')
lemars = pd.read_csv('Lemars_Data.csv')
lemars2 = pd.read_csv('Lemars_Rd2_Data.csv')

## Reading in baseline data
## (these module-level names are read directly by Location_Baseline and Next_Location_Baseline)
shot = pd.read_csv('Shot_Baseline.csv')
putt = pd.read_csv('Putt_Baseline.csv')

## ...

## Starting by calling the Initial_Function
## NOTE(review): Initial_Function and every step it chains into are defined in the cells
## below this one, so those cells must be executed before this call or it raises NameError
strokes = Initial_Function(strokes)

## Displaying the first rows of the resulting data frame
strokes.head()

Unnamed: 0,Golfer,Round_ID,Date,Golf_Course,Hole_Number,Hole_Par,Hole_Yardage,Shot_Number,Shot_Category,Location,Location_Lie_Type,Location_Baseline,Next_Location,Next_Location_Lie_Type,Next_Location_Baseline,Penalty_Incurred,Penalty_Stroke_Value,Strokes_Gained,Gross_Strokes,Putt,Fairway,GIR
0,Evan Callaghan,1001,2021-09-10,Otter Creek,1,4,374,1,Tee,374.0,Tee,3.95,97.0,Rough,3.01,,0.0,-0.06,1.0,0,0,0.0
1,Evan Callaghan,1001,2021-09-10,Otter Creek,1,4,374,2,Approach,97.0,Rough,3.01,16.0,Green,2.13,,0.0,-0.12,1.0,0,0,1.0
2,Evan Callaghan,1001,2021-09-10,Otter Creek,1,4,374,3,Putt,16.0,Green,2.13,4.0,Green,1.69,,0.0,-0.56,1.0,1,0,0.0
3,Evan Callaghan,1001,2021-09-10,Otter Creek,1,4,374,4,Putt,4.0,Green,1.69,1.0,Green,1.04,,0.0,-0.35,1.0,1,0,0.0
4,Evan Callaghan,1001,2021-09-10,Otter Creek,1,4,374,5,Putt,1.0,Green,1.04,0.0,Hole,0.0,,0.0,0.04,1.0,1,0,0.0


In [1]:
def Initial_Function(strokes):
    """Prepare the raw shot-level frame and start the processing chain.

    Adds the placeholder columns that the later steps fill in, casts the
    collected columns to their working dtypes, and hands the frame to
    Hole_Par_and_Yardage (which in turn calls the next step, and so on).

    Parameters
    ----------
    strokes : pandas.DataFrame
        Raw shot-level data. The casts below require the columns Golfer,
        Round_ID, Hole_Number, Shot_Number and Next_Location.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Hole_Par_and_Yardage.
    """
    ## Work on a copy so the caller's raw frame is not modified as a side effect
    strokes = strokes.copy()

    ## Placeholder columns, created in the same order as before.
    ## Text-valued placeholders use object dtype: later steps write strings such as
    ## 'Tee' or 'Putt' into them cell by cell, and writing a string into a float64
    ## column is a deprecated incompatible-dtype assignment in pandas.
    text_placeholders = ('Shot_Category', 'Location_Lie_Type')
    new_columns = ['Shot_Category', 'Location', 'Location_Lie_Type', 'Location_Baseline',
                   'Next_Location_Baseline', 'Penalty_Stroke_Value', 'Strokes_Gained',
                   'Gross_Strokes', 'Putt', 'Fairway', 'GIR']

    for column in new_columns:
        if column in text_placeholders:
            strokes[column] = pd.Series(np.nan, index = strokes.index, dtype = object)
        else:
            ## Numeric placeholders start at 0.0 (float), including Gross_Strokes,
            ## which a later step overwrites with a float value anyway
            strokes[column] = 0.0

    ## Working dtypes for the collected columns
    strokes = strokes.astype({'Golfer': str,
                              'Round_ID': int,
                              'Hole_Number': int,
                              'Shot_Number': int,
                              'Next_Location': float})

    strokes = Hole_Par_and_Yardage(strokes)

    return strokes


In [2]:
def Hole_Par_and_Yardage(strokes):
    """Attach hole-level course data to every shot, then continue the chain.

    The shots are split by Golf_Course, each subset is left-joined on
    Hole_Number with that course's data frame (module-level otter, ankeny,
    lemars, lemars2), and the subsets are concatenated back together in that
    course order with a fresh 0..n-1 index.

    Shots whose Golf_Course has no course table are not carried forward
    (as before); a warning now reports them instead of dropping them silently.

    Parameters
    ----------
    strokes : pandas.DataFrame
        Shot-level data with Golf_Course and Hole_Number columns.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Restructure_Data.
    """
    import warnings

    ## Golf_Course label -> data frame holding the info for each hole of that course
    ## OTHER COURSES TO BE ADDED HERE
    course_tables = {
        'Otter Creek': otter,
        'Ankeny Golf': ankeny,
        'Lemars': lemars,
        'Lemars_Rd2': lemars2,
    }

    ## Report shots that match no known course; they are left out of the result
    unmatched = ~strokes['Golf_Course'].isin(list(course_tables))
    if unmatched.any():
        missing = strokes.loc[unmatched, 'Golf_Course'].unique().tolist()
        warnings.warn(
            f"Dropping {int(unmatched.sum())} shot(s) with no golf course data: {missing}",
            stacklevel = 2,
        )

    ## One subset per course, each left-joined with that course's hole data
    merged_subsets = [
        strokes[strokes['Golf_Course'] == course].merge(holes, on = 'Hole_Number', how = 'left')
        for course, holes in course_tables.items()
    ]

    ## Concatenating all subsets back together with a clean positional index
    strokes = pd.concat(merged_subsets).reset_index(drop = True)

    strokes = Restructure_Data(strokes)

    return strokes

In [3]:
def Restructure_Data(strokes):
    """Put the columns in their final order, then continue the chain.

    Several later steps address columns by position (for example column 7 is
    Shot_Number and column 9 is Location), so this ordering must not change
    without updating them.

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame containing every column listed in column_order below.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Location.
    """
    column_order = ['Golfer', 'Round_ID', 'Date', 'Golf_Course', 'Hole_Number', 'Hole_Par', 'Hole_Yardage',
                    'Shot_Number', 'Shot_Category', 'Location', 'Location_Lie_Type', 'Location_Baseline',
                    'Next_Location', 'Next_Location_Lie_Type', 'Next_Location_Baseline', 'Penalty_Incurred',
                    'Penalty_Stroke_Value', 'Strokes_Gained', 'Gross_Strokes', 'Putt', 'Fairway', 'GIR']

    ## Explicit copy: the later steps write into this frame, and writing into a
    ## column subset of another frame triggers SettingWithCopyWarning
    strokes = strokes[column_order].copy()

    strokes = Location(strokes)

    return strokes

In [4]:
def Location(strokes):
    """Fill Location (where each shot was played from), then continue the chain.

    For shot number one the Location is the hole yardage; for every other shot
    it is the Next_Location of the previous row. Rows are assumed to be ordered
    so that the previous row is the previous shot on the same hole.

    If the very first row is not a first shot there is no previous row, so its
    Location is NaN (the positional loop used to wrap around and read the last
    row of the frame).

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame with Shot_Number, Hole_Yardage and Next_Location columns.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Location_Lie_Type.
    """
    first_shot = strokes['Shot_Number'] == 1

    ## Where the previous row's shot finished
    previous_finish = strokes['Next_Location'].shift(1)

    ## First shots start from the full hole yardage instead
    strokes['Location'] = previous_finish.mask(first_shot, strokes['Hole_Yardage']).astype(float)

    strokes = Location_Lie_Type(strokes)

    return strokes

In [5]:
def Location_Lie_Type(strokes):
    """Fill Location_Lie_Type (the lie each shot was played from), then continue the chain.

    For shot number one the lie is 'Tee'; for every other shot it is the
    Next_Location_Lie_Type of the previous row. Rows are assumed to be ordered
    so that the previous row is the previous shot on the same hole.

    If the very first row is not a first shot there is no previous row, so its
    lie is left missing (the positional loop used to wrap around and read the
    last row of the frame).

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame with Shot_Number and Next_Location_Lie_Type columns.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Shot_Category.
    """
    first_shot = strokes['Shot_Number'] == 1

    ## Lie in which the previous row's shot finished (object dtype so it can hold text)
    previous_lie = strokes['Next_Location_Lie_Type'].shift(1).astype(object)

    ## Assigning the whole column at once replaces the placeholder column, so no
    ## string is ever written cell by cell into a numeric column
    strokes['Location_Lie_Type'] = previous_lie.mask(first_shot, 'Tee')

    strokes = Shot_Category(strokes)

    return strokes

In [6]:
def Shot_Category(strokes):
    """Classify every shot into a Shot_Category, then continue the chain.

    Rules, applied in order (the first match wins):

    1. shot number one on a par 3        -> 'Approach'
    2. shot number one on any other par  -> 'Tee'
    3. Location_Lie_Type is 'Green'      -> 'Putt'
    4. Location <= 45                    -> 'Around-the-Green'
    5. anything else                     -> 'Approach'

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame with Shot_Number, Hole_Par, Location_Lie_Type and Location columns.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Penalty_Stroke_Value.
    """
    ## Largest Location still counted as Around-the-Green
    ## (same units as the Location column - presumably yards, TODO confirm)
    around_the_green_limit = 45

    first_shot = strokes['Shot_Number'] == 1

    ## np.select takes the first condition that holds, mirroring the if / elif order
    conditions = [
        (first_shot & (strokes['Hole_Par'] == 3)).to_numpy(),
        first_shot.to_numpy(),
        (strokes['Location_Lie_Type'] == 'Green').to_numpy(),
        (strokes['Location'] <= around_the_green_limit).to_numpy(),
    ]
    categories = ['Approach', 'Tee', 'Putt', 'Around-the-Green']

    ## Assigning the whole column at once replaces the placeholder column, so no
    ## string is ever written cell by cell into a numeric column
    strokes['Shot_Category'] = np.select(conditions, categories, default = 'Approach')

    strokes = Penalty_Stroke_Value(strokes)

    return strokes

In [7]:
def Penalty_Stroke_Value(strokes):
    """Translate Penalty_Incurred codes into Penalty_Stroke_Value, then continue the chain.

    'PA' is worth 1 penalty stroke and 'OB' is worth 2; any other value
    (including a missing one) is worth 0.

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame with a Penalty_Incurred column.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Location_Baseline.
    """
    ## Penalty code -> number of penalty strokes
    strokes_by_code = {'PA': 1, 'OB': 2}

    ## Codes missing from the mapping come back as NaN and count as zero strokes
    strokes['Penalty_Stroke_Value'] = (
        strokes['Penalty_Incurred'].map(strokes_by_code).fillna(0).astype(float)
    )

    strokes = Location_Baseline(strokes)

    return strokes

In [8]:
## Calculating location baseline

def Location_Baseline(strokes):
    """Look up the baseline for the spot each shot was played from, then continue the chain.

    The baseline is read from the module-level tables by position:

    * Location_Lie_Type 'Green' -> putt table, row int(Location) - 1, column 1
    * 'Tee' / 'Fairway' / 'Rough' / 'Fescue' / 'Sand' -> shot table,
      row int(Location) - 3, column 1 / 2 / 3 / 3 / 4
    * any other lie type -> Location_Baseline is left unchanged

    A Location shorter than the first table row is looked up in the first row;
    previously the negative row position wrapped around to the END of the table.
    (Assumes both tables are ordered by increasing distance - TODO confirm.)

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame with Location_Lie_Type, Location and Location_Baseline columns.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Next_Location_Baseline.

    Raises
    ------
    ValueError
        If Location is missing (NaN) on a row whose lie type needs a lookup.
    IndexError
        If Location lies beyond the last row of the relevant table.
    """
    ## Column of the shot table used for each off-green lie type
    ## (Fescue uses the same column as Rough)
    shot_column = {'Tee': 1, 'Fairway': 2, 'Rough': 3, 'Fescue': 3, 'Sand': 4}

    baselines = []

    for lie, distance, current in zip(strokes['Location_Lie_Type'],
                                      strokes['Location'],
                                      strokes['Location_Baseline']):

        ## On the green, the putt table gives the average strokes to hole-out
        if lie == 'Green':
            row = max(int(distance) - 1, 0)
            baselines.append(putt.iloc[row, 1])

        ## Off the green, the shot table has one column per lie type
        elif lie in shot_column:
            row = max(int(distance) - 3, 0)
            baselines.append(shot.iloc[row, shot_column[lie]])

        ## Unrecognised lie type: keep whatever value is already there
        else:
            baselines.append(current)

    strokes['Location_Baseline'] = np.asarray(baselines, dtype = float)

    strokes = Next_Location_Baseline(strokes)

    return strokes

In [9]:
## Calculating next location baseline

def Next_Location_Baseline(strokes):
    """Look up the baseline for the spot each shot finished, then continue the chain.

    * Next_Location_Lie_Type 'Hole'  -> 0 (the ball is holed)
    * 'Green' -> putt table, row int(Next_Location) - 1, column 1
    * 'Tee' / 'Fairway' / 'Rough' / 'Fescue' / 'Sand' -> shot table,
      row int(Next_Location) - 3, column 1 / 2 / 3 / 3 / 4
    * any other lie type -> Next_Location_Baseline is left unchanged

    A Next_Location shorter than the first table row is looked up in the first
    row; previously the negative row position wrapped around to the END of the
    table. (Assumes both tables are ordered by increasing distance - TODO confirm.)

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame with Next_Location_Lie_Type, Next_Location and
        Next_Location_Baseline columns.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Strokes_Gained.

    Raises
    ------
    ValueError
        If Next_Location is missing (NaN) on a row whose lie type needs a lookup.
    IndexError
        If Next_Location lies beyond the last row of the relevant table.
    """
    ## Column of the shot table used for each off-green lie type
    ## (Fescue uses the same column as Rough)
    shot_column = {'Tee': 1, 'Fairway': 2, 'Rough': 3, 'Fescue': 3, 'Sand': 4}

    baselines = []

    for lie, distance, current in zip(strokes['Next_Location_Lie_Type'],
                                      strokes['Next_Location'],
                                      strokes['Next_Location_Baseline']):

        ## Holed out: no strokes remain
        if lie == 'Hole':
            baselines.append(0)

        ## On the green, the putt table gives the average strokes to hole-out
        elif lie == 'Green':
            row = max(int(distance) - 1, 0)
            baselines.append(putt.iloc[row, 1])

        ## Off the green, the shot table has one column per lie type
        elif lie in shot_column:
            row = max(int(distance) - 3, 0)
            baselines.append(shot.iloc[row, shot_column[lie]])

        ## Unrecognised lie type: keep whatever value is already there
        else:
            baselines.append(current)

    strokes['Next_Location_Baseline'] = np.asarray(baselines, dtype = float)

    strokes = Strokes_Gained(strokes)

    return strokes

In [10]:
def Strokes_Gained(strokes):
    """Compute Strokes_Gained for every shot, then continue the chain.

    Strokes Gained = Location Baseline - Next Location Baseline - 1 - Penalty Stroke Value,
    rounded to three decimal places.

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame with Location_Baseline, Next_Location_Baseline and
        Penalty_Stroke_Value columns.

    Returns
    -------
    pandas.DataFrame
        The frame returned by Gross_Strokes.
    """
    ## Whole-column arithmetic; the terms are subtracted in the same order as the formula above
    gained = (strokes['Location_Baseline']
              - strokes['Next_Location_Baseline']
              - 1
              - strokes['Penalty_Stroke_Value'])

    strokes['Strokes_Gained'] = gained.round(3)

    strokes = Gross_Strokes(strokes)

    return strokes

In [11]:
def Gross_Strokes(strokes):
    """Set Gross_Strokes for every shot, then continue the chain with Putt.

    Each shot costs the stroke itself plus whatever penalty strokes were
    incurred on it, i.e. Penalty_Stroke_Value + 1.
    """
    penalty_strokes = strokes['Penalty_Stroke_Value']

    ## One for the shot itself, plus its penalty strokes
    strokes['Gross_Strokes'] = penalty_strokes.add(1)

    return Putt(strokes)

In [12]:
def Putt(strokes):
    """Flag putts, then continue the chain with Fairway.

    Putt is 1 on rows whose Shot_Category is 'Putt' and 0 on every other row.
    """
    is_putt = strokes['Shot_Category'] == 'Putt'

    ## 1 for a putt, 0 otherwise
    strokes['Putt'] = np.where(is_putt, 1, 0)

    return Fairway(strokes)

In [13]:
def Fairway(strokes):
    """Flag fairways hit, then continue the chain with GIR.

    Fairway is 1 when the shot is a 'Tee' shot that finished with a
    Next_Location_Lie_Type of 'Fairway', and 0 on every other row.
    """
    is_tee_shot = strokes['Shot_Category'] == 'Tee'
    finished_in_fairway = strokes['Next_Location_Lie_Type'] == 'Fairway'

    ## Both conditions must hold for the fairway to count as hit
    strokes['Fairway'] = np.where(is_tee_shot & finished_in_fairway, 1, 0)

    return GIR(strokes)

In [14]:
def GIR(strokes):
    """Flag the shots that reached the green in regulation (last step of the chain).

    On par 3, 4 and 5 holes, GIR is 1 when the shot finished with a
    Next_Location_Lie_Type of 'Green' or 'Hole' AND its Shot_Number is between
    1 and Hole_Par - 2 (shot 1 on a par 3; shots 1-2 on a par 4; shots 1-3 on
    a par 5). Every other shot on those holes gets 0. Rows with any other
    Hole_Par keep their current GIR value.

    Columns are addressed by name, so the result does not depend on the
    column order of the frame.

    Parameters
    ----------
    strokes : pandas.DataFrame
        Frame with Hole_Par, Shot_Number, Next_Location_Lie_Type and GIR columns.

    Returns
    -------
    pandas.DataFrame
        The same frame with GIR filled in.
    """
    hole_par = strokes['Hole_Par']
    shot_number = strokes['Shot_Number']

    ## The shot finished on the green or in the hole
    reached_green = strokes['Next_Location_Lie_Type'].isin(['Hole', 'Green'])

    ## The shot was played early enough to count as "in regulation"
    in_regulation = (shot_number >= 1) & (shot_number <= hole_par - 2)

    ## Only par 3, 4 and 5 holes are scored; other rows are left as they are
    scored_hole = hole_par.isin([3, 4, 5])

    strokes['GIR'] = np.where(scored_hole,
                              (reached_green & in_regulation).astype(float),
                              strokes['GIR'])

    return strokes

In [16]:
## Final Step:

## Exporting the data frame as a csv file
## (index = False leaves the row index out of the file; header = True writes the column names)

strokes.to_csv('Shot_Level_Data_Test.csv', index = False, header = True)