In [1]:
# Import dependencies
import pandas as pd
import math
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read data
plays_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/plays.csv')
players_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/players.csv')
pffscouting_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/pffScoutingData.csv')
games_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/games.csv')
ol_players_pff_supplemental_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/players_pff_supp.csv')
college_conference_map_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/college_conference_map.csv')

# Overlay the supplemental player values onto the player table, aligned on nflId.
players_df = players_df.set_index('nflId')
players_df.update(ol_players_pff_supplemental_df.set_index('nflId'))
# nflId intentionally stays as the index. The previous bare
# `players_df.reset_index()` threw its result away, so the index was never
# reset; the saved CSV layout and the later merges (which reference nflId by
# name) depend on it remaining the index.

# Assign the cleaned column back explicitly: an inplace replace on a column
# selection is not guaranteed to write through to the parent frame.
players_df['birthDate'] = players_df['birthDate'].replace('NA', '')
players_df['birthDate'] = pd.to_datetime(players_df['birthDate'])

# Age is year-granular and relative to the year in which the notebook is run.
# Players without a birth date get age 0.
current_year = dt.date.today().year
players_df['age'] = players_df['birthDate'].map(lambda born: current_year - born.year)
players_df['age'] = players_df['age'].fillna(0)


def _height_to_cm(height):
    """Convert a '<feet>-<inches>' string to centimetres."""
    parts = height.split('-')
    return int(parts[0])*30.48+int(parts[1])*2.54


players_df['heightCm'] = players_df['height'].map(_height_to_cm)

# Look up each player's conference name and id from their college name;
# colleges missing from the map yield NaN.
mapping = dict(college_conference_map_df[['collegeName','conference']].values)
players_df['conference'] = players_df.collegeName.map(mapping)
mapping = dict(college_conference_map_df[['collegeName','conferenceId']].values)
players_df['conferenceId'] = players_df.collegeName.map(mapping)

players_df.to_csv('gs://big-data-bowl/players_supplemented.csv')

print(f"Plays: {plays_df.shape}")
print(f"Players: {players_df.shape}")
print(f"Scouting: {pffscouting_df.shape}")
print(f"Games: {games_df.shape}")

Plays: (8557, 32)
Players: (1679, 10)
Scouting: (188254, 15)
Games: (122, 7)


In [3]:
# Tally how many players hold each official position.
players_df.officialPosition.value_counts()

WR     210
CB     192
DE     142
OLB    136
RB     126
TE     121
DT     114
T      114
G      107
ILB     81
FS      72
SS      69
QB      60
C       55
NT      34
MLB     27
FB      15
LB       2
DB       2
Name: officialPosition, dtype: int64

In [4]:
# Build one row per scouting record, enriched with (in order): the play, the
# player's own attributes, the blocked player's attributes (suffixed '_x'),
# and the game.
combined_df = pffscouting_df.merge(plays_df, how='left', left_on=['gameId','playId'], right_on=['gameId','playId'])
combined_df = combined_df.merge(players_df, how='left', left_on=['nflId'], right_on=['nflId'])
combined_df = combined_df.merge(players_df, how='left', left_on=['pff_nflIdBlockedPlayer'], right_on=['nflId'], suffixes=(None, '_x'))
combined_df = combined_df.merge(games_df, how='left', left_on=['gameId'], right_on=['gameId'])

# 0/1 indicators derived from equality tests (NaN never compares equal, so
# rows with missing values get 0).
combined_df['posTeamIsHomeTeam'] = (
    combined_df['possessionTeam'] == combined_df['homeTeamAbbr']
).astype('int64')

combined_df['posTeamOwnSideline'] = (
    combined_df['possessionTeam'] == combined_df['yardlineSide']
).astype('int64')

# 1 when this row's player is listed in any of the three foul slots.
foul_id_columns = ['foulNFLId1', 'foulNFLId2', 'foulNFLId3']
combined_df['playerCommittedFoul'] = (
    combined_df[foul_id_columns].eq(combined_df['nflId'], axis=0).any(axis=1).astype('int64')
)

# 1 when the player allowed a hit/hurry/sack, was beaten, or committed a foul.
trouble_columns = [
    'pff_hitAllowed',
    'pff_hurryAllowed',
    'pff_sackAllowed',
    'pff_beatenByDefender',
    'playerCommittedFoul',
]
combined_df['inTrouble'] = combined_df[trouble_columns].eq(1).any(axis=1).astype('int64')

# Boolean: the minutes component of the 'MM:SS' game clock is below 2.
combined_df['gameClockInside2Minutes'] = combined_df['gameClock'].map(
    lambda clock: int(clock.split(':')[0]) < 2
)

# Same condition restricted to the 2nd and 4th quarter, as 0/1 flags.
inside_two_minutes = combined_df['gameClockInside2Minutes']
for quarter_number in (2, 4):
    combined_df['gameClockInside2MinutesQ{}'.format(quarter_number)] = (
        inside_two_minutes & (combined_df['quarter'] == quarter_number)
    ).astype('int64')

# Quarter 5 is treated as overtime.
combined_df['inOvertime'] = (combined_df['quarter'] == 5).astype('int64')


def _fill_missing(column):
    """Fill gaps with 0 for numeric/boolean columns and with '0' otherwise."""
    if column.dtype.kind in 'biufc':
        return column.fillna(0)
    return column.fillna('0')


combined_df = combined_df.apply(_fill_missing)

print(f'dataframe size: {combined_df.shape}')
combined_df

dataframe size: (188254, 79)


Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,...,homeTeamAbbr,visitorTeamAbbr,posTeamIsHomeTeam,posTeamOwnSideline,playerCommittedFoul,inTrouble,gameClockInside2Minutes,gameClockInside2MinutesQ2,gameClockInside2MinutesQ4,inOvertime
0,2021090900,97,25511,Pass,QB,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
1,2021090900,97,35481,Pass Route,TE-L,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
2,2021090900,97,35634,Pass Route,LWR,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
3,2021090900,97,39985,Pass Route,HB-R,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
4,2021090900,97,40151,Pass Block,C,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188249,2021110100,4433,52507,Pass Block,LT,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,1,True,0,1,0
188250,2021110100,4433,52546,Coverage,SCBoR,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0
188251,2021110100,4433,52573,Pass Route,SLoWR,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0
188252,2021110100,4433,52585,Pass Rush,LEO,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0


In [5]:
# Frequency of each offensive personnel string across all scouting rows.
combined_df.personnelO.value_counts()

1 RB, 1 TE, 3 WR                126500
1 RB, 2 TE, 2 WR                 33022
2 RB, 1 TE, 2 WR                  9614
1 RB, 3 TE, 1 WR                  4994
1 RB, 0 TE, 4 WR                  4048
2 RB, 2 TE, 1 WR                  2420
0 RB, 1 TE, 4 WR                  1958
2 RB, 0 TE, 3 WR                  1452
6 OL, 1 RB, 1 TE, 2 WR             902
0 RB, 2 TE, 3 WR                   616
6 OL, 1 RB, 2 TE, 1 WR             440
0 RB, 0 TE, 5 WR                   308
2 RB, 3 TE, 0 WR                   286
2 QB, 1 RB, 1 TE, 2 WR             264
6 OL, 2 RB, 2 TE, 0 WR             242
6 OL, 1 RB, 3 TE, 0 WR             176
6 OL, 2 RB, 0 TE, 2 WR             110
2 QB, 1 RB, 0 TE, 3 WR             110
6 OL, 2 RB, 1 TE, 1 WR             110
0 RB, 3 TE, 2 WR                    88
2 QB, 1 RB, 2 TE, 1 WR              88
2 QB, 2 RB, 0 TE, 2 WR              88
3 RB, 0 TE, 2 WR                    66
6 OL, 1 RB, 0 TE, 3 WR              66
1 RB, 1 TE, 2 WR,1 LB               66
2 QB, 2 RB, 1 TE, 1 WR   

In [6]:
# Numeric-string code for each offensive personnel description.
# The digits appear to be the QB, RB, TE, WR and OL counts concatenated, with
# QB defaulting to 1 and OL to 5 when not listed — TODO confirm.
offensePersonnelGrouping = {
    "1 RB, 1 TE, 3 WR":"11135",
    "1 RB, 2 TE, 2 WR":"11225",
    "2 RB, 1 TE, 2 WR":"12125",
    "1 RB, 3 TE, 1 WR":"11315",
    "1 RB, 0 TE, 4 WR":"11045",
    "2 RB, 2 TE, 1 WR":"12215",
    "0 RB, 1 TE, 4 WR":"10145",
    "2 RB, 0 TE, 3 WR":"12035",
    "6 OL, 1 RB, 1 TE, 2 WR":"11126",
    "0 RB, 2 TE, 3 WR":"10235",
    "6 OL, 1 RB, 2 TE, 1 WR":"11216",
    "0 RB, 0 TE, 5 WR":"10055",
    "2 RB, 3 TE, 0 WR":"12305",
    "2 QB, 1 RB, 1 TE, 2 WR":"21125",
    "6 OL, 2 RB, 2 TE, 0 WR":"12206",
    "6 OL, 1 RB, 3 TE, 0 WR":"11306",
    "2 QB, 1 RB, 0 TE, 3 WR":"21035",
    "6 OL, 2 RB, 1 TE, 1 WR":"12116",
    "6 OL, 2 RB, 0 TE, 2 WR":"12026",
    "2 QB, 2 RB, 0 TE, 2 WR":"22025",
    "2 QB, 1 RB, 2 TE, 1 WR":"21215",
    "0 RB, 3 TE, 2 WR":"10325",
    # NOTE(review): shares code "11126" with "6 OL, 1 RB, 1 TE, 2 WR" above,
    # i.e. the extra LB is encoded like a sixth lineman — confirm this is intended.
    "1 RB, 1 TE, 2 WR,1 LB":"11126",
    "2 QB, 2 RB, 1 TE, 1 WR":"22115",
    "3 RB, 0 TE, 2 WR":"13025",
    "6 OL, 1 RB, 0 TE, 3 WR":"11036",
    "2 QB, 6 OL, 1 RB, 1 TE, 1 WR":"21116",
    "2 QB, 1 RB, 3 TE, 0 WR":"21305",
    "1 RB, 4 TE, 0 WR":"11405",
    "7 OL, 1 RB, 0 TE, 2 WR":"11027",
    "Other":"99"
}
# A plain `.map(dict)` yields NaN for any value missing from the dict, so the
# "Other" entry was never applied. Look values up with an explicit fallback so
# unlisted (or missing) personnel strings receive the "Other" code.
combined_df["offensePersonnelGrouping"] = combined_df["personnelO"].map(
    lambda value: offensePersonnelGrouping.get(value, offensePersonnelGrouping["Other"])
)

In [7]:
# Frequency of each defensive personnel string across all scouting rows.
combined_df.personnelD.value_counts()

4 DL, 2 LB, 5 DB    53240
2 DL, 4 LB, 5 DB    33902
3 DL, 3 LB, 5 DB    25234
2 DL, 3 LB, 6 DB    17578
4 DL, 3 LB, 4 DB    15004
3 DL, 4 LB, 4 DB    12232
4 DL, 1 LB, 6 DB    10098
3 DL, 2 LB, 6 DB     8536
1 DL, 4 LB, 6 DB     4598
1 DL, 5 LB, 5 DB     2772
5 DL, 2 LB, 4 DB     1474
5 DL, 1 LB, 5 DB      638
2 DL, 2 LB, 7 DB      594
4 DL, 4 LB, 3 DB      506
2 DL, 5 LB, 4 DB      418
1 DL, 3 LB, 7 DB      418
3 DL, 1 LB, 7 DB      220
5 DL, 3 LB, 3 DB      132
6 DL, 3 LB, 2 DB       88
6 DL, 2 LB, 3 DB       88
0 DL, 3 LB, 8 DB       88
5 DL, 5 LB, 1 DB       66
4 DL, 5 LB, 2 DB       66
4 DL, 6 LB, 1 DB       66
0 DL, 5 LB, 6 DB       44
6 DL, 1 LB, 4 DB       44
1 DL, 2 LB, 8 DB       44
0                      22
3 DL, 5 LB, 3 DB       22
6 DL, 4 LB, 1 DB       22
Name: personnelD, dtype: int64

In [8]:
# Numeric-string code for each defensive personnel description: the DL, LB and
# DB counts concatenated (e.g. "4 DL, 2 LB, 5 DB" -> "425").
defensePersonnelGrouping = {
    "4 DL, 2 LB, 5 DB":"425",
    "2 DL, 4 LB, 5 DB":"245",
    "3 DL, 3 LB, 5 DB":"335",
    "2 DL, 3 LB, 6 DB":"236",
    "4 DL, 3 LB, 4 DB":"434",
    "3 DL, 4 LB, 4 DB":"344",
    "4 DL, 1 LB, 6 DB":"416",
    "3 DL, 2 LB, 6 DB":"326",
    "1 DL, 4 LB, 6 DB":"146",
    "1 DL, 5 LB, 5 DB":"155",
    "5 DL, 2 LB, 4 DB":"524",
    "5 DL, 1 LB, 5 DB":"515",
    "2 DL, 2 LB, 7 DB":"227",
    "4 DL, 4 LB, 3 DB":"443",
    "1 DL, 3 LB, 7 DB":"137",
    "2 DL, 5 LB, 4 DB":"254",
    "3 DL, 1 LB, 7 DB":"317",
    "5 DL, 3 LB, 3 DB":"533",
    "0 DL, 3 LB, 8 DB":"038",
    "6 DL, 3 LB, 2 DB":"632",
    "6 DL, 2 LB, 3 DB":"623",
    "4 DL, 6 LB, 1 DB":"461",
    "5 DL, 5 LB, 1 DB":"551",
    "4 DL, 5 LB, 2 DB":"452",
    "6 DL, 1 LB, 4 DB":"614",
    "0 DL, 5 LB, 6 DB":"056",
    "1 DL, 2 LB, 8 DB":"128",
    "3 DL, 5 LB, 3 DB":"353",
    "6 DL, 4 LB, 1 DB":"641",
    "Other":"99"
}
# A plain `.map(dict)` yields NaN for any value missing from the dict, so the
# "Other" entry was never applied (e.g. rows whose personnelD is the '0'
# placeholder). Look values up with an explicit fallback instead.
combined_df["defensePersonnelGrouping"] = combined_df["personnelD"].map(
    lambda value: defensePersonnelGrouping.get(value, defensePersonnelGrouping["Other"])
)

In [9]:
# Code for where the player lined up: the five interior line spots get their
# own codes (1-5); every other listed alignment is bucketed under "99".
offensePositionLinedUp = {
    "LG":"1",
    "LT":"2",
    "C":"3",
    "RG":"4",
    "RT":"5",
    "TE-iR":"99",
    "TE-R":"99",
    "TE-iL":"99",
    "TE-L":"99",
    "FB-L":"99",
    "NRT":"99",
    "DRT":"99",
    "NLT":"99",
    "LWR":"99",
    "TE-oL":"99",
    "TE-oR":"99",
    "FB-R":"99",
    "Other":"99"
}
# A plain `.map(dict)` yields NaN for any alignment missing from the dict, so
# the "Other" entry was never applied. Fall back to it explicitly so unlisted
# alignments are coded "99" like the other non-line positions.
combined_df["pff_positionLinedUpCode"] = combined_df["pff_positionLinedUp"].map(
    lambda value: offensePositionLinedUp.get(value, offensePositionLinedUp["Other"])
)

In [10]:
# Numeric-string code for each PFF block-type abbreviation.
blockType = {
    "PP":"1",
    "PT":"2",
    "PA":"3",
    "SW":"4",
    "CL":"5",
    "NB":"6",
    "PR":"7",
    "UP":"8",
    "SR":"9",
    "CH":"10",
    "BH":"11",
    "Other":"99"
}
# A plain `.map(dict)` yields NaN for any value missing from the dict, so the
# "Other" entry was never applied. Fall back to it explicitly so rows with an
# unlisted or missing block type are coded "99".
combined_df["pff_blockTypeCode"] = combined_df["pff_blockType"].map(
    lambda value: blockType.get(value, blockType["Other"])
)

In [11]:
# Numeric-string code for each team abbreviation.
teams = {
    "KC":"1",
    "DET":"2",
    "TB":"3",
    "MIA":"4",
    "NYG":"5",
    "WAS":"6",
    "CAR":"7",
    "DEN":"8",
    "ATL":"9",
    "NE":"10",
    "TEN":"11",
    "LAC":"12",
    "BAL":"13",
    "NYJ":"14",
    "LV":"15",
    "JAX":"16",
    "DAL":"17",
    "IND":"18",
    "CIN":"19",
    "PHI":"20",
    "MIN":"21",
    "BUF":"22",
    "HOU":"23",
    "LA":"24",
    "CLE":"25",
    "CHI":"26",
    "PIT":"27",
    "ARI":"28",
    "SEA":"29",
    "GB":"30",
    "SF":"31",
    "NO":"32",
    "Other":"99"
}

# A plain `.map(dict)` yields NaN for any abbreviation missing from the dict,
# so the "Other" entry was never applied. Fall back to it explicitly for both
# the possession and the defensive team.
for source_column, code_column in (
    ("possessionTeam", "possessionTeamCode"),
    ("defensiveTeam", "defensiveTeamCode"),
):
    combined_df[code_column] = combined_df[source_column].map(
        lambda value: teams.get(value, teams["Other"])
    )

In [12]:
# Numeric-string code for each pass-result abbreviation.
passResult = {
    "C":"1",
    "I":"2",
    "S":"3",
    "R":"4",
    "IN":"5",
    "Other":"99"
}
# A plain `.map(dict)` yields NaN for any value missing from the dict, so the
# "Other" entry was never applied. Fall back to it explicitly so unlisted or
# missing results are coded "99".
combined_df["passResultCode"] = combined_df["passResult"].map(
    lambda value: passResult.get(value, passResult["Other"])
)

In [13]:
# Numeric-string code for each offensive formation name.
offenseFormation = {
    "SHOTGUN":"1",
    "EMPTY":"2",
    "SINGLEBACK":"3",
    "I_FORM":"4",
    "PISTOL":"5",
    "JUMBO":"6",
    "WILDCAT":"7",
    "Other":"99"
}

# A plain `.map(dict)` yields NaN for any value missing from the dict, so the
# "Other" entry was never applied. Fall back to it explicitly so unlisted or
# missing formations are coded "99".
combined_df["offenseFormationCode"] = combined_df["offenseFormation"].map(
    lambda value: offenseFormation.get(value, offenseFormation["Other"])
)

In [14]:
# Numeric-string code for the official position of the blocked player
# (the '_x'-suffixed player columns).
officialPositionDefense = {
    "DE":"1",
    "DT":"2",
    "OLB":"3",
    "NT":"4",
    "ILB":"5",
    "MLB":"6",
    "SS":"7",
    "FS":"8",
    "CB":"9",
    "G":"10",
    "RB":"11",
    "LB":"12",
    "Other":"99"
}
# A plain `.map(dict)` yields NaN for any value missing from the dict, so the
# "Other" entry was never applied. Fall back to it explicitly; note this also
# codes rows with no blocked player as "99".
combined_df["officialPosition_xCode"] = combined_df["officialPosition_x"].map(
    lambda value: officialPositionDefense.get(value, officialPositionDefense["Other"])
)

In [15]:
# Numeric-string code for each drop-back type.
dropBackType = {
    "TRADITIONAL":"1",
    "SCRAMBLE":"2",
    "DESIGNED_ROLLOUT_RIGHT":"3",
    "DESIGNED_ROLLOUT_LEFT":"4",
    "SCRAMBLE_ROLLOUT_RIGHT":"5",
    "SCRAMBLE_ROLLOUT_LEFT":"6",
    "DESIGNED_RUN":"7",
    "UNKNOWN":"8",
    "Other":"99"
}
# A plain `.map(dict)` yields NaN for any value missing from the dict, so the
# "Other" entry was never applied. Fall back to it explicitly so unlisted or
# missing drop-back types are coded "99".
combined_df["dropBackTypeCode"] = combined_df["dropBackType"].map(
    lambda value: dropBackType.get(value, dropBackType["Other"])
)

In [16]:
# Numeric-string code for each PFF pass-coverage scheme.
pff_passCoverage = {
    "Cover-3":"1",
    "Cover-1":"2",
    "Cover-2":"3",
    "Quarters":"4",
    "Cover-6":"5",
    "Red Zone":"6",
    "Cover-0":"7",
    "2-Man":"8",
    "Bracket":"9",
    "Prevent":"10",
    "Goal Line":"11",
    "Miscellaneous":"12",
    "Other":"99"
}
# A plain `.map(dict)` yields NaN for any value missing from the dict, so the
# "Other" entry was never applied. Fall back to it explicitly so unlisted or
# missing coverages are coded "99".
combined_df["pff_passCoverageCode"] = combined_df["pff_passCoverage"].map(
    lambda value: pff_passCoverage.get(value, pff_passCoverage["Other"])
)

In [17]:
# Numeric-string code per coverage type; values absent from the dict map to NaN.
pff_passCoverageType = dict(Zone="1", Man="2", Other="3")
combined_df["pff_passCoverageTypeCode"] = combined_df.pff_passCoverageType.map(
    pff_passCoverageType
)

In [18]:
# Save to CSV
# Writes the full combined frame (every row, original plus derived/code columns)
# to the project bucket.
# NOTE(review): a gs:// target presumably needs gcsfs and credentials — confirm.
combined_df.to_csv('gs://big-data-bowl/presnap-data-all.csv')

In [27]:
# Filter for offensive line positions
# Keep only rows whose player is officially a guard, center or tackle.
is_offensive_lineman = combined_df['officialPosition'].isin(['G','C','T'])
oline_df = combined_df[is_offensive_lineman]

In [28]:
# Save to CSV
# Writes the offensive-line subset (still containing the raw text columns) to
# the project bucket.
# NOTE(review): a gs:// target presumably needs gcsfs and credentials — confirm.
oline_df.to_csv('gs://big-data-bowl/presnap-data-oline-all.csv')

In [29]:
# Show the column names of the offensive-line frame.
oline_df.columns

Index(['gameId', 'playId', 'nflId', 'pff_role', 'pff_positionLinedUp',
       'pff_hit', 'pff_hurry', 'pff_sack', 'pff_beatenByDefender',
       'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed',
       'pff_nflIdBlockedPlayer', 'pff_blockType', 'pff_backFieldBlock',
       'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
       'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'gameClock',
       'preSnapHomeScore', 'preSnapVisitorScore', 'passResult', 'penaltyYards',
       'prePenaltyPlayResult', 'playResult', 'foulName1', 'foulNFLId1',
       'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
       'absoluteYardlineNumber', 'offenseFormation', 'personnelO',
       'defendersInBox', 'personnelD', 'dropBackType', 'pff_playAction',
       'pff_passCoverage', 'pff_passCoverageType', 'height', 'weight',
       'birthDate', 'collegeName', 'officialPosition', 'displayName', 'age',
       'heightCm', 'conference', 'conferenceId', 'height_x', 'weight_x',
     

In [30]:
# Remove all object columns to get an encoded file
# The list is explicit rather than dtype-based; it names the raw text columns
# and a set of other columns that are not carried into the encoded frame.
columns_to_drop = [
    'pff_role', 'pff_positionLinedUp', 'pff_blockType', 'possessionTeam',
    'defensiveTeam', 'pff_hit', 'pff_hurry', 'pff_sack',
    'playDescription', 'displayName', 'season', 'gameDate',
    'gameTimeEastern', 'passResult', 'offenseFormation', 'personnelO',
    'personnelD', 'dropBackType', 'pff_passCoverage', 'pff_passCoverageType',
    'conference', 'officialPosition_x', 'conference_x',
    'homeTeamAbbr', 'visitorTeamAbbr', 'officialPosition',
    'foulName1', 'foulNFLId1', 'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
    'pff_backFieldBlock', 'height', 'birthDate', 'gameClock',
    'collegeName', 'yardlineSide',
    'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed',
    'pff_beatenByDefender', 'gameClockInside2Minutes', 'playerCommittedFoul',
    'height_x', 'displayName_x', 'birthDate_x', 'collegeName_x',
]
oline_df_encoded = oline_df.drop(columns=columns_to_drop)

In [31]:
# Show the columns that remain after dropping the listed columns.
oline_df_encoded.columns

Index(['gameId', 'playId', 'nflId', 'pff_nflIdBlockedPlayer', 'quarter',
       'down', 'yardsToGo', 'yardlineNumber', 'preSnapHomeScore',
       'preSnapVisitorScore', 'penaltyYards', 'prePenaltyPlayResult',
       'playResult', 'absoluteYardlineNumber', 'defendersInBox',
       'pff_playAction', 'weight', 'age', 'heightCm', 'conferenceId',
       'weight_x', 'age_x', 'heightCm_x', 'conferenceId_x', 'week',
       'posTeamIsHomeTeam', 'posTeamOwnSideline', 'inTrouble',
       'gameClockInside2MinutesQ2', 'gameClockInside2MinutesQ4', 'inOvertime',
       'offensePersonnelGrouping', 'defensePersonnelGrouping',
       'pff_positionLinedUpCode', 'pff_blockTypeCode', 'possessionTeamCode',
       'defensiveTeamCode', 'passResultCode', 'offenseFormationCode',
       'officialPosition_xCode', 'dropBackTypeCode', 'pff_passCoverageCode',
       'pff_passCoverageTypeCode'],
      dtype='object')

In [32]:
# Convert all values to numeric
# Each column is parsed independently; a value that cannot be parsed raises.
oline_df_encoded = oline_df_encoded.apply(lambda column: pd.to_numeric(column))

In [33]:
# Save to CSV
# Writes the numeric-only offensive-line frame to the project bucket.
# NOTE(review): a gs:// target presumably needs gcsfs and credentials — confirm.
oline_df_encoded.to_csv('gs://big-data-bowl/presnap-data-oline-encoded.csv')