In [1]:
# Import dependencies
import pandas as pd
import math
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read data
# Every input is a CSV fetched over HTTPS from the big-data-bowl storage bucket.
plays_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/plays.csv')
players_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/players.csv')
pffscouting_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/pffScoutingData.csv')
games_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/games.csv')
ol_players_pff_supplemental_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/ol_players_pff_supplemental_data.csv')

# Overlay the supplemental rows onto the roster, aligned on nflId (update() only
# overwrites with non-NA values).
# players_df deliberately stays indexed by nflId afterwards: the later merges refer
# to 'nflId' as an index level. The previous bare `players_df.reset_index()` call
# discarded its result and therefore never changed anything, so it is removed.
players_df = players_df.set_index('nflId')
players_df.update(ol_players_pff_supplemental_df.set_index('nflId'))

# Parse birth dates. Literal 'NA' strings are blanked so they become NaT.
# The replacement is assigned back rather than done with a chained
# `inplace=True`, which does not modify the frame under pandas Copy-on-Write.
players_df['birthDate'] = pd.to_datetime(players_df['birthDate'].replace('NA', ''))

# Age is the calendar-year difference to the year in which the notebook is run;
# players without a birth date get age 0.
# NOTE(review): this makes the feature depend on the run date rather than on the
# season being analysed - confirm that is intended.
current_year = dt.date.today().year
players_df['age'] = players_df['birthDate'].map(lambda born: current_year - born.year)
players_df['age'] = players_df['age'].fillna(0)


def _height_to_cm(height):
    """Convert a 'feet-inches' string (e.g. '6-4') to centimetres."""
    parts = height.split('-')
    return int(parts[0]) * 30.48 + int(parts[1]) * 2.54


players_df['heightCm'] = players_df['height'].map(_height_to_cm)

# Conference code -> list of college names (as spelled in players.csv `collegeName`).
# Codes are strings; colleges that appear in none of the lists are coded "99"
# (Other / Non-FBS) by the lookup that consumes this map.
conference_map = {
    # 1 = ACC
    "1": [
        "Wake Forest",
        "Virginia Tech",
        "Virginia",
        "Syracuse",
        "Pittsburgh",
        "North Carolina State",
        "North Carolina",
        "Miami",
        "Louisville",
        "Georgia Tech",
        "Florida State",
        "Duke",
        "Clemson",
        "Boston College",
    ],
    # 2 = American
    "2": [
        "Tulsa",
        "Tulane",
        "Temple",
        "Southern Methodist",
        "South Florida",
        "Navy",
        "Memphis",
        "Houston",
        "East Carolina",
        "Cincinnati",
        "Central Florida",
    ],
    # 3 = Big 12
    "3": [
        "West Virginia",
        "Texas Tech",
        "Texas Christian",
        "Texas",
        "Oklahoma State",
        "Oklahoma",
        "Kansas State",
        "Kansas",
        "Iowa State",
        "Baylor",
    ],
    # 4 = Big Ten
    "4": [
        "Wisconsin",
        "Rutgers",
        "Purdue",
        "Penn State",
        "Ohio State",
        "Northwestern",
        "Nebraska",
        "Minnesota",
        "Michigan State",
        "Michigan",
        "Maryland",
        "Iowa",
        "Indiana",
        "Illinois",
    ],
    # 5 = C-USA
    "5": [
        "Western Kentucky",
        "Texas-El Paso",
        "Southern Mississippi",
        "Old Dominion",
        "Louisiana Tech",
        "Florida International",
        "Florida Atlantic",
        "Texas-San Antonio",
        "Rice",
        "North Texas",
        "North Carolina-Charlotte",
        "Middle Tennessee",
        "Marshall",
        "Alabama-Birmingham",
    ],
    # 6 = Independent
    "6": [
        "Notre Dame",
        "Connecticut",
        "Brigham Young",
        "Massachusetts",
        "Liberty",
        "Army",
    ],
    # 7 = MAC
    "7": [
        "Western Michigan",
        "Toledo",
        "Northern Illinois",
        "Central Michigan",
        "Buffalo",
        "Ohio",
        "Miami, O.",
        "Kent State",
        "Eastern Michigan",
        "Bowling Green",
        "Ball State",
        "Akron",
    ],
    # 8 = Mountain West
    "8": [
        "Wyoming",
        "Utah State",
        "San Jose State",
        "Fresno State",
        "Colorado State",
        "Boise State",
        "San Diego State",
        "Nevada-Las Vegas",
        "Nevada",
        "Hawaii",
        "Air Force",
        "New Mexico",
    ],
    # 9 = Pac-12
    "9": [
        "Colorado",
        "Washington",
        "Utah",
        "UCLA",
        "Stanford",
        "Southern California",
        "Oregon State",
        "Oregon",
        "California",
        "Arizona State",
        "Arizona",
        "Washington State",
    ],
    # 10 = SBC (Sun Belt)
    "10": [
        "Louisiana-Lafayette",
        "Troy",
        "Texas State",
        "South Alabama",
        "Louisiana-Monroe",
        "Georgia State",
        "Georgia Southern",
        "Coastal Carolina",
        "Arkansas State",
        "Appalachian State",
        "James Madison",
    ],
    # 11 = SEC
    "11": [
        "Vanderbilt",
        # Plain ampersand: the HTML-escaped form "Texas A&amp;M" can never equal
        # the college name and silently pushed those players into "99".
        "Texas A&M",
        "Tennessee",
        "South Carolina",
        "Missouri",
        "Mississippi State",
        "Mississippi",
        "Louisiana State",
        "Kentucky",
        "Georgia",
        "Florida",
        "Auburn",
        "Arkansas",
        "Alabama",
    ],
    # 99 = Other / Non-FBS (implicit: any college not listed above)
}

# Tag every player with the code of the conference that lists their college.
# The map is inverted once (college -> code) so each row is a single dictionary
# lookup instead of a scan over every conference list. setdefault keeps the
# first conference that lists a college, matching a first-match scan.
college_to_conference = {}
for conference_code, colleges in conference_map.items():
    for college in colleges:
        college_to_conference.setdefault(college, conference_code)

# Colleges that are missing or not listed in conference_map map to NaN and are
# then coded "99". No placeholder column is created first: it would be
# overwritten immediately.
players_df["playersFBSConference"] = (
    players_df["collegeName"]
    .map(college_to_conference)
    .fillna("99")
)


# Report the (rows, columns) shape of each loaded table.
print("Plays: "  + str(plays_df.shape))
print("Players: "  + str(players_df.shape))
print("Scouting: "  + str(pffscouting_df.shape))
print("Games: "  + str(games_df.shape))

Plays: (8557, 32)
Players: (1679, 9)
Scouting: (188254, 15)
Games: (122, 7)


In [3]:
# Assemble one wide frame by left-joining onto the PFF scouting rows:
#   1. play context (gameId + playId),
#   2. the player's own roster attributes (nflId),
#   3. the roster attributes of the player they blocked (columns suffixed "_x"),
#   4. game metadata (gameId).
combined_df = (
    pffscouting_df
    .merge(plays_df, how='left', left_on=['gameId', 'playId'], right_on=['gameId', 'playId'])
    .merge(players_df, how='left', left_on=['nflId'], right_on=['nflId'])
    .merge(players_df, how='left', left_on=['pff_nflIdBlockedPlayer'], right_on=['nflId'], suffixes=(None, '_x'))
    .merge(games_df, how='left', left_on=['gameId'], right_on=['gameId'])
)


def _as_flag(mask):
    """Turn a boolean mask into a 0/1 integer column."""
    return mask.astype('int64')


# Possession team is the home team / ball is on the possession team's own side.
combined_df['posTeamIsHomeTeam'] = _as_flag(combined_df['possessionTeam'] == combined_df['homeTeamAbbr'])
combined_df['posTeamOwnSideline'] = _as_flag(combined_df['possessionTeam'] == combined_df['yardlineSide'])

# The row's player is named in any of the three foul slots of the play.
own_id = combined_df['nflId']
named_in_foul = (
    (combined_df['foulNFLId1'] == own_id)
    | (combined_df['foulNFLId2'] == own_id)
    | (combined_df['foulNFLId3'] == own_id)
)
combined_df['playerCommittedFoul'] = _as_flag(named_in_foul)

# inTrouble: the player allowed a hit, hurry or sack, was beaten, or fouled.
trouble_signals = [
    'pff_hitAllowed',
    'pff_hurryAllowed',
    'pff_sackAllowed',
    'pff_beatenByDefender',
    'playerCommittedFoul',
]
combined_df['inTrouble'] = _as_flag(combined_df[trouble_signals].eq(1).any(axis=1))

# Boolean helper: the minutes part of the "MM:SS" game clock is below 2.
combined_df['gameClockInside2Minutes'] = combined_df['gameClock'].map(
    lambda clock: int(clock.split(':')[0]) < 2
)

# Late-clock flags for the end of each half, plus an overtime flag (quarter 5).
late_clock = combined_df['gameClockInside2Minutes']
combined_df['gameClockInside2MinutesQ2'] = _as_flag(late_clock & (combined_df['quarter'] == 2))
combined_df['gameClockInside2MinutesQ4'] = _as_flag(late_clock & (combined_df['quarter'] == 4))
combined_df['inOvertime'] = _as_flag(combined_df['quarter'] == 5)

# Fill gaps column by column: numeric-kind columns get 0, everything else the string '0'.
combined_df = combined_df.apply(
    lambda column: column.fillna(0) if column.dtype.kind in 'biufc' else column.fillna('0')
)

print(f'dataframe size: {combined_df.shape}')
combined_df

dataframe size: (188254, 77)


Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,...,homeTeamAbbr,visitorTeamAbbr,posTeamIsHomeTeam,posTeamOwnSideline,playerCommittedFoul,inTrouble,gameClockInside2Minutes,gameClockInside2MinutesQ2,gameClockInside2MinutesQ4,inOvertime
0,2021090900,97,25511,Pass,QB,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
1,2021090900,97,35481,Pass Route,TE-L,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
2,2021090900,97,35634,Pass Route,LWR,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
3,2021090900,97,39985,Pass Route,HB-R,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
4,2021090900,97,40151,Pass Block,C,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188249,2021110100,4433,52507,Pass Block,LT,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,1,True,0,1,0
188250,2021110100,4433,52546,Coverage,SCBoR,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0
188251,2021110100,4433,52573,Pass Route,SLoWR,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0
188252,2021110100,4433,52585,Pass Rush,LEO,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0


In [4]:
# List every column of the merged frame (blocked-player attributes carry the "_x" suffix).
combined_df.columns

Index(['gameId', 'playId', 'nflId', 'pff_role', 'pff_positionLinedUp',
       'pff_hit', 'pff_hurry', 'pff_sack', 'pff_beatenByDefender',
       'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed',
       'pff_nflIdBlockedPlayer', 'pff_blockType', 'pff_backFieldBlock',
       'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
       'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'gameClock',
       'preSnapHomeScore', 'preSnapVisitorScore', 'passResult', 'penaltyYards',
       'prePenaltyPlayResult', 'playResult', 'foulName1', 'foulNFLId1',
       'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
       'absoluteYardlineNumber', 'offenseFormation', 'personnelO',
       'defendersInBox', 'personnelD', 'dropBackType', 'pff_playAction',
       'pff_passCoverage', 'pff_passCoverageType', 'height', 'weight',
       'birthDate', 'collegeName', 'officialPosition', 'displayName', 'age',
       'heightCm', 'playersFBSConference', 'height_x', 'weight_x',
       'bir

In [5]:
# Filter for offensive line positions
# Rows are selected by the player's roster position (officialPosition), not by
# where they lined up on the play (pff_positionLinedUp).
OLINE_POSITIONS = ['G', 'C', 'T']
is_oline_player = combined_df['officialPosition'].isin(OLINE_POSITIONS)
oline_df = combined_df[is_oline_player]
print(f"Offensive Line Plays: {oline_df.shape}")

Offensive Line Plays: (42886, 77)


In [6]:
# Shape the dataframe so predictions can be made
# Columns removed before modelling, grouped by where they come from.
# NOTE: drop() raises KeyError if this cell is re-run on the already-trimmed frame.
columns_to_drop = [
    # identifiers and free text
    'gameId', 'playId', 'playDescription', 'displayName', 'displayName_x',
    # game metadata
    'season', 'gameDate', 'gameTimeEastern', 'homeTeamAbbr', 'visitorTeamAbbr',
    # raw player / play fields
    'officialPosition', 'height', 'height_x', 'birthDate', 'birthDate_x',
    'collegeName', 'collegeName_x', 'gameClock', 'yardlineSide', 'gameClockInside2Minutes',
    # PFF fields not used as features
    'pff_role', 'pff_hit', 'pff_hurry', 'pff_sack', 'pff_backFieldBlock',
    # signals already folded into inTrouble
    'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed', 'pff_beatenByDefender',
    'playerCommittedFoul',
    # foul details
    'foulName1', 'foulNFLId1', 'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
]
oline_df = oline_df.drop(columns=columns_to_drop)

In [7]:
# Count how often each lined-up alignment occurs, to decide how to encode it.
oline_df['pff_positionLinedUp'].value_counts()

C        8557
LG       8557
RG       8557
RT       8555
LT       8553
TE-iR      37
TE-R       23
TE-iL      21
TE-L       15
FB-L        3
NRT         2
DRT         1
NLT         1
LWR         1
TE-oL       1
TE-oR       1
FB-R        1
Name: pff_positionLinedUp, dtype: int64

In [8]:
# Encode the lined-up alignment as string codes: the five core line spots get
# "1".."5", every other listed alignment is pooled under "99".
_core_line_spots = {"LG": "1", "LT": "2", "C": "3", "RG": "4", "RT": "5"}
_pooled_alignments = [
    "TE-iR", "TE-R", "TE-iL", "TE-L",
    "FB-L", "NRT", "DRT", "NLT",
    "LWR", "TE-oL", "TE-oR", "FB-R",
    "Other",
]
offensePosition = {
    "pff_positionLinedUp": {**_core_line_spots, **dict.fromkeys(_pooled_alignments, "99")}
}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(offensePosition)


In [9]:
# Count how often each PFF block type occurs, to decide how to encode it.
oline_df['pff_blockType'].value_counts()

PP    24286
PT     5807
PA     5696
SW     3010
CL     2432
NB     1072
PR      249
UP      217
SR       86
0        25
BH        4
CH        2
Name: pff_blockType, dtype: int64

In [10]:
# Encode block types as string codes "1".."11" in the order listed below;
# "Other" is kept as a catch-all key coded "99".
_block_type_order = ["PP", "PT", "PA", "SW", "CL", "NB", "PR", "UP", "SR", "CH", "BH"]
_block_type_codes = {label: str(code) for code, label in enumerate(_block_type_order, start=1)}
_block_type_codes["Other"] = "99"
blockType = {"pff_blockType": _block_type_codes}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(blockType)

In [11]:
# Count how often each offensive personnel grouping occurs, to decide how to encode it.
oline_df['personnelO'].value_counts()

1 RB, 1 TE, 3 WR                28751
1 RB, 2 TE, 2 WR                 7505
2 RB, 1 TE, 2 WR                 2187
1 RB, 3 TE, 1 WR                 1135
1 RB, 0 TE, 4 WR                  920
2 RB, 2 TE, 1 WR                  550
0 RB, 1 TE, 4 WR                  445
2 RB, 0 TE, 3 WR                  330
6 OL, 1 RB, 1 TE, 2 WR            247
0 RB, 2 TE, 3 WR                  140
6 OL, 1 RB, 2 TE, 1 WR            120
0 RB, 0 TE, 5 WR                   70
6 OL, 2 RB, 2 TE, 0 WR             66
2 RB, 3 TE, 0 WR                   65
2 QB, 1 RB, 1 TE, 2 WR             60
6 OL, 1 RB, 3 TE, 0 WR             48
6 OL, 2 RB, 0 TE, 2 WR             30
6 OL, 2 RB, 1 TE, 1 WR             30
2 QB, 1 RB, 0 TE, 3 WR             25
0 RB, 3 TE, 2 WR                   20
2 QB, 1 RB, 2 TE, 1 WR             20
2 QB, 2 RB, 0 TE, 2 WR             20
6 OL, 1 RB, 0 TE, 3 WR             18
3 RB, 0 TE, 2 WR                   15
1 RB, 1 TE, 2 WR,1 LB              15
2 QB, 2 RB, 1 TE, 1 WR             15
2 QB, 6 OL, 

In [12]:
# Encode offensive personnel groupings as five-digit string codes.
# Reading the table below, the digits look like QB, RB, TE, WR, OL counts, with
# QB defaulting to 1 and OL to 5 when a label does not mention them.
# NOTE(review): "1 RB, 1 TE, 2 WR,1 LB" shares code "11126" with
# "6 OL, 1 RB, 1 TE, 2 WR" - confirm that merging the two is intended.
_personnel_o_codes = {
    # standard groupings (no QB / OL count in the label)
    "1 RB, 1 TE, 3 WR": "11135",
    "1 RB, 2 TE, 2 WR": "11225",
    "2 RB, 1 TE, 2 WR": "12125",
    "1 RB, 3 TE, 1 WR": "11315",
    "1 RB, 0 TE, 4 WR": "11045",
    "2 RB, 2 TE, 1 WR": "12215",
    "0 RB, 1 TE, 4 WR": "10145",
    "2 RB, 0 TE, 3 WR": "12035",
    "0 RB, 2 TE, 3 WR": "10235",
    "0 RB, 0 TE, 5 WR": "10055",
    "2 RB, 3 TE, 0 WR": "12305",
    "0 RB, 3 TE, 2 WR": "10325",
    "3 RB, 0 TE, 2 WR": "13025",
    "1 RB, 4 TE, 0 WR": "11405",
    # extra offensive linemen
    "6 OL, 1 RB, 1 TE, 2 WR": "11126",
    "6 OL, 1 RB, 2 TE, 1 WR": "11216",
    "6 OL, 2 RB, 2 TE, 0 WR": "12206",
    "6 OL, 1 RB, 3 TE, 0 WR": "11306",
    "6 OL, 2 RB, 1 TE, 1 WR": "12116",
    "6 OL, 2 RB, 0 TE, 2 WR": "12026",
    "6 OL, 1 RB, 0 TE, 3 WR": "11036",
    "7 OL, 1 RB, 0 TE, 2 WR": "11027",
    # two quarterbacks
    "2 QB, 1 RB, 1 TE, 2 WR": "21125",
    "2 QB, 1 RB, 0 TE, 3 WR": "21035",
    "2 QB, 2 RB, 0 TE, 2 WR": "22025",
    "2 QB, 1 RB, 2 TE, 1 WR": "21215",
    "2 QB, 2 RB, 1 TE, 1 WR": "22115",
    "2 QB, 6 OL, 1 RB, 1 TE, 1 WR": "21116",
    "2 QB, 1 RB, 3 TE, 0 WR": "21305",
    # defender on offense
    "1 RB, 1 TE, 2 WR,1 LB": "11126",
    # catch-all
    "Other": "99",
}
offensePersonnelGrouping = {"personnelO": _personnel_o_codes}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(offensePersonnelGrouping)

In [13]:
# Count how often each defensive personnel grouping occurs, to decide how to encode it.
oline_df['personnelD'].value_counts()

4 DL, 2 LB, 5 DB    12108
2 DL, 4 LB, 5 DB     7708
3 DL, 3 LB, 5 DB     5748
2 DL, 3 LB, 6 DB     3996
4 DL, 3 LB, 4 DB     3443
3 DL, 4 LB, 4 DB     2801
4 DL, 1 LB, 6 DB     2295
3 DL, 2 LB, 6 DB     1940
1 DL, 4 LB, 6 DB     1045
1 DL, 5 LB, 5 DB      630
5 DL, 2 LB, 4 DB      337
5 DL, 1 LB, 5 DB      147
2 DL, 2 LB, 7 DB      135
4 DL, 4 LB, 3 DB      120
2 DL, 5 LB, 4 DB       95
1 DL, 3 LB, 7 DB       95
3 DL, 1 LB, 7 DB       50
5 DL, 3 LB, 3 DB       34
6 DL, 3 LB, 2 DB       22
6 DL, 2 LB, 3 DB       21
0 DL, 3 LB, 8 DB       20
5 DL, 5 LB, 1 DB       18
4 DL, 5 LB, 2 DB       16
4 DL, 6 LB, 1 DB       16
0 DL, 5 LB, 6 DB       10
6 DL, 1 LB, 4 DB       10
1 DL, 2 LB, 8 DB       10
3 DL, 5 LB, 3 DB        6
0                       5
6 DL, 4 LB, 1 DB        5
Name: personnelD, dtype: int64

In [14]:
# Encode defensive personnel groupings: each label's code is its three counts
# (DL, LB, DB) written next to each other, e.g. "4 DL, 2 LB, 5 DB" -> "425".
# Codes starting with "0" (e.g. "038") lose the leading zero once the column is
# later converted to numbers.
_personnel_d_labels = [
    "4 DL, 2 LB, 5 DB", "2 DL, 4 LB, 5 DB", "3 DL, 3 LB, 5 DB", "2 DL, 3 LB, 6 DB",
    "4 DL, 3 LB, 4 DB", "3 DL, 4 LB, 4 DB", "4 DL, 1 LB, 6 DB", "3 DL, 2 LB, 6 DB",
    "1 DL, 4 LB, 6 DB", "1 DL, 5 LB, 5 DB", "5 DL, 2 LB, 4 DB", "5 DL, 1 LB, 5 DB",
    "2 DL, 2 LB, 7 DB", "4 DL, 4 LB, 3 DB", "1 DL, 3 LB, 7 DB", "2 DL, 5 LB, 4 DB",
    "3 DL, 1 LB, 7 DB", "5 DL, 3 LB, 3 DB", "0 DL, 3 LB, 8 DB", "6 DL, 3 LB, 2 DB",
    "6 DL, 2 LB, 3 DB", "4 DL, 6 LB, 1 DB", "5 DL, 5 LB, 1 DB", "4 DL, 5 LB, 2 DB",
    "6 DL, 1 LB, 4 DB", "0 DL, 5 LB, 6 DB", "1 DL, 2 LB, 8 DB", "3 DL, 5 LB, 3 DB",
    "6 DL, 4 LB, 1 DB",
]
_personnel_d_codes = {
    label: "".join(character for character in label if character.isdigit())
    for label in _personnel_d_labels
}
_personnel_d_codes["Other"] = "99"
defensePersonnelGrouping = {"personnelD": _personnel_d_codes}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(defensePersonnelGrouping)

In [15]:
# List the columns that remain after trimming and encoding so far.
oline_df.columns

Index(['nflId', 'pff_positionLinedUp', 'pff_nflIdBlockedPlayer',
       'pff_blockType', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
       'defensiveTeam', 'yardlineNumber', 'preSnapHomeScore',
       'preSnapVisitorScore', 'passResult', 'penaltyYards',
       'prePenaltyPlayResult', 'playResult', 'absoluteYardlineNumber',
       'offenseFormation', 'personnelO', 'defendersInBox', 'personnelD',
       'dropBackType', 'pff_playAction', 'pff_passCoverage',
       'pff_passCoverageType', 'weight', 'age', 'heightCm',
       'playersFBSConference', 'weight_x', 'officialPosition_x', 'age_x',
       'heightCm_x', 'playersFBSConference_x', 'week', 'posTeamIsHomeTeam',
       'posTeamOwnSideline', 'inTrouble', 'gameClockInside2MinutesQ2',
       'gameClockInside2MinutesQ4', 'inOvertime'],
      dtype='object')

In [16]:
# Count rows per possession team, to decide how to encode team abbreviations.
oline_df['possessionTeam'].value_counts()

KC     1697
DET    1646
TB     1557
MIA    1545
NYG    1496
WAS    1492
CAR    1471
DEN    1460
ATL    1448
NE     1436
TEN    1387
LAC    1380
BAL    1361
NYJ    1355
JAX    1320
LV     1320
DAL    1313
CIN    1294
IND    1292
PHI    1276
BUF    1274
MIN    1271
HOU    1255
LA     1255
CHI    1248
CLE    1245
PIT    1190
ARI    1177
SEA    1165
GB     1130
SF     1095
NO     1035
Name: possessionTeam, dtype: int64

In [17]:
# Encode team abbreviations as string codes "1".."32" in the order listed below;
# "Other" is kept as a catch-all key coded "99". The same table is used for the
# team on offense and the team on defense, so one team always gets one code.
_team_order = [
    "KC", "DET", "TB", "MIA", "NYG", "WAS", "CAR", "DEN",
    "ATL", "NE", "TEN", "LAC", "BAL", "NYJ", "LV", "JAX",
    "DAL", "IND", "CIN", "PHI", "MIN", "BUF", "HOU", "LA",
    "CLE", "CHI", "PIT", "ARI", "SEA", "GB", "SF", "NO",
]
_team_codes = {abbr: str(code) for code, abbr in enumerate(_team_order, start=1)}
_team_codes["Other"] = "99"

# Each mapping gets its own copy of the table so the two stay independent objects.
possessionTeams = {"possessionTeam": dict(_team_codes)}
defensiveTeams = {"defensiveTeam": dict(_team_codes)}

# A nested {column: {old: new}} mapping restricts each replacement to that column.
for _team_mapping in (possessionTeams, defensiveTeams):
    oline_df = oline_df.replace(_team_mapping)

In [18]:
# Count how often each pass result code occurs, to decide how to encode it.
oline_df['passResult'].value_counts()

C     23156
I     13805
S      2719
R      2251
IN      955
Name: passResult, dtype: int64

In [19]:
# Encode pass results as string codes "1".."5" in the order listed below;
# "Other" is kept as a catch-all key coded "99".
_pass_result_order = ["C", "I", "S", "R", "IN"]
_pass_result_codes = {label: str(code) for code, label in enumerate(_pass_result_order, start=1)}
_pass_result_codes["Other"] = "99"
passResult = {"passResult": _pass_result_codes}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(passResult)

In [20]:
# Count how often each offensive formation occurs, to decide how to encode it.
oline_df['offenseFormation'].value_counts()

SHOTGUN       27421
EMPTY          6981
SINGLEBACK     5999
I_FORM         1500
PISTOL          775
JUMBO           165
0                35
WILDCAT          10
Name: offenseFormation, dtype: int64

In [21]:
# Encode offensive formations as string codes "1".."7" in the order listed below;
# "Other" is kept as a catch-all key coded "99".
_formation_order = ["SHOTGUN", "EMPTY", "SINGLEBACK", "I_FORM", "PISTOL", "JUMBO", "WILDCAT"]
_formation_codes = {label: str(code) for code, label in enumerate(_formation_order, start=1)}
_formation_codes["Other"] = "99"
offenseFormation = {"offenseFormation": _formation_codes}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(offenseFormation)

In [22]:
# Count the roster positions of the blocked players ("_x" columns), to decide how to encode them.
oline_df['officialPosition_x'].value_counts()

DE     14622
DT     14470
OLB     7144
NT      4360
0       1108
ILB      704
MLB      183
SS       118
FS        85
CB        81
G          7
RB         2
LB         2
Name: officialPosition_x, dtype: int64

In [23]:
# Encode the blocked player's roster position as string codes "1".."12" in the
# order listed below; "Other" is kept as a catch-all key coded "99".
_blocked_position_order = [
    "DE", "DT", "OLB", "NT", "ILB", "MLB",
    "SS", "FS", "CB", "G", "RB", "LB",
]
_blocked_position_codes = {
    label: str(code) for code, label in enumerate(_blocked_position_order, start=1)
}
_blocked_position_codes["Other"] = "99"
officialPositionDefense = {"officialPosition_x": _blocked_position_codes}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(officialPositionDefense)

In [24]:
# Count how often each dropback type occurs, to decide how to encode it.
oline_df['dropBackType'].value_counts()

TRADITIONAL               32780
SCRAMBLE                   4499
0                          2647
DESIGNED_ROLLOUT_RIGHT     1434
DESIGNED_ROLLOUT_LEFT       750
SCRAMBLE_ROLLOUT_RIGHT      629
SCRAMBLE_ROLLOUT_LEFT       116
DESIGNED_RUN                 26
UNKNOWN                       5
Name: dropBackType, dtype: int64

In [25]:
# Encode dropback types as string codes "1".."8" in the order listed below;
# "Other" is kept as a catch-all key coded "99".
_drop_back_order = [
    "TRADITIONAL",
    "SCRAMBLE",
    "DESIGNED_ROLLOUT_RIGHT",
    "DESIGNED_ROLLOUT_LEFT",
    "SCRAMBLE_ROLLOUT_RIGHT",
    "SCRAMBLE_ROLLOUT_LEFT",
    "DESIGNED_RUN",
    "UNKNOWN",
]
_drop_back_codes = {label: str(code) for code, label in enumerate(_drop_back_order, start=1)}
_drop_back_codes["Other"] = "99"
dropBackType = {"dropBackType": _drop_back_codes}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(dropBackType)

In [26]:
# Count how often each PFF pass coverage occurs, to decide how to encode it.
oline_df['pff_passCoverage'].value_counts()

Cover-3          13363
Cover-1          10072
Cover-2           5429
Quarters          5172
Cover-6           4032
Red Zone          1885
Cover-0           1358
2-Man             1000
Bracket            235
Prevent            160
Goal Line          140
Miscellaneous       40
Name: pff_passCoverage, dtype: int64

In [27]:
# Encode pass coverages as string codes "1".."12" in the order listed below;
# "Other" is kept as a catch-all key coded "99".
_coverage_order = [
    "Cover-3", "Cover-1", "Cover-2", "Quarters",
    "Cover-6", "Red Zone", "Cover-0", "2-Man",
    "Bracket", "Prevent", "Goal Line", "Miscellaneous",
]
_coverage_codes = {label: str(code) for code, label in enumerate(_coverage_order, start=1)}
_coverage_codes["Other"] = "99"
pff_passCoverage = {"pff_passCoverage": _coverage_codes}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(pff_passCoverage)

In [28]:
# Count how often each PFF coverage type occurs, to decide how to encode it.
oline_df['pff_passCoverageType'].value_counts()

Zone     27996
Man      12430
Other     2460
Name: pff_passCoverageType, dtype: int64

In [29]:
# Encode coverage types as string codes "1".."3". Here "Other" is a real
# category of the column and takes the next code ("3"), not "99".
_coverage_type_order = ["Zone", "Man", "Other"]
pff_passCoverageType = {
    "pff_passCoverageType": {
        label: str(code) for code, label in enumerate(_coverage_type_order, start=1)
    }
}
# A nested {column: {old: new}} mapping restricts the replacement to that column.
oline_df = oline_df.replace(pff_passCoverageType)

In [30]:
# Convert all values to numeric
# Each column is parsed on its own; pd.to_numeric raises if any column still
# holds a value that is not a number (for example a category without a code).
oline_df = oline_df.apply(lambda column: pd.to_numeric(column))

In [31]:
# Save to CSV
# Destination object in the big-data-bowl bucket; the frame's index is written
# as the first, unnamed column (pandas default).
OLINE_OUTPUT_URI = 'gs://big-data-bowl/play_analysis_oline_17.csv'
oline_df.to_csv(OLINE_OUTPUT_URI)