In [1]:
# Import dependencies
import pandas as pd
import math
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read data
plays_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/plays.csv')
players_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/players.csv')
pffscouting_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/pffScoutingData.csv')
games_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/games.csv')
ol_players_pff_supplemental_df = pd.read_csv('https://storage.googleapis.com/big-data-bowl/ol_players_pff_supplemental_data.csv')

players_df.set_index('nflId', inplace=True)
players_df.update(ol_players_pff_supplemental_df.set_index('nflId'))
players_df.reset_index()

players_df['birthDate'].replace('NA','', inplace=True)
players_df['birthDate'] = pd.to_datetime(players_df['birthDate'])
players_df['age'] = players_df['birthDate'].map(lambda x: dt.date.today().year-x.year)
players_df['age'] = players_df['age'].fillna(0)
players_df['heightCm'] = players_df['height'].map(lambda x: int(x.split('-')[0])*30.48+int(x.split('-')[1])*2.54)

conference_map = {
    # 0 = None
    # 1 = ACC
    "ACC": [
            "Wake Forest",
            "Virginia Tech",
            "Virginia",
            "Syracuse",
            "Pittsburgh",
            "North Carolina State",
            "North Carolina",
            "Miami",
            "Louisville",
            "Georgia Tech",
            "Florida State",
            "Duke",
            "Clemson",
            "Boston College"
    ],
    # 2 = American
    "American": [
            "Tulsa",
            "Tulane",
            "Temple",
            "Southern Methodist",
            "South Florida",
            "Navy",
            "Memphis",
            "Houston",
            "East Carolina",
            "Cincinnati",
            "Central Florida",
    ],
    # 3 = Big12
    "Big 12": [
            "West Virginia",
            "Texas Tech",
            "Texas Christian",
            "Texas",
            "Oklahoma State",
            "Oklahoma",
            "Kansas State",
            "Kansas",
            "Iowa State",
            "Baylor"
    ],
    # 4 = Big10
    "Big 10": [
            "Wisconsin",
            "Rutgers",
            "Purdue",
            "Penn State",
            "Ohio State",
            "Northwestern",
            "Nebraska",
            "Minnesota",
            "Michigan State",
            "Michigan",
            "Maryland",
            "Iowa",
            "Indiana",
            "Illinois",   
    ],
    # 5 = C-USA
    "C-USA": [
            "Western Kentucky",
            "Texas-El Paso",
            "Southern Mississippi",
            "Old Dominion",
            "Louisiana Tech",
            "Florida International",
            "Florida Atlantic",
            "Texas-San Antonio",
            "Rice",
            "North Texas",
            "North Carolina-Charlotte",
            "Middle Tennessee",
            "Marshall",
            "Alabama-Birmingham"
    ],
    # 6 = Independent
    "Independent": [
            "Notre Dame",
            "Connecticut",
            "Brigham Young",
            "Massachusetts",
            "Liberty",
            "Army"
    ],
    # 7 = MAC
    "MAC": [
            "Western Michigan",
            "Toledo",
            "Northern Illinois",
            "Central Michigan",
            "Buffalo",
            "Ohio",
            "Miami, O.",
            "Kent State",
            "Eastern Michigan",
            "Bowling Green",
            "Ball State",
            "Akron"
    ],
    # 8 = Mountain West
    "Mountain West": [
            "Wyoming",
            "Utah State",
            "San Jose State",
            "Fresno State",
            "Colorado State",
            "Boise State",
            "San Diego State",
            "Nevada-Las Vegas",
            "Nevada",
            "Hawaii",
            "Air Force",
            "New Mexico"
    ],
    # 9 = Pac-12
    "Pac-12": [
            "Colorado",
            "Washington",
            "Utah",
            "UCLA",
            "Stanford",
            "Southern California",
            "Oregon State",
            "Oregon",
            "California",
            "Arizona State",
            "Arizona",
            "Washington State"
    ],
    # 10 = SBC (Sun Belt)
    "SBC": [
            "Louisiana-Lafayette",
            "Troy",
            "Texas State",
            "South Alabama",
            "Louisiana-Monroe",
            "Georgia State",
            "Georgia Southern",
            "Coastal Carolina",
            "Arkansas State",
            "Appalachian State",
            "James Madison"
    ],
    # 11 = SEC
    "SEC": [
            "Vanderbilt",
            "Texas A&amp;M",
            "Tennessee",
            "South Carolina",
            "Missouri",
            "Mississippi State",
            "Mississippi",
            "Louisiana State",
            "Kentucky",
            "Georgia",
            "Florida",
            "Auburn",
            "Arkansas",
            "Alabama"       
    ]
    # 99 = Other Non-FBS

}

players_df["playersFBSConference"] = "Other"
players_df["playersFBSConference"] = (
    players_df["collegeName"]
    .apply(lambda x: [k for k in conference_map.keys() if x in conference_map[k]])
    .str[0]
)

players_df.to_csv('gs://big-data-bowl/players_supplemented.csv')

print("Plays: "  + str(plays_df.shape))
print("Players: "  + str(players_df.shape))
print("Scouting: "  + str(pffscouting_df.shape))
print("Games: "  + str(games_df.shape))

Plays: (8557, 32)
Players: (1679, 9)
Scouting: (188254, 15)
Games: (122, 7)


In [3]:
combined_df = pd.merge(pffscouting_df, plays_df,  how='left', left_on=['gameId','playId'], right_on=['gameId','playId']) 
combined_df = pd.merge(combined_df, players_df, how='left', left_on=['nflId'], right_on=['nflId']) 
combined_df = pd.merge(combined_df, players_df, how='left', left_on=['pff_nflIdBlockedPlayer'], right_on=['nflId'], suffixes=(None, '_x')) 
combined_df = pd.merge(combined_df, games_df, how='left', left_on=['gameId'], right_on=['gameId']) 


combined_df['posTeamIsHomeTeam'] = 0
combined_df.loc[combined_df['possessionTeam'] == combined_df['homeTeamAbbr'], 'posTeamIsHomeTeam'] = 1

combined_df['posTeamOwnSideline'] = 0
combined_df.loc[combined_df['possessionTeam'] == combined_df['yardlineSide'], 'posTeamOwnSideline'] = 1

combined_df['playerCommittedFoul'] = 0
combined_df.loc[combined_df['foulNFLId1'] == combined_df['nflId'], 'playerCommittedFoul'] = 1 
combined_df.loc[combined_df['foulNFLId2'] == combined_df['nflId'], 'playerCommittedFoul'] = 1 
combined_df.loc[combined_df['foulNFLId3'] == combined_df['nflId'], 'playerCommittedFoul'] = 1 

combined_df['inTrouble'] = 0
combined_df.loc[combined_df['pff_hitAllowed'] == 1, 'inTrouble'] = 1
combined_df.loc[combined_df['pff_hurryAllowed'] == 1, 'inTrouble'] = 1
combined_df.loc[combined_df['pff_sackAllowed'] == 1, 'inTrouble'] = 1
combined_df.loc[combined_df['pff_beatenByDefender'] == 1, 'inTrouble'] = 1
combined_df.loc[combined_df['playerCommittedFoul'] == 1, 'inTrouble'] = 1

combined_df['gameClockInside2Minutes'] = 0
combined_df['gameClockInside2Minutes'] = combined_df['gameClock'].map(lambda x: int(x.split(':')[0]) < 2) 

combined_df['gameClockInside2MinutesQ2'] = 0
combined_df.loc[combined_df['gameClockInside2Minutes'] & (combined_df['quarter'] == 2), 'gameClockInside2MinutesQ2'] = 1

combined_df['gameClockInside2MinutesQ4'] = 0
combined_df.loc[combined_df['gameClockInside2Minutes'] & (combined_df['quarter'] == 4), 'gameClockInside2MinutesQ4'] = 1

combined_df['inOvertime'] = 0
combined_df.loc[combined_df['quarter'] == 5, 'inOvertime'] = 1

combined_df = combined_df.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x.fillna('0'))

combined_df

print('dataframe size: {}'.format(combined_df.shape))
combined_df

dataframe size: (188254, 77)


Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,...,homeTeamAbbr,visitorTeamAbbr,posTeamIsHomeTeam,posTeamOwnSideline,playerCommittedFoul,inTrouble,gameClockInside2Minutes,gameClockInside2MinutesQ2,gameClockInside2MinutesQ4,inOvertime
0,2021090900,97,25511,Pass,QB,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
1,2021090900,97,35481,Pass Route,TE-L,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
2,2021090900,97,35634,Pass Route,LWR,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
3,2021090900,97,39985,Pass Route,HB-R,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
4,2021090900,97,40151,Pass Block,C,0.0,0.0,0.0,0.0,0.0,...,TB,DAL,1,1,0,0,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188249,2021110100,4433,52507,Pass Block,LT,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,1,True,0,1,0
188250,2021110100,4433,52546,Coverage,SCBoR,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0
188251,2021110100,4433,52573,Pass Route,SLoWR,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0
188252,2021110100,4433,52585,Pass Rush,LEO,0.0,0.0,0.0,0.0,0.0,...,KC,NYG,0,1,0,0,True,0,1,0


In [4]:
combined_df['personnelO'].value_counts()

1 RB, 1 TE, 3 WR                126500
1 RB, 2 TE, 2 WR                 33022
2 RB, 1 TE, 2 WR                  9614
1 RB, 3 TE, 1 WR                  4994
1 RB, 0 TE, 4 WR                  4048
2 RB, 2 TE, 1 WR                  2420
0 RB, 1 TE, 4 WR                  1958
2 RB, 0 TE, 3 WR                  1452
6 OL, 1 RB, 1 TE, 2 WR             902
0 RB, 2 TE, 3 WR                   616
6 OL, 1 RB, 2 TE, 1 WR             440
0 RB, 0 TE, 5 WR                   308
2 RB, 3 TE, 0 WR                   286
2 QB, 1 RB, 1 TE, 2 WR             264
6 OL, 2 RB, 2 TE, 0 WR             242
6 OL, 1 RB, 3 TE, 0 WR             176
6 OL, 2 RB, 0 TE, 2 WR             110
2 QB, 1 RB, 0 TE, 3 WR             110
6 OL, 2 RB, 1 TE, 1 WR             110
0 RB, 3 TE, 2 WR                    88
2 QB, 1 RB, 2 TE, 1 WR              88
2 QB, 2 RB, 0 TE, 2 WR              88
3 RB, 0 TE, 2 WR                    66
6 OL, 1 RB, 0 TE, 3 WR              66
1 RB, 1 TE, 2 WR,1 LB               66
2 QB, 2 RB, 1 TE, 1 WR   

In [7]:
offensePersonnelGrouping = {
    "1 RB, 1 TE, 3 WR":"11135",
    "1 RB, 2 TE, 2 WR":"11225",
    "2 RB, 1 TE, 2 WR":"12125",
    "1 RB, 3 TE, 1 WR":"11315",
    "1 RB, 0 TE, 4 WR":"11045",
    "2 RB, 2 TE, 1 WR":"12215",
    "0 RB, 1 TE, 4 WR":"10145",
    "2 RB, 0 TE, 3 WR":"12035",
    "6 OL, 1 RB, 1 TE, 2 WR":"11126",
    "0 RB, 2 TE, 3 WR":"10235",
    "6 OL, 1 RB, 2 TE, 1 WR":"11216",
    "0 RB, 0 TE, 5 WR":"10055",
    "2 RB, 3 TE, 0 WR":"12305",
    "2 QB, 1 RB, 1 TE, 2 WR":"21125",
    "6 OL, 2 RB, 2 TE, 0 WR":"12206",
    "6 OL, 1 RB, 3 TE, 0 WR":"11306",
    "2 QB, 1 RB, 0 TE, 3 WR":"21035",
    "6 OL, 2 RB, 1 TE, 1 WR":"12116",
    "6 OL, 2 RB, 0 TE, 2 WR":"12026",
    "2 QB, 2 RB, 0 TE, 2 WR":"22025",
    "2 QB, 1 RB, 2 TE, 1 WR":"21215",
    "0 RB, 3 TE, 2 WR":"10325",
    "1 RB, 1 TE, 2 WR,1 LB":"11126",
    "2 QB, 2 RB, 1 TE, 1 WR":"22115",
    "3 RB, 0 TE, 2 WR":"13025",
    "6 OL, 1 RB, 0 TE, 3 WR":"11036",
    "2 QB, 6 OL, 1 RB, 1 TE, 1 WR":"21116",
    "2 QB, 1 RB, 3 TE, 0 WR":"21305",
    "1 RB, 4 TE, 0 WR":"11405",
    "7 OL, 1 RB, 0 TE, 2 WR":"11027",
    "Other":"99"
}
combined_df["offensePersonnelGrouping"] = combined_df["personnelO"].map(offensePersonnelGrouping)

In [8]:
combined_df['personnelD'].value_counts()

4 DL, 2 LB, 5 DB    53240
2 DL, 4 LB, 5 DB    33902
3 DL, 3 LB, 5 DB    25234
2 DL, 3 LB, 6 DB    17578
4 DL, 3 LB, 4 DB    15004
3 DL, 4 LB, 4 DB    12232
4 DL, 1 LB, 6 DB    10098
3 DL, 2 LB, 6 DB     8536
1 DL, 4 LB, 6 DB     4598
1 DL, 5 LB, 5 DB     2772
5 DL, 2 LB, 4 DB     1474
5 DL, 1 LB, 5 DB      638
2 DL, 2 LB, 7 DB      594
4 DL, 4 LB, 3 DB      506
2 DL, 5 LB, 4 DB      418
1 DL, 3 LB, 7 DB      418
3 DL, 1 LB, 7 DB      220
5 DL, 3 LB, 3 DB      132
6 DL, 3 LB, 2 DB       88
6 DL, 2 LB, 3 DB       88
0 DL, 3 LB, 8 DB       88
5 DL, 5 LB, 1 DB       66
4 DL, 5 LB, 2 DB       66
4 DL, 6 LB, 1 DB       66
0 DL, 5 LB, 6 DB       44
6 DL, 1 LB, 4 DB       44
1 DL, 2 LB, 8 DB       44
0                      22
3 DL, 5 LB, 3 DB       22
6 DL, 4 LB, 1 DB       22
Name: personnelD, dtype: int64

In [10]:
defensePersonnelGrouping = {
    "4 DL, 2 LB, 5 DB":"425",
    "2 DL, 4 LB, 5 DB":"245",
    "3 DL, 3 LB, 5 DB":"335",
    "2 DL, 3 LB, 6 DB":"236",
    "4 DL, 3 LB, 4 DB":"434",
    "3 DL, 4 LB, 4 DB":"344",
    "4 DL, 1 LB, 6 DB":"416",
    "3 DL, 2 LB, 6 DB":"326",
    "1 DL, 4 LB, 6 DB":"146",
    "1 DL, 5 LB, 5 DB":"155",
    "5 DL, 2 LB, 4 DB":"524",
    "5 DL, 1 LB, 5 DB":"515",
    "2 DL, 2 LB, 7 DB":"227",
    "4 DL, 4 LB, 3 DB":"443",
    "1 DL, 3 LB, 7 DB":"137",
    "2 DL, 5 LB, 4 DB":"254",
    "3 DL, 1 LB, 7 DB":"317",
    "5 DL, 3 LB, 3 DB":"533",
    "0 DL, 3 LB, 8 DB":"038",
    "6 DL, 3 LB, 2 DB":"632",
    "6 DL, 2 LB, 3 DB":"623",
    "4 DL, 6 LB, 1 DB":"461",
    "5 DL, 5 LB, 1 DB":"551",
    "4 DL, 5 LB, 2 DB":"452",
    "6 DL, 1 LB, 4 DB":"614",
    "0 DL, 5 LB, 6 DB":"056",
    "1 DL, 2 LB, 8 DB":"128",
    "3 DL, 5 LB, 3 DB":"353",
    "6 DL, 4 LB, 1 DB":"641",
    "Other":"99" 
}
combined_df["defensePersonnelGrouping"] = combined_df["personnelD"].map(defensePersonnelGrouping)

In [11]:
# Filter for offensive line positions
oline_df = combined_df[combined_df['officialPosition'].isin(['G','C','T'])]

In [12]:
# Save to CSV
oline_df.to_csv('gs://big-data-bowl/presnap-data-oline-all.csv')