In [None]:
# Import dependencies
import pandas as pd
import math
import datetime as dt

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by later cells -- confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Load the four raw tables (plays, players, PFF scouting, games) from the
# public bucket, in that order.
plays_df, players_df, pffscouting_df, games_df = (
    pd.read_csv(source_url)
    for source_url in (
        'https://storage.googleapis.com/big-data-bowl/plays.csv',
        'https://storage.googleapis.com/big-data-bowl/players.csv',
        'https://storage.googleapis.com/big-data-bowl/pffScoutingData.csv',
        'https://storage.googleapis.com/big-data-bowl/games.csv',
    )
)

# Report the (rows, columns) shape of each table as a quick sanity check.
for _label, _frame in (
    ("Plays", plays_df),
    ("Players", players_df),
    ("Scouting", pffscouting_df),
    ("Games", games_df),
):
    print("{}: {}".format(_label, _frame.shape))

Plays: (8557, 32)
Players: (1679, 7)
Scouting: (188254, 15)
Games: (122, 7)


In [None]:
# Build one row per scouting record, enriched (via left joins, so no scouting
# row is dropped) with:
#   1. the play it belongs to,
#   2. the player's own roster attributes,
#   3. the roster attributes of the player he blocked (columns suffixed '_x'),
#   4. the game the play was part of.
combined_df = (
    pffscouting_df
    .merge(plays_df, how='left', on=['gameId', 'playId'])
    .merge(players_df, how='left', on=['nflId'])
    .merge(
        players_df,
        how='left',
        left_on=['pff_nflIdBlockedPlayer'],
        right_on=['nflId'],
        suffixes=(None, '_x'),
    )
    .merge(games_df, how='left', on=['gameId'])
)


# Parse birth dates. The literal string 'NA' is blanked first so it parses to
# NaT, and the result is assigned back explicitly: the previous chained
# `combined_df['birthDate'].replace(..., inplace=True)` acts on an intermediate
# Series and is not guaranteed to write through to the frame (it does not
# under pandas copy-on-write).
birth_dates = pd.to_datetime(combined_df['birthDate'].replace('NA', ''))
combined_df['birthDate'] = birth_dates

# Age is the year difference between the `season` column already present in
# the merged frame and the birth year. Anchoring to the season (instead of
# today's calendar year) keeps the feature identical no matter when the
# notebook is re-run. Players without a birth date get age 0.
combined_df['age'] = (combined_df['season'] - birth_dates.dt.year).fillna(0)


def _height_to_cm(height):
    """Convert a 'feet-inches' string such as '6-4' to centimetres."""
    parts = height.split('-')
    return int(parts[0]) * 30.48 + int(parts[1]) * 2.54


combined_df['heightCm'] = combined_df['height'].map(_height_to_cm)

# --- Binary context flags: 1 when the condition holds, 0 otherwise ----------

# Offense is the home team / ball is on the offense's own side of the field.
combined_df['posTeamIsHomeTeam'] = (
    combined_df['possessionTeam'].eq(combined_df['homeTeamAbbr']).astype('int64')
)
combined_df['posTeamOwnSideline'] = (
    combined_df['possessionTeam'].eq(combined_df['yardlineSide']).astype('int64')
)

# The row's player is listed in any of the three foul slots of the play.
foul_id_columns = ['foulNFLId1', 'foulNFLId2', 'foulNFLId3']
committed_foul = (
    combined_df[foul_id_columns]
    .eq(combined_df['nflId'], axis='index')
    .any(axis='columns')
)
combined_df['playerCommittedFoul'] = committed_foul.astype('int64')

# True when any of the listed signals is truthy for the row.
trouble_signals = [
    'pff_hitAllowed',
    'pff_hurryAllowed',
    'pff_sackAllowed',
    'pff_beatenByDefender',
    'playerCommittedFoul',
]
combined_df["inTrouble"] = combined_df[trouble_signals].any(axis='columns')


def _clock_minutes(clock):
    """Return the minutes part of an 'MM:SS' game-clock string as an int."""
    return int(clock.split(':')[0])


# Boolean: fewer than two whole minutes left on the quarter clock.
combined_df['gameClockInside2Minutes'] = combined_df['gameClock'].map(_clock_minutes) < 2

# Same condition restricted to the 2nd and 4th quarters respectively.
inside_two_minutes = combined_df['gameClockInside2Minutes']
combined_df['gameClockInside2MinutesQ2'] = (
    inside_two_minutes & combined_df['quarter'].eq(2)
).astype('int64')
combined_df['gameClockInside2MinutesQ4'] = (
    inside_two_minutes & combined_df['quarter'].eq(4)
).astype('int64')

# Quarter 5 is treated as overtime.
combined_df['inOvertime'] = combined_df['quarter'].eq(5).astype('int64')

# Conference code (as a string) -> list of college names belonging to it.
# Colleges that appear in none of these lists are coded "99" (Other / non-FBS)
# by the lookup that consumes this mapping.
# Names must match the `collegeName` values exactly, so they are written as
# plain text (e.g. "Texas A&M", not the HTML-escaped "Texas A&amp;M").
conference_map = {
    # 1 = ACC
    "1": [
            "Wake Forest",
            "Virginia Tech",
            "Virginia",
            "Syracuse",
            "Pittsburgh",
            "North Carolina State",
            "North Carolina",
            "Miami",
            "Louisville",
            "Georgia Tech",
            "Florida State",
            "Duke",
            "Clemson",
            "Boston College"
    ],
    # 2 = American
    "2": [
            "Tulsa",
            "Tulane",
            "Temple",
            "Southern Methodist",
            "South Florida",
            "Navy",
            "Memphis",
            "Houston",
            "East Carolina",
            "Cincinnati",
            "Central Florida",
    ],
    # 3 = Big12
    "3": [
            "West Virginia",
            "Texas Tech",
            "Texas Christian",
            "Texas",
            "Oklahoma State",
            "Oklahoma",
            "Kansas State",
            "Kansas",
            "Iowa State",
            "Baylor"
    ],
    # 4 = Big10
    "4": [
            "Wisconsin",
            "Rutgers",
            "Purdue",
            "Penn State",
            "Ohio State",
            "Northwestern",
            "Nebraska",
            "Minnesota",
            "Michigan State",
            "Michigan",
            "Maryland",
            "Iowa",
            "Indiana",
            "Illinois",
    ],
    # 5 = C-USA
    "5": [
            "Western Kentucky",
            "Texas-El Paso",
            "Southern Mississippi",
            "Old Dominion",
            "Louisiana Tech",
            "Florida International",
            "Florida Atlantic",
            "Texas-San Antonio",
            "Rice",
            "North Texas",
            "North Carolina-Charlotte",
            "Middle Tennessee",
            "Marshall",
            "Alabama-Birmingham"
    ],
    # 6 = Independent
    "6": [
            "Notre Dame",
            "Connecticut",
            "Brigham Young",
            "Massachusetts",
            "Liberty",
            "Army"
    ],
    # 7 = MAC
    "7": [
            "Western Michigan",
            "Toledo",
            "Northern Illinois",
            "Central Michigan",
            "Buffalo",
            "Ohio",
            "Miami, O.",
            "Kent State",
            "Eastern Michigan",
            "Bowling Green",
            "Ball State",
            "Akron"
    ],
    # 8 = Mountain West
    "8": [
            "Wyoming",
            "Utah State",
            "San Jose State",
            "Fresno State",
            "Colorado State",
            "Boise State",
            "San Diego State",
            "Nevada-Las Vegas",
            "Nevada",
            "Hawaii",
            "Air Force",
            "New Mexico"
    ],
    # 9 = Pac-12
    "9": [
            "Colorado",
            "Washington",
            "Utah",
            "UCLA",
            "Stanford",
            "Southern California",
            "Oregon State",
            "Oregon",
            "California",
            "Arizona State",
            "Arizona",
            "Washington State"
    ],
    # 10 = SBC (Sun Belt)
    "10": [
            "Louisiana-Lafayette",
            "Troy",
            "Texas State",
            "South Alabama",
            "Louisiana-Monroe",
            "Georgia State",
            "Georgia Southern",
            "Coastal Carolina",
            "Arkansas State",
            "Appalachian State",
            "James Madison"
    ],
    # 11 = SEC
    "11": [
            "Vanderbilt",
            "Texas A&M",
            "Tennessee",
            "South Carolina",
            "Missouri",
            "Mississippi State",
            "Mississippi",
            "Louisiana State",
            "Kentucky",
            "Georgia",
            "Florida",
            "Auburn",
            "Arkansas",
            "Alabama"
    ]
    # 99 = Other Non-FBS (no explicit list; it is the fallback code)

}

# Invert conference_map once into a flat college -> conference-code lookup so
# each row is resolved with a single dictionary hit instead of scanning every
# conference list for every row.
college_to_conference = {}
for conference_code, colleges in conference_map.items():
    for college in colleges:
        # setdefault keeps the first conference that lists a college, which
        # matches the previous "first matching key wins" behaviour.
        college_to_conference.setdefault(college, conference_code)

# Colleges not present in the lookup (including missing names) get "99".
# The column name (including its spelling) is kept as-is because it becomes
# part of the exported CSV schema.
combined_df["offensivPlayersFBSConference"] = (
    combined_df["collegeName"]
    .map(college_to_conference)
    .fillna("99")
)

def _fill_missing(column):
    """Fill gaps with 0 for bool/int/unsigned/float/complex columns, 'Other' otherwise."""
    if column.dtype.kind in 'biufc':
        return column.fillna(0)
    return column.fillna('Other')


# Applied column by column; the result replaces the merged frame.
combined_df = combined_df.apply(_fill_missing)

print('dataframe size: {}'.format(combined_df.shape))
combined_df

In [None]:
# Inspection only: list the column labels of the merged frame.
combined_df.columns

In [None]:
# Inspection only: row count per distinct `offenseFormation` value across all positions.
combined_df['offenseFormation'].value_counts()

In [None]:
# Restrict the merged frame to rows whose official position is one of the
# offensive-line codes, then report how many rows/columns remain.
OLINE_POSITIONS = ['G', 'C', 'T']
is_oline_row = combined_df['officialPosition'].isin(OLINE_POSITIONS)
combined_df_oline = combined_df[is_oline_row]
print("Offensive Line Plays: {}".format(combined_df_oline.shape))
combined_df_oline

In [None]:
# Shape the dataframe so predictions can be made: drop identifiers, free text
# and raw columns that earlier cells already turned into derived features.
columns_to_drop = [
    # play / player identifiers and descriptive text
    'gameId', 'playId', 'playDescription', 'displayName', 'officialPosition',
    # scouting columns not used downstream
    'pff_role', 'pff_hit', 'pff_hurry', 'pff_sack', 'pff_backFieldBlock',
    # game metadata
    'season', 'gameDate', 'gameTimeEastern', 'homeTeamAbbr', 'visitorTeamAbbr',
    # foul details
    'foulName1', 'foulNFLId1', 'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
    # raw inputs of heightCm, age, the clock flags, the conference code and
    # posTeamOwnSideline
    'height', 'birthDate', 'gameClock', 'collegeName', 'yardlineSide',
    # individual signals already folded into `inTrouble`
    'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed',
    'pff_beatenByDefender', 'playerCommittedFoul',
    # intermediate flag behind the Q2 / Q4 two-minute flags
    'gameClockInside2Minutes',
]
combined_df_oline_cleaned = combined_df_oline.drop(columns=columns_to_drop)

combined_df_oline_cleaned




In [None]:
# Collapse every lined-up position that is not one of the five interior/edge
# line spots into a single "Other" bucket.
# The column label is passed to .loc explicitly: without it the assignment
# overwrites *every* column of the matching rows with the string "Other",
# destroying the rest of their features.
LINE_SPOTS = ["C", "LG", "RG", "RT", "LT"]
lined_up_elsewhere = ~combined_df_oline_cleaned["pff_positionLinedUp"].isin(LINE_SPOTS)
combined_df_oline_cleaned.loc[lined_up_elsewhere, "pff_positionLinedUp"] = "Other"
combined_df_oline_cleaned['pff_positionLinedUp'].value_counts()

In [None]:
# Encoding for `pff_positionLinedUp` (codes are kept as strings, like the
# other encodings in this notebook). "Other" is the bucket the previous cell
# assigns to every non-line spot; it is given the same "99" catch-all code the
# other encodings use so the column contains no unencoded label.
offensePosition = {"pff_positionLinedUp": {
    "LG":"1",
    "LT":"2",
    "C":"3",
    "RG":"4",
    "RT":"5",
    "Other":"99" }
}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(offensePosition)


In [None]:
# Inspection only: row count per distinct `pff_blockType` value.
combined_df_oline_cleaned['pff_blockType'].value_counts()

In [None]:
# Encoding for `pff_blockType`: a label's 1-based position in this list is its
# code (stored as a string); "Other" gets the catch-all code "99".
block_type_labels = ["PP", "PT", "PA", "SW", "CL", "NB", "PR", "UP", "SR", "CH"]
block_type_codes = {
    label: str(position)
    for position, label in enumerate(block_type_labels, start=1)
}
block_type_codes["Other"] = "99"
blockType = {"pff_blockType": block_type_codes}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(blockType)

In [None]:
# Inspection only: row count per distinct `personnelO` value.
combined_df_oline_cleaned['personnelO'].value_counts()

In [None]:
# Encoding for `personnelO` as (label, code) pairs. The five-character codes
# read as the QB, RB, TE, WR and OL counts, with unlisted QB/OL counts written
# as 1 and 5. "Other" gets the catch-all code "99".
# NOTE(review): "1 RB, 1 TE, 2 WR,1 LB" shares code "11126" with
# "6 OL, 1 RB, 1 TE, 2 WR" -- confirm the two groupings are meant to collapse.
offense_personnel_pairs = [
    ("1 RB, 1 TE, 3 WR", "11135"),
    ("1 RB, 2 TE, 2 WR", "11225"),
    ("2 RB, 1 TE, 2 WR", "12125"),
    ("1 RB, 3 TE, 1 WR", "11315"),
    ("1 RB, 0 TE, 4 WR", "11045"),
    ("2 RB, 2 TE, 1 WR", "12215"),
    ("0 RB, 1 TE, 4 WR", "10145"),
    ("2 RB, 0 TE, 3 WR", "12035"),
    ("6 OL, 1 RB, 1 TE, 2 WR", "11126"),
    ("0 RB, 2 TE, 3 WR", "10235"),
    ("6 OL, 1 RB, 2 TE, 1 WR", "11216"),
    ("0 RB, 0 TE, 5 WR", "10055"),
    ("2 RB, 3 TE, 0 WR", "12305"),
    ("2 QB, 1 RB, 1 TE, 2 WR", "21125"),
    ("6 OL, 2 RB, 2 TE, 0 WR", "12206"),
    ("6 OL, 1 RB, 3 TE, 0 WR", "11306"),
    ("2 QB, 1 RB, 0 TE, 3 WR", "21035"),
    ("6 OL, 2 RB, 1 TE, 1 WR", "12116"),
    ("6 OL, 2 RB, 0 TE, 2 WR", "12026"),
    ("2 QB, 2 RB, 0 TE, 2 WR", "22025"),
    ("2 QB, 1 RB, 2 TE, 1 WR", "21215"),
    ("0 RB, 3 TE, 2 WR", "10325"),
    ("1 RB, 1 TE, 2 WR,1 LB", "11126"),
    ("2 QB, 2 RB, 1 TE, 1 WR", "22115"),
    ("3 RB, 0 TE, 2 WR", "13025"),
    ("6 OL, 1 RB, 0 TE, 3 WR", "11036"),
    ("2 QB, 6 OL, 1 RB, 1 TE, 1 WR", "21116"),
    ("2 QB, 1 RB, 3 TE, 0 WR", "21305"),
    ("1 RB, 4 TE, 0 WR", "11405"),
    ("7 OL, 1 RB, 0 TE, 2 WR", "11027"),
    ("Other", "99"),
]
offensePersonnelGrouping = {"personnelO": dict(offense_personnel_pairs)}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(offensePersonnelGrouping)

# Display the frame after encoding.
combined_df_oline_cleaned

In [None]:
# Inspection only: row count per distinct `personnelD` value.
combined_df_oline_cleaned['personnelD'].value_counts()

In [None]:
# Encoding for `personnelD`. Each label's code is simply its three counts
# (DL, LB, DB) concatenated in label order, e.g. "4 DL, 2 LB, 5 DB" -> "425";
# leading zeros are kept because codes are strings. "Other" gets "99".
defense_personnel_labels = [
    "4 DL, 2 LB, 5 DB",
    "2 DL, 4 LB, 5 DB",
    "3 DL, 3 LB, 5 DB",
    "2 DL, 3 LB, 6 DB",
    "4 DL, 3 LB, 4 DB",
    "3 DL, 4 LB, 4 DB",
    "4 DL, 1 LB, 6 DB",
    "3 DL, 2 LB, 6 DB",
    "1 DL, 4 LB, 6 DB",
    "1 DL, 5 LB, 5 DB",
    "5 DL, 2 LB, 4 DB",
    "5 DL, 1 LB, 5 DB",
    "2 DL, 2 LB, 7 DB",
    "4 DL, 4 LB, 3 DB",
    "1 DL, 3 LB, 7 DB",
    "2 DL, 5 LB, 4 DB",
    "3 DL, 1 LB, 7 DB",
    "5 DL, 3 LB, 3 DB",
    "0 DL, 3 LB, 8 DB",
    "6 DL, 3 LB, 2 DB",
    "6 DL, 2 LB, 3 DB",
    "4 DL, 6 LB, 1 DB",
    "5 DL, 5 LB, 1 DB",
    "4 DL, 5 LB, 2 DB",
    "6 DL, 1 LB, 4 DB",
    "0 DL, 5 LB, 6 DB",
    "1 DL, 2 LB, 8 DB",
    "3 DL, 5 LB, 3 DB",
    "6 DL, 4 LB, 1 DB",
]
defense_personnel_codes = {
    label: "".join(char for char in label if char.isdigit())
    for label in defense_personnel_labels
}
defense_personnel_codes["Other"] = "99"
defensePersonnelGrouping = {"personnelD": defense_personnel_codes}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(defensePersonnelGrouping)

# Display the frame after encoding.
combined_df_oline_cleaned

In [None]:
# Inspection only: list the column labels that remain in the cleaned frame.
combined_df_oline_cleaned.columns

In [None]:
# Inspection only: row count per distinct `possessionTeam` value.
combined_df_oline_cleaned['possessionTeam'].value_counts()

In [None]:
# One shared team table drives both encodings: a team's 1-based position in
# this list is its code (stored as a string); "Other" gets the catch-all "99".
team_abbreviations = [
    "KC", "DET", "TB", "MIA", "NYG", "WAS", "CAR", "DEN",
    "ATL", "NE", "TEN", "LAC", "BAL", "NYJ", "LV", "JAX",
    "DAL", "IND", "CIN", "PHI", "MIN", "BUF", "HOU", "LA",
    "CLE", "CHI", "PIT", "ARI", "SEA", "GB", "SF", "NO",
]
team_codes = {
    abbreviation: str(position)
    for position, abbreviation in enumerate(team_abbreviations, start=1)
}
team_codes["Other"] = "99"

# Each encoding gets its own copy of the table, keyed by the column it targets.
possessionTeams = {"possessionTeam": dict(team_codes)}
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(possessionTeams)

defensiveTeams = {"defensiveTeam": dict(team_codes)}
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(defensiveTeams)


combined_df_oline_cleaned

In [None]:
# Inspection only: row count per distinct `passResult` value.
combined_df_oline_cleaned['passResult'].value_counts()

In [None]:
# Encoding for `passResult`: a label's 1-based position in this list is its
# code (stored as a string); "Other" gets the catch-all code "99".
pass_result_labels = ["C", "I", "S", "R", "IN"]
pass_result_codes = {
    label: str(position)
    for position, label in enumerate(pass_result_labels, start=1)
}
pass_result_codes["Other"] = "99"
passResult = {"passResult": pass_result_codes}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(passResult)

In [None]:
# Inspection only: row count per distinct `offenseFormation` value.
combined_df_oline_cleaned['offenseFormation'].value_counts()

In [None]:
# Encoding for `offenseFormation`: a label's 1-based position in this list is
# its code (stored as a string); "Other" gets the catch-all code "99".
offense_formation_labels = [
    "SHOTGUN",
    "EMPTY",
    "SINGLEBACK",
    "I_FORM",
    "PISTOL",
    "JUMBO",
    "WILDCAT",
]
offense_formation_codes = {
    label: str(position)
    for position, label in enumerate(offense_formation_labels, start=1)
}
offense_formation_codes["Other"] = "99"
offenseFormation = {"offenseFormation": offense_formation_codes}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(offenseFormation)

In [None]:
# Inspection only: row count per distinct `dropBackType` value.
combined_df_oline_cleaned['dropBackType'].value_counts()

In [None]:
# Encoding for `dropBackType`: a label's 1-based position in this list is its
# code (stored as a string); "Other" gets the catch-all code "99".
drop_back_labels = [
    "TRADITIONAL",
    "SCRAMBLE",
    "DESIGNED_ROLLOUT_RIGHT",
    "DESIGNED_ROLLOUT_LEFT",
    "SCRAMBLE_ROLLOUT_RIGHT",
    "SCRAMBLE_ROLLOUT_LEFT",
    "DESIGNED_RUN",
    "UNKNOWN",
]
drop_back_codes = {
    label: str(position)
    for position, label in enumerate(drop_back_labels, start=1)
}
drop_back_codes["Other"] = "99"
dropBackType = {"dropBackType": drop_back_codes}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(dropBackType)

In [None]:
# Inspection only: row count per distinct `pff_passCoverage` value.
combined_df_oline_cleaned['pff_passCoverage'].value_counts()

In [None]:
# Encoding for `pff_passCoverage`: a label's 1-based position in this list is
# its code (stored as a string); "Other" gets the catch-all code "99".
pass_coverage_labels = [
    "Cover-3",
    "Cover-1",
    "Cover-2",
    "Quarters",
    "Cover-6",
    "Red Zone",
    "Cover-0",
    "2-Man",
    "Bracket",
    "Prevent",
    "Goal Line",
    "Miscellaneous",
]
pass_coverage_codes = {
    label: str(position)
    for position, label in enumerate(pass_coverage_labels, start=1)
}
pass_coverage_codes["Other"] = "99"
pff_passCoverage = {"pff_passCoverage": pass_coverage_codes}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(pff_passCoverage)

In [None]:
# Inspection only: row count per distinct `pff_passCoverageType` value.
combined_df_oline_cleaned['pff_passCoverageType'].value_counts()

In [None]:
# Encoding for `pff_passCoverageType`: a label's 1-based position in this list
# is its code (stored as a string). Note that here "Other" is coded "3", not
# the "99" used by the other encodings.
coverage_type_labels = ["Zone", "Man", "Other"]
pff_passCoverageType = {
    "pff_passCoverageType": {
        label: str(position)
        for position, label in enumerate(coverage_type_labels, start=1)
    }
}
# Apply the encoding; with a nested dict, DataFrame.replace only looks in the named column.
combined_df_oline_cleaned = combined_df_oline_cleaned.replace(pff_passCoverageType)

In [None]:
# Persist the encoded offensive-line frame to the project bucket as CSV.
# The DataFrame index is written too (to_csv default), as an unnamed first column.
OLINE_OUTPUT_URI = 'gs://big-data-bowl/play_analysis_oline.csv'
combined_df_oline_cleaned.to_csv(OLINE_OUTPUT_URI)