# Extract and Transform Concussion Qualitative Data

In [3]:
import polars as pl
import numpy as np

In [42]:
def data_loader(dataset): 
    """
    Accepts the desired dataset string and opens the file as either a polars dataframe
    or a lazyframe. Lazyloading is used for the larger tracking datasets. 
    """
    import polars as pl  # type: ignore
    import numpy as np # type: ignore

    valid_datasets = ['plays', 'injuries', 'role_data', 'punt_data', 'play_information', 'game_data', 'video_review']
    if dataset not in valid_datasets: 
        raise ValueError(f"Invalid dataset name '{dataset}'. Valid options are: {valid_datasets}")

    try:
        # Injury Datasets
        if dataset == 'plays':
            PlayList_path = "F:/Data/nfl-playing-surface-analytics/PlayList.csv"
            df = pl.read_csv(PlayList_path)
        elif dataset == 'injuries':
            InjuryRecord_path = "F:/Data/nfl-playing-surface-analytics/InjuryRecord.csv"
            df = pl.read_csv(InjuryRecord_path)

        # Concussion Datasets
        elif dataset == 'role_data':
            play_player_role_data_path = "F:/Data/NFL-Punt-Analytics-Competition/play_player_role_data.csv"
            df = pl.read_csv(play_player_role_data_path)
        elif dataset == 'punt_data':
            player_punt_data_path = "F:/Data/NFL-Punt-Analytics-Competition/player_punt_data.csv"
            df = pl.read_csv(player_punt_data_path)
        elif dataset == 'play_information':
            play_information_path = "F:/Data/NFL-Punt-Analytics-Competition/play_information.csv"
            df = pl.read_csv(play_information_path)
        elif dataset == 'game_data':
            game_data_path = "F:/Data/NFL-Punt-Analytics-Competition/game_data.csv"
            df = pl.read_csv(game_data_path)
        elif dataset == 'video_review':
            video_review_path = "F:/Data/NFL-Punt-Analytics-Competition/video_review.csv"
            df = pl.read_csv(video_review_path)

        return df
    
    except Exception as e: 
        print(f"An error occurred while loading the dataset '{dataset}': {e}")
        return None

In [101]:
def table_joiner(analysis):
    """
    Joins the plays and injuries tables in the surface injury data 
    """
    import polars as pl # type: ignore
    # from DataHandler import data_loader

    valid_analyses = ['injury', 'concussion']
    if analysis not in valid_analyses: 
        raise ValueError(f"Invalid dataset name '{analysis}'. Valid options are: {valid_analyses}")


    try: 
        #Injury Data Loader
        if analysis == 'injury':
            plays = data_loader('plays')
            injuries = data_loader('injuries')

            df = (
                plays.join(injuries, on="PlayKey", how='left')
                .select([
                    pl.col("PlayKey")
                    , pl.col("Position")
                    , pl.col("StadiumType")
                    , pl.col("FieldType")
                    , pl.col("Temperature")
                    , pl.col("Weather")
                    , pl.col("PlayType")
                    , pl.col("BodyPart")
                    , pl.col("DM_M1")
                    , pl.col("DM_M7")
                    , pl.col("DM_M28")
                    , pl.col("DM_M42")
                ])
            )

        # Concussion Data Loader
        elif analysis == 'concussion':
            role_data = data_loader('role_data')
            punt_data = data_loader('punt_data')
            play_information = data_loader('play_information')
            game_data = data_loader('game_data')
            video_review = data_loader('video_review')


            df = (
                role_data
                .join(
                    punt_data
                    , left_on="GSISID"
                    , right_on="GSISID"
                    , how="left"
                    , suffix="_punt"
                )
                .join(
                    play_information
                    , left_on=["GameKey", "PlayID"]
                    , right_on=["GameKey", "PlayID"]
                    , how="left"
                    , suffix="_play"
                )
                .join(
                    game_data
                    , left_on="GameKey"
                    , right_on="GameKey"
                    , how="left"
                    , suffix="_game"
                )
                .join(
                    video_review
                    , left_on=["GameKey", "PlayID", "GSISID"]
                    , right_on=["GameKey", "PlayID", "GSISID"]
                    , how="left"
                    , suffix="_video"
                )
                .with_columns([
                    pl.concat_str([
                        pl.col("GSISID").cast(pl.Utf8)
                        , pl.lit("-")
                        , pl.col("GameKey").cast(pl.Utf8)
                        , pl.lit("-")
                        , pl.col("PlayID").cast(pl.Utf8)
                    ]).alias("PlayKey"),
                    pl.concat_str([
                        pl.col("Primary_Partner_GSISID").cast(pl.Utf8)
                        , pl.lit("-")
                        , pl.col("GameKey").cast(pl.Utf8)
                        , pl.lit("-")
                        , pl.col("PlayID").cast(pl.Utf8)
                    ]).alias("OpponentKey")
                    , pl.when(pl.col("Primary_Partner_GSISID") == "Unclear")
                        .then(pl.lit("00000"))
                        .otherwise(pl.col("Primary_Partner_GSISID"))
                        .cast(pl.Int64)
                        .alias("Primary_Partner_GSISID")
                ])
                .select([
                    "PlayKey"
                    , "GSISID"
                    , "GameKey"
                    , "PlayID"
                    , "Position"
                    , 'Number'
                    , "Role"
                    , "Game_Date"
                    , "YardLine"
                    , "Quarter"
                    , "Play_Type"
                    , "Poss_Team"
                    , "Score_Home_Visiting"
                    , "Game_Site"
                    , "Start_Time"
                    , "HomeTeamCode"
                    , "VisitTeamCode"
                    , "StadiumType"
                    , "Turf"
                    , "GameWeather"
                    , "Temperature"
                    , "Player_Activity_Derived"
                    , "Primary_Impact_Type"
                    , "Primary_Partner_Activity_Derived"
                    , "Primary_Partner_GSISID"
                    , "OpponentKey"
                ])
                .unique()
            )



        print(f"Tables are holding hands. How cute.")
        return df
    
    except Exception as e: 
        print(f"An error occurred while processing the '{analysis}' analysis: {e}.")
        return None

In [102]:
table_joiner('concussion').head()

Tables are holding hands. How cute.


PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Score_Home_Visiting,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,Turf,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey
str,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,i64,str
"""32732-387-3026""",32732,387,3026,"""WR""","""85""","""VLi""","""08/31/2017""","""CHI 26""",3,"""Punt""","""CHI""","""0 - 19""","""Chicago""","""19:00""","""CHI""","""CLV""","""Outdoor""","""Grass""","""Partly cloudy, lows to upper 5…",72.0,,,,,
"""31665-178-1639""",31665,178,1639,"""RB""","""34""","""PDR1""","""10/30/2016""","""DET 49""",2,"""Punt""","""HST""","""14 - 0""","""Houston""","""12:00""","""HST""","""DET""","""Retractable Roof""","""Artificial""","""Mostly Sunny""",80.0,,,,,
"""29931-128-355""",29931,128,355,"""FB""","""39o""","""PLW""","""10/03/2016""","""MIN 8""",1,"""Punt""","""MIN""","""0 - 0""","""Minneapolis""","""19:30""","""MIN""","""NYG""","""Indoor""","""Field Turf""","""Indoors""",,,,,,
"""29033-395-3873""",29033,395,3873,"""WR""","""14""","""PDR1""","""08/31/2017""","""NO 45""",4,"""Punt""","""NO""","""13 - 14""","""New Orleans""","""19:00""","""NO""","""BLT""","""Indoors (Domed)""","""UBU Speed Series-S5-M""","""Sunny""",86.0,,,,,
"""31940-606-3671""",31940,606,3671,"""DE""","""92""","""PLG""","""12/11/2017""","""NE 21""",4,"""Punt""","""NE""","""27 - 17""","""Miami Gardens""","""20:30""","""MIA""","""NE""","""Outdoor""","""Natural Grass""","""Clear""",55.0,,,,,


In [103]:
concussion = table_joiner('concussion')

Tables are holding hands. How cute.


In [104]:
def injury_interpolator(df, analysis): 
    """
    Creates two new columns, IsInjured and IsSevere, where any injury sets IsInjured to 1, and any injury over 28 days provides a 1 in IsSevere
    """
    import polars as pl

    if analysis == 'injury':
        df = df.with_columns([ 
            pl.when(pl.col("DM_M1") == 1).then(1).otherwise(0).alias("IsInjured")
            , pl.when(pl.col("DM_M28") == 1).then(1).otherwise(0).alias("IsSevere")
            ])

    elif analysis == 'concussion':
        df = df.with_columns([ 
            pl.when(pl.col("Primary_Impact_Type").is_not_null()).then(1).otherwise(0).alias("IsInjured")
            , pl.col("Player_Activity_Derived").fill_null("No_Injury")
            , pl.col("Primary_Impact_Type").fill_null("No_Injury")
            , pl.col("Primary_Partner_Activity_Derived").fill_null("No_Injury")
            , pl.col("Primary_Partner_GSISID").fill_null(00000)
            , pl.col("OpponentKey").fill_null("None")
            ])
        
    print("Injury columns have been added.")
    return df

In [105]:
concussion = injury_interpolator(concussion, 'concussion')
concussion.head(2)


Injury columns have been added.


PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Score_Home_Visiting,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,Turf,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured
str,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,i64,str,i32
"""33790-509-2050""",33790,509,2050,"""OLB""","""50""","""PLT""","""10/29/2017""","""CIN 42""",2,"""Punt""","""CIN""","""10 - 10""","""Cincinnati""","""13:00""","""CIN""","""IND""","""Outdoor""","""UBU Sports Speed S5-M""","""Cloudy""",39.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0
"""33410-372-2482""",33410,372,2482,"""CB""","""32""","""VR""","""08/24/2017""","""CAR 44""",3,"""Punt""","""JAX""","""10 - 10""","""Jacksonville""","""19:30""","""JAX""","""CAR""","""Open""","""Grass""",,84.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0


In [106]:
def stadium_cleaner(df):
    """
    Noramlizes all stadium types to be either indoor or outdoor per game records. Some of the dome 
    stadiums were listed as open or closed for different games, and these were accounted for.
    All games with dates were checked to ensure null values were indeed outdoor games. 
    """
    import polars as pl  # type: ignore

    stadium_dict = {
        'Outdoor': 'Outdoor'
        , 'Indoors': 'Indoor'
        , 'Oudoor': 'Outdoor'
        , 'Outdoors': 'Outdoor'
        , 'Open': 'Outdoor'
        , 'Closed Dome': 'Indoor'
        , 'Domed, closed': 'Indoor'
        , 'Dome': 'Indoor'
        , 'Indoor': 'Indoor'
        , 'Domed': 'Indoor'
        , 'Retr. Roof-Closed': 'Indoor'
        , 'Outdoor Retr Roof-Open': 'Outdoor'
        , 'Retractable Roof': 'Indoor'
        , 'Ourdoor': 'Outdoor'
        , 'Indoor, Roof Closed': 'Indoor'
        , 'Retr. Roof - Closed': 'Indoor'
        , 'Bowl': 'Outdoor'
        , 'Outddors': 'Outdoor'
        , 'Retr. Roof-Open': 'Outdoor'
        , 'Dome, closed': 'Indoor'
        , 'Indoor, Open Roof': 'Outdoor'
        , 'Domed, Open': 'Outdoor'
        , 'Domed, open': 'Outdoor'
        , 'Heinz Field': 'Outdoor'
        , 'Cloudy': 'Outdoor'
        , 'Retr. Roof - Open': 'Outdoor'
        , 'Retr. Roof Closed': 'Indoor'
        , 'Outdor': 'Outdoor'
        , 'Outside': 'Outdoor'
        , 'outdoor': 'Outdoor'
        , 'Outdoors ': 'Outdoor'
        , 'Indoor, non-retractable roof': 'Indoor'
        , 'Retr. roof - closed': 'Indoor'
        , 'Indoor, fixed roof ': 'Indoor'
        , 'Indoor, Non-Retractable Dome': 'Indoor'
        , 'Indoor, Fixed Roof': 'Indoor'
        , 'Indoor, fixed roof': 'Indoor'
        , 'Indoors (Domed)': 'Indoor'
        , None: 'Outdoor'
        }


    df = df.with_columns(pl.col("StadiumType").replace(stadium_dict)) # This uses the dict to assign naming conventions

    print(f"Someone managed to clean up those stadiums!")
    return df

In [107]:
concussion = stadium_cleaner(concussion)
concussion.head(2)

Someone managed to clean up those stadiums!


PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Score_Home_Visiting,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,Turf,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured
str,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,i64,str,i32
"""33790-509-2050""",33790,509,2050,"""OLB""","""50""","""PLT""","""10/29/2017""","""CIN 42""",2,"""Punt""","""CIN""","""10 - 10""","""Cincinnati""","""13:00""","""CIN""","""IND""","""Outdoor""","""UBU Sports Speed S5-M""","""Cloudy""",39.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0
"""33410-372-2482""",33410,372,2482,"""CB""","""32""","""VR""","""08/24/2017""","""CAR 44""",3,"""Punt""","""JAX""","""10 - 10""","""Jacksonville""","""19:30""","""JAX""","""CAR""","""Outdoor""","""Grass""",,84.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0


In [108]:
def weather_cleaner(df):
     """
     Uses mapping to limit the number of different weather groupings. 
     """
     import polars as pl # type: ignore

     # If using the concussion dataset, rename the GameWeather column to Weather
     if "GameWeather" in df.columns:
       df = df.rename({"GameWeather": "Weather"})

     weather_dict = {
            'Clear and warm': 'Clear'
            , 'Mostly Cloudy': 'Cloudy'
            , 'Sunny': 'Clear'
            , 'Clear': 'Clear'
            , 'Cloudy': 'Cloudy'
            , 'Cloudy, fog started developing in 2nd quarter': 'Hazy/Fog'
            , 'Rain': 'Rain'
            , 'Partly Cloudy': 'Cloudy'
            , 'Mostly cloudy': 'Cloudy'
            , 'Cloudy and cold': 'Cloudy'
            , 'Cloudy and Cool': 'Cloudy'
            , 'Rain Chance 40%': 'Rain'
            , 'Controlled Climate': 'Indoor'
            , 'Sunny and warm': 'Clear'
            , 'Partly cloudy': 'Cloudy'
            , 'Clear and Cool': 'Cloudy'
            , 'Clear and cold': 'Cloudy'
            , 'Sunny and cold': 'Clear'
            , 'Indoor': 'Indoor'
            , 'Partly Sunny': 'Clear'
            , 'N/A (Indoors)': 'Indoor'
            , 'Mostly Sunny': 'Clear'
            , 'Indoors': 'Indoor'
            , 'Clear Skies': 'Clear'
            , 'Partly sunny': 'Clear'
            , 'Showers': 'Rain'
            , 'N/A Indoor': 'Indoor'
            , 'Sunny and clear': 'Clear'
            , 'Snow': 'Snow'
            , 'Scattered Showers': 'Rain'
            , 'Party Cloudy': 'Cloudy'
            , 'Clear skies': 'Clear'
            , 'Rain likely, temps in low 40s.': 'Rain'
            , 'Hazy': 'Hazy/Fog'
            , 'Partly Clouidy': 'Cloudy'
            , 'Sunny Skies': 'Clear'
            , 'Overcast': 'Cloudy'
            , 'Cloudy, 50% change of rain': 'Cloudy'
            , 'Fair': 'Clear'
            , 'Light Rain': 'Rain'
            , 'Partly clear': 'Clear'
            , 'Mostly Coudy': 'Cloudy'
            , '10% Chance of Rain': 'Cloudy'
            , 'Cloudy, chance of rain': 'Cloudy'
            , 'Heat Index 95': 'Clear'
            , 'Sunny, highs to upper 80s': 'Clear'
            , 'Sun & clouds': 'Cloudy'
            , 'Heavy lake effect snow': 'Snow'
            , 'Mostly sunny': 'Clear'
            , 'Cloudy, Rain': 'Rain'
            , 'Sunny, Windy': 'Windy'
            , 'Mostly Sunny Skies': 'Clear'
            , 'Rainy': 'Rain'
            , '30% Chance of Rain': 'Rain'
            , 'Cloudy, light snow accumulating 1-3"': 'Snow'
            , 'cloudy': 'Cloudy'
            , 'Clear and Sunny': 'Clear'
            , 'Coudy': 'Cloudy'
            , 'Clear and sunny': 'Clear'
            , 'Clear to Partly Cloudy': 'Clear'
            , 'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.': 'Windy'
            , 'Rain shower': 'Rain'
            , 'Cold': 'Clear'
            , 'Partly cloudy, lows to upper 50s.': 'Cloudy'
            , 'Scattered thunderstorms': 'Rain'
            , 'CLEAR': 'Clear'
            , 'Partly CLoudy': 'Cloudy'
            , 'Chance of Showers': 'Rain'
            , 'Snow showers': 'Snow'
            , 'Clear and Cold': 'Clear'
            , 'Cloudy with rain': 'Rain'
            , 'Sunny intervals': 'Clear'
            , 'Clear and cool': 'Clear'
            , 'Cloudy, Humid, Chance of Rain': 'Rain'
            , 'Cloudy and Cold': 'Cloudy'
            , 'Cloudy with patches of fog': 'Hazy/Fog'
            , 'Controlled': 'Indoor'
            , 'Sunny and Clear': 'Clear'
            , 'Cloudy with Possible Stray Showers/Thundershowers': 'Rain'
            , 'Suny': 'Clear'
            , 'T-Storms': 'Rain'
            , 'Sunny and cool': 'Clear'
            , 'Cloudy, steady temps': 'Cloudy'
            , 'Hazy, hot and humid': 'Hazy/Fog'
            , 'Sunny Intervals': 'Clear'
            , 'Partly Cloudy, Chance of Rain 80%': 'Rain'
            , 'Mostly Clear. Gusting ot 14.': 'Windy'
            , 'Mostly CLoudy': 'Cloudy'
            , 'Snow Showers, 3 to 5 inches expected.': 'Snow'
            }



     df = df.with_columns(pl.col("Weather").replace(weather_dict)) # Standardizes the weather to a few main types

     df = df.with_columns(             # Null handling - all null weather conditions for indoor stadiums are filled "indoor"
                pl.when(pl.col("StadiumType") == "Indoor")
                .then(pl.col("Weather").fill_null("Indoor"))
                .otherwise(pl.col("Weather"))
                .alias("Weather")
                )
     
     # For the non-indoor games with null values for weather, to maintain the percentage of games that were clear/cloudy, temperature was used as a divider, above and below 70 degrees
     df = df.with_columns(
                pl.when(pl.col("Temperature") > 70)
                .then(pl.col("Weather").fill_null("Clear"))
                .otherwise(pl.col("Weather"))
                .alias("Weather")
                )
     df = df.with_columns(pl.col("Weather").fill_null("Cloudy"))

     print(f"Looks like the weather has been cleared up.")
     return df


In [109]:
concusion = weather_cleaner(concussion)
concussion.head(2)

Looks like the weather has been cleared up.


PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Score_Home_Visiting,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,Turf,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured
str,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,i64,str,i32
"""33790-509-2050""",33790,509,2050,"""OLB""","""50""","""PLT""","""10/29/2017""","""CIN 42""",2,"""Punt""","""CIN""","""10 - 10""","""Cincinnati""","""13:00""","""CIN""","""IND""","""Outdoor""","""UBU Sports Speed S5-M""","""Cloudy""",39.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0
"""33410-372-2482""",33410,372,2482,"""CB""","""32""","""VR""","""08/24/2017""","""CAR 44""",3,"""Punt""","""JAX""","""10 - 10""","""Jacksonville""","""19:30""","""JAX""","""CAR""","""Outdoor""","""Grass""",,84.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0


In [110]:
concussion.null_count()

PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Score_Home_Visiting,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,Turf,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,10,10,0,71,71,71,71,71,71,0,0,0,0,0,315,29416,17838,0,0,0,0,0,0


In [111]:
def turf_cleaner(df):
    import polars as pl

    df = df.rename({"Turf": "FieldType"})

    turf_dict = {
        'Grass': 'Natural',
        'Field Turf': 'Synthetic', 
        'Natural Grass': 'Natural',
        'grass': 'Natural',
        'Artificial': 'Synthetic',
        'FieldTurf': 'Synthetic',
        'DD GrassMaster': 'Synthetic',
        'A-Turf Titan': 'Synthetic',
        'UBU Sports Speed S5-M': 'Synthetic',
        'UBU Speed Series S5-M': 'Synthetic',
        'Artifical': 'Synthetic',
        'UBU Speed Series-S5-M': 'Synthetic',
        'FieldTurf 360': 'Synthetic',
        'Natural grass': 'Natural',
        'Field turf': 'Synthetic',
        'Natural': 'Natural',
        'Natrual Grass': 'Natural',
        'Synthetic': 'Synthetic',
        'Natural Grass ': 'Natural',
        'Naturall Grass': 'Natural',
        'FieldTurf360': 'Synthetic',
        None: 'Natural' # The only field with null values is Miami Gardens, which has Natural
        }
    
    df = df.with_columns(pl.col("FieldType").replace(turf_dict))
    return df

In [112]:
concussion = turf_cleaner(concussion)
concussion.head(2)

PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Score_Home_Visiting,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,FieldType,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured
str,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,i64,str,i32
"""33790-509-2050""",33790,509,2050,"""OLB""","""50""","""PLT""","""10/29/2017""","""CIN 42""",2,"""Punt""","""CIN""","""10 - 10""","""Cincinnati""","""13:00""","""CIN""","""IND""","""Outdoor""","""Synthetic""","""Cloudy""",39.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0
"""33410-372-2482""",33410,372,2482,"""CB""","""32""","""VR""","""08/24/2017""","""CAR 44""",3,"""Punt""","""JAX""","""10 - 10""","""Jacksonville""","""19:30""","""JAX""","""CAR""","""Outdoor""","""Natural""",,84.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0


In [136]:
def cancellation_cleaner(df):
    """
    There are 44 rows that have no Game_Date, which correlate with games that were canceled. 
    This was verified by looking at the lineup of hometeam and visit team during those seasons.

    Additionally, there are 10 rows that lack positions, numbers, or even an identifier to which 
    team the players were on, totalling 4 undocumented players. These rows will be removed as well.   
    """
    df = df.filter(pl.col("Game_Date").is_not_null())
    df = df.filter(pl.col("Position").is_not_null()) # Removes 4 players and 10 rows where the position and player number were not recorded, none associated with injuries

    return df
    

In [138]:
concussion = cancellation_cleaner(concussion)
concussion.head(2)

PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Score_Home_Visiting,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,FieldType,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured
str,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,i64,str,i32
"""33790-509-2050""",33790,509,2050,"""OLB""","""50""","""PLT""","""10/29/2017""","""CIN 42""",2,"""Punt""","""CIN""","""10 - 10""","""Cincinnati""","""13:00""","""CIN""","""IND""","""Outdoor""","""Synthetic""","""Cloudy""",39.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0
"""33410-372-2482""",33410,372,2482,"""CB""","""32""","""VR""","""08/24/2017""","""CAR 44""",3,"""Punt""","""JAX""","""10 - 10""","""Jacksonville""","""19:30""","""JAX""","""CAR""","""Outdoor""","""Natural""",,84.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0
"""31364-86-1283""",31364,86,1283,"""OLB""","""90""","""PDR4""","""09/18/2016""","""KC 45""",2,"""Punt""","""KC""","""7 - 3""","""Houston""","""12:00""","""HST""","""KC""","""Indoor""","""Synthetic""","""Partly Cloudy""",91.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0
"""31793-500-2927""",31793,500,2927,"""RB""","""35""","""PDL2""","""10/22/2017""","""CIN 38""",4,"""Punt""","""CIN""","""26 - 14""","""Pittsburgh""","""16:25""","""PIT""","""CIN""","""Outdoor""","""Natural""","""Sunny""",79.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0
"""30405-264-734""",30405,264,734,"""P""","""18""","""P""","""12/11/2016""","""MIN 23""",1,"""Punt""","""MIN""","""0 - 6""","""Jacksonville""","""13:00""","""JAX""","""MIN""","""Outdoor""","""Natural""",,71.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0


In [143]:
concussion.null_count()

PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Score_Home_Visiting,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,FieldType,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29416,17838,0,0,0,0,0,0


In [141]:
def score_splitter(df):
    ''' 
    Splits the string column from Score_Home_Visiting into two numeric columns for each of the scores. It also creates a column that calculates the difference. 
    '''
    import polars as pl # type: ignore

    df = df.with_columns([
        pl.col("Score_Home_Visiting").str.extract(r"(\d+)\s*-\s*(\d+)", 1).cast(pl.Int16).alias("Home_Score")
        , pl.col("Score_Home_Visiting").str.extract(r"(\d+)\s*-\s*(\d+)", 2).cast(pl.Int16).alias("Visiting_Score") # Find difference between scores
        ])

    df = df.with_columns([
        (pl.col("Home_Score") - pl.col("Visiting_Score")).cast(pl.Int16).alias("Score_Difference")
        ])
    
    df = df.drop("Score_Home_Visiting")
    
    print(f"The scores have been fixed. Just not how Pete Rose would fix them.")
    return df

In [142]:
score_splitter(concussion).head()

The scores have been fixed. Just not how Pete Rose would fix them.


PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,FieldType,GameWeather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured,Home_Score,Visiting_Score,Score_Difference
str,i64,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,f64,str,str,str,i64,str,i32,i16,i16,i16
"""33790-509-2050""",33790,509,2050,"""OLB""","""50""","""PLT""","""10/29/2017""","""CIN 42""",2,"""Punt""","""CIN""","""Cincinnati""","""13:00""","""CIN""","""IND""","""Outdoor""","""Synthetic""","""Cloudy""",39.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,10,10,0
"""33410-372-2482""",33410,372,2482,"""CB""","""32""","""VR""","""08/24/2017""","""CAR 44""",3,"""Punt""","""JAX""","""Jacksonville""","""19:30""","""JAX""","""CAR""","""Outdoor""","""Natural""",,84.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,10,10,0
"""31364-86-1283""",31364,86,1283,"""OLB""","""90""","""PDR4""","""09/18/2016""","""KC 45""",2,"""Punt""","""KC""","""Houston""","""12:00""","""HST""","""KC""","""Indoor""","""Synthetic""","""Partly Cloudy""",91.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,7,3,4
"""31793-500-2927""",31793,500,2927,"""RB""","""35""","""PDL2""","""10/22/2017""","""CIN 38""",4,"""Punt""","""CIN""","""Pittsburgh""","""16:25""","""PIT""","""CIN""","""Outdoor""","""Natural""","""Sunny""",79.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,26,14,12
"""30405-264-734""",30405,264,734,"""P""","""18""","""P""","""12/11/2016""","""MIN 23""",1,"""Punt""","""MIN""","""Jacksonville""","""13:00""","""JAX""","""MIN""","""Outdoor""","""Natural""",,71.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,0,6,-6


In [3]:
import polars as pl
video = pl.read_csv("F:/Data/NFL-Punt-Analytics-Competition/video_footage-injury.csv")
review = pl.read_csv("F:/Data/NFL-Punt-Analytics-Competition/video_review.csv")

In [4]:
video.head()

season,Type,Week,Home_team,Visit_Team,Qtr,PlayDescription,gamekey,playid,PREVIEW LINK (5000K)
i64,str,i64,str,str,i64,str,i64,i64,str
2016,"""Pre""",2,"""Chicago Bears""","""Denver Broncos""",3,"""(3:44) (Punt formation) P.O'Do…",56840,3129,"""http://a.video.nfl.com//films/…"
2016,"""Pre""",3,"""Tennessee Titans""","""Carolina Panthers""",3,"""(5:52) (Punt formation) K.Redf…",56856,2587,"""http://a.video.nfl.com//films/…"
2016,"""Pre""",3,"""Washington Redskins""","""New York Jets""",1,"""(4:46) L.Edwards punts 51 yard…",56864,538,"""http://a.video.nfl.com//films/…"
2016,"""Pre""",4,"""New York Jets""","""New York Giants""",2,"""(8:29) B.Wing punts 44 yards t…",56880,1212,"""http://a.video.nfl.com//films/…"
2016,"""Pre""",5,"""Detroit Lions""","""Buffalo Bills""",1,"""(:38) C.Schmidt punts 46 yards…",56895,905,"""http://a.video.nfl.com//films/…"


In [5]:
review.head()

Season_Year,GameKey,PlayID,GSISID,Player_Activity_Derived,Turnover_Related,Primary_Impact_Type,Primary_Partner_GSISID,Primary_Partner_Activity_Derived,Friendly_Fire
i64,i64,i64,i64,str,str,str,str,str,str
2016,5,3129,31057,"""Tackling""","""No""","""Helmet-to-body""","""32482""","""Tackled""","""No"""
2016,21,2587,29343,"""Blocked""","""No""","""Helmet-to-helmet""","""31059""","""Blocking""","""No"""
2016,29,538,31023,"""Tackling""","""No""","""Helmet-to-body""","""31941""","""Tackled""","""No"""
2016,45,1212,33121,"""Tackling""","""No""","""Helmet-to-body""","""28249""","""Tackled""","""No"""
2016,54,1045,32444,"""Blocked""","""No""","""Helmet-to-body""","""31756""","""Blocked""","""Yes"""


In [6]:
unique_plays = review["PlayID"].unique().to_list()
time_df = pl.DataFrame({
    "PlayID": unique_plays
    , "Time": [None]*len(unique_plays)
})

In [7]:
time_df

PlayID,Time
i64,null
183,
538,
602,
733,
905,
…,…
3509,
3609,
3630,
3663,


In [8]:
def update_time(play_id, new_time):
    global time_df
    time_df = time_df.with_columns(
        pl.when(pl.col("PlayID") == play_id)
        .then(new_time)
        .otherwise(pl.col("Time"))
        .alias("Time")
    )

In [10]:
update_time(2902, 9.7)
update_time(3609, 21.6)
update_time(3312, 12.9)
update_time(1988, 25.5)
update_time(3509, 15.5)
update_time(3746, 27.2)
update_time(3663, 23.6)
update_time(3278, 22.6)
update_time(3468, 12.7)
update_time(1526, 13.3)
update_time(2072, 23.7)
update_time(978, 21.1)
update_time(3630, 35.6)

In [12]:
df = time_df

In [13]:
update_time(733, 20.7)
update_time(538, 15.8)
update_time(2489, 20.2)
update_time(2792, 40.7)
update_time(2341, 15.3)
update_time(2918, 15.9)
update_time(1976, 19.5)
update_time(2764, 11.1)
update_time(1407, 27.9)
update_time(2342, 13.1)
update_time(1088, 35.5)
update_time(2667, 24.3)
update_time(1683, 14.1)
update_time(1526, 15.1)
update_time(2208, 20.0)
update_time(602, 33.3)
update_time(183, 13.2)
update_time(2792, 19.6)
update_time(1262, 22.6)



Some of these plays show the players making contact more than one time, but there is typically only a single time when there is a distinct trajectory change that signifies it was unanticipated contact for both players. Upon video review for the plays in question, this seems to confirm such. 

In [14]:
time_df

PlayID,Time
i64,f64
183,13.2
538,15.8
602,33.3
733,20.7
905,
…,…
3509,15.5
3609,21.6
3630,35.6
3663,23.6


In [15]:
time_df = time_df.filter(pl.col("Time").is_not_null())

In [16]:
time_df

PlayID,Time
i64,f64
183,13.2
538,15.8
602,33.3
733,20.7
978,21.1
…,…
3509,15.5
3609,21.6
3630,35.6
3663,23.6


In [19]:
time_df.write_csv("F:/Data/Clean_Data/impact_times.csv")


In [20]:
time_df.filter(pl.col("Time")>=44)

PlayID,Time
i64,f64


In [26]:
time_df['Time'].min()

9.7

The longest recorded play in the NFL is 44 seconds, but numerous plays continued to run the clock and exceed this, so it is impossible to have plays that go beyond 44 seconds. I don't know how long the clock elapsed before the play started, but it seems like it was between 5 and 10 seconds. What I am going to use as the earliest start time is 5 seconds less than the minimum time from the injury list, which is 9.7 seconds. So we can set the minium time to be 4.5 seconds. 

What I am aiming to do for the control data is cutting into 10 second samples (100 points). For the injuries, this will include 5 seconds before and 5 seconds after. Since there is no injury in the other plays, I will need to randomly find a start time between 4.5 seconds and 34 seconds. 