# Extract and Transform the Injury and Concussion Data
This notebook contains all of the functions used to extract and transform both the injury and the concussion data, and lists the .py files in which they are stored. The data itself is processed by running those .py files, NOT the cells within this Jupyter notebook.

In [None]:
def data_loader(dataset): 
    """
    Read one of the raw, non-tracking CSV tables into an eager polars DataFrame.

    Parameters
    ----------
    dataset : str
        Name of the table to load. Injury tables: 'plays', 'injuries'.
        Concussion tables: 'role_data', 'punt_data', 'play_information',
        'game_data', 'video_review'.

    Returns
    -------
    polars.DataFrame or None
        The table as read by ``pl.read_csv``. If reading fails for any reason
        the error is printed and ``None`` is returned instead of raising.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the names listed above.
    """
    injury_dir = "F:/Data/nfl-playing-surface-analytics"
    concussion_dir = "F:/Data/NFL-Punt-Analytics-Competition"

    # One entry per supported table; adding a dataset only needs a new row here.
    dataset_paths = {
        # Injury Datasets
        'plays': f"{injury_dir}/PlayList.csv",
        'injuries': f"{injury_dir}/InjuryRecord.csv",
        # Concussion Datasets
        'role_data': f"{concussion_dir}/play_player_role_data.csv",
        'punt_data': f"{concussion_dir}/player_punt_data.csv",
        'play_information': f"{concussion_dir}/play_information.csv",
        'game_data': f"{concussion_dir}/game_data.csv",
        'video_review': f"{concussion_dir}/video_review.csv",
    }

    # Validate before importing polars so that a bad name fails immediately.
    valid_datasets = list(dataset_paths)
    if dataset not in valid_datasets: 
        raise ValueError(f"Invalid dataset name '{dataset}'. Valid options are: {valid_datasets}")

    import polars as pl  # type: ignore

    try:
        return pl.read_csv(dataset_paths[dataset])
    except Exception as e: 
        # Deliberate best-effort: callers receive None and decide how to proceed.
        print(f"An error occurred while loading the dataset '{dataset}': {e}")
        return None

In [None]:
def table_joiner(analysis):
    """
    Load and join the non-NGS tables for one analysis into a single DataFrame.

    * 'injury': left-joins the injury records onto the play list by PlayKey and
      keeps a fixed set of columns, cast to explicit dtypes.
    * 'concussion': starting from the role data, left-joins punt data, play
      information, game data and video review; builds the string keys
      "PlayKey" and "OpponentKey" (``<GSISID>-<GameKey>-<PlayID>``); converts
      Primary_Partner_GSISID to Int64 after mapping "Unclear" to "00000";
      selects the columns of interest and drops exact duplicate rows.

    Parameters
    ----------
    analysis : str
        Either 'injury' or 'concussion'.

    Returns
    -------
    polars.DataFrame or None
        The joined table, or ``None`` if loading or joining failed (the error
        is printed rather than raised).

    Raises
    ------
    ValueError
        If ``analysis`` is not 'injury' or 'concussion'.
    """
    valid_analyses = ['injury', 'concussion']
    if analysis not in valid_analyses: 
        raise ValueError(f"Invalid analysis name '{analysis}'. Valid options are: {valid_analyses}")

    import polars as pl # type: ignore
    # from DataHandler import data_loader

    # Source tables per analysis, in the order they are loaded.
    source_tables = {
        'injury': ['plays', 'injuries'],
        'concussion': ['role_data', 'punt_data', 'play_information', 'game_data', 'video_review'],
    }

    try: 
        tables = {name: data_loader(name) for name in source_tables[analysis]}

        # data_loader returns None on failure; report that directly instead of
        # letting it surface later as an AttributeError on None.
        failed = [name for name, table in tables.items() if table is None]
        if failed:
            raise RuntimeError(f"could not load source table(s): {failed}")

        # Injury Data Loader
        if analysis == 'injury':
            df = (
                tables['plays']
                .join(tables['injuries'], on="PlayKey", how='left')
                .select([
                    pl.col("PlayKey").cast(pl.Utf8),
                    pl.col("Position").cast(pl.Utf8),
                    pl.col("StadiumType").cast(pl.Utf8),
                    pl.col("FieldType").cast(pl.Utf8),
                    pl.col("Temperature").cast(pl.Int16),
                    pl.col("Weather").cast(pl.Utf8),
                    pl.col("PlayType").cast(pl.Utf8),
                    pl.col("BodyPart").cast(pl.Utf8),
                    pl.col("DM_M1").cast(pl.Int8),
                    pl.col("DM_M7").cast(pl.Int8),
                    pl.col("DM_M28").cast(pl.Int8),
                    pl.col("DM_M42").cast(pl.Int8),
                ])
            )

        # Concussion Data Loader
        else:
            df = (
                tables['role_data']
                .join(tables['punt_data'], on="GSISID", how="left", suffix="_punt")
                .join(tables['play_information'], on=["GameKey", "PlayID"], how="left", suffix="_play")
                .join(tables['game_data'], on="GameKey", how="left", suffix="_game")
                .join(tables['video_review'], on=["GameKey", "PlayID", "GSISID"], how="left", suffix="_video")
                # All three expressions read the columns as they are BEFORE this
                # call, so OpponentKey is built from the raw partner id.
                .with_columns([
                    pl.concat_str([
                        pl.col("GSISID").cast(pl.Utf8),
                        pl.lit("-"),
                        pl.col("GameKey").cast(pl.Utf8),
                        pl.lit("-"),
                        pl.col("PlayID").cast(pl.Utf8),
                    ]).alias("PlayKey"),
                    pl.concat_str([
                        pl.col("Primary_Partner_GSISID").cast(pl.Utf8),
                        pl.lit("-"),
                        pl.col("GameKey").cast(pl.Utf8),
                        pl.lit("-"),
                        pl.col("PlayID").cast(pl.Utf8),
                    ]).alias("OpponentKey"),
                    # "Unclear" cannot be cast to an integer, so it becomes 0.
                    pl.when(pl.col("Primary_Partner_GSISID") == "Unclear")
                        .then(pl.lit("00000"))
                        .otherwise(pl.col("Primary_Partner_GSISID"))
                        .cast(pl.Int64)
                        .alias("Primary_Partner_GSISID"),
                ])
                .select([
                    "PlayKey",
                    "GSISID",
                    "GameKey",
                    "PlayID",
                    "Position",
                    'Number',
                    "Role",
                    "Game_Date",
                    "YardLine",
                    "Quarter",
                    "Play_Type",
                    "Poss_Team",
                    "Score_Home_Visiting",
                    "Game_Site",
                    "Start_Time",
                    "HomeTeamCode",
                    "VisitTeamCode",
                    "StadiumType",
                    "Turf",
                    "GameWeather",
                    "Temperature",
                    "Player_Activity_Derived",
                    "Primary_Impact_Type",
                    "Primary_Partner_Activity_Derived",
                    "Primary_Partner_GSISID",
                    "OpponentKey",
                ])
                .unique()
            )

        print("Tables are holding hands. How cute.")
        return df
    
    except Exception as e: 
        # Deliberate best-effort: report the problem and hand back None.
        print(f"An error occurred while processing the '{analysis}' analysis: {e}.")
        return None

In [None]:
def injury_interpolator(df, analysis): 
    """
    Add injury indicator columns and fill the nulls left by the left joins.

    * 'injury': adds IsInjured (1 when DM_M1 == 1) and IsSevere (1 when
      DM_M28 == 1), both Int8; drops rows with a null PlayType; fills BodyPart
      with "No_Injury" and the DM_M* columns with 0.
    * 'concussion': adds IsInjured (1 when Primary_Impact_Type is not null,
      Int8 to match the injury branch) and fills the video-review columns with
      "No_Injury" / 0 / "None". No IsSevere column is created for this analysis.

    Parameters
    ----------
    df : polars.DataFrame
        Joined table for the given analysis.
    analysis : str
        Either 'injury' or 'concussion'.

    Returns
    -------
    polars.DataFrame

    Raises
    ------
    ValueError
        If ``analysis`` is not 'injury' or 'concussion'. Previously an unknown
        value returned the frame unchanged while still reporting success.
    """
    valid_analyses = ['injury', 'concussion']
    if analysis not in valid_analyses:
        raise ValueError(f"Invalid analysis name '{analysis}'. Valid options are: {valid_analyses}")

    import polars as pl

    if analysis == 'injury':
        # Flags are computed first; a null DM_* value compares as null and
        # therefore falls through to the 0 branch.
        df = df.with_columns([ 
            pl.when(pl.col("DM_M1") == 1).then(1).otherwise(0).cast(pl.Int8).alias("IsInjured"),
            pl.when(pl.col("DM_M28") == 1).then(1).otherwise(0).cast(pl.Int8).alias("IsSevere"),
        ])
        
        # 0.14% of rows did not have a play type, and ALL of these were non-injury plays, so they were removed
        df = df.filter(pl.col('PlayType').is_not_null())
        df = df.with_columns([ 
            pl.col("BodyPart").fill_null("No_Injury"),
            pl.col(["DM_M1", "DM_M7", "DM_M28", "DM_M42"]).fill_null(0),
        ])

    else:
        # IsInjured reads Primary_Impact_Type as it was before the fill below,
        # because every expression in one with_columns call sees the input frame.
        df = df.with_columns([ 
            pl.when(pl.col("Primary_Impact_Type").is_not_null()).then(1).otherwise(0).cast(pl.Int8).alias("IsInjured"),
            pl.col("Player_Activity_Derived").fill_null("No_Injury"),
            pl.col("Primary_Impact_Type").fill_null("No_Injury"),
            pl.col("Primary_Partner_Activity_Derived").fill_null("No_Injury"),
            pl.col("Primary_Partner_GSISID").fill_null(0),
            pl.col("OpponentKey").fill_null("None"),
        ])
        
    print("Injury columns have been added.")
    return df

In [None]:
def stadium_cleaner(df):
    """
    Normalize every StadiumType label to either 'Indoor' or 'Outdoor'.

    Author's notes: some dome stadiums were listed as open or closed for
    different games, and these were accounted for in the mapping. All games
    with dates were checked to ensure null values were indeed outdoor games,
    which is why null is mapped to 'Outdoor'. Labels that do not appear in the
    mapping are left as they are.
    """
    import polars as pl  # type: ignore

    indoor_labels = [
        'Indoors',
        'Closed Dome',
        'Domed, closed',
        'Dome',
        'Indoor',
        'Domed',
        'Retr. Roof-Closed',
        'Retractable Roof',
        'Indoor, Roof Closed',
        'Retr. Roof - Closed',
        'Dome, closed',
        'Retr. Roof Closed',
        'Indoor, non-retractable roof',
        'Retr. roof - closed',
        'Indoor, fixed roof ',
        'Indoor, Non-Retractable Dome',
        'Indoor, Fixed Roof',
        'Indoor, fixed roof',
        'Indoors (Domed)',
    ]
    outdoor_labels = [
        'Outdoor',
        'Oudoor',
        'Outdoors',
        'Open',
        'Outdoor Retr Roof-Open',
        'Ourdoor',
        'Bowl',
        'Outddors',
        'Retr. Roof-Open',
        'Indoor, Open Roof',
        'Domed, Open',
        'Domed, open',
        'Heinz Field',
        'Cloudy',
        'Retr. Roof - Open',
        'Outdor',
        'Outside',
        'outdoor',
        'Outdoors ',
        None,
    ]

    # Raw label -> normalized label
    stadium_dict = {label: 'Indoor' for label in indoor_labels}
    stadium_dict.update({label: 'Outdoor' for label in outdoor_labels})

    normalized = pl.col("StadiumType").replace(stadium_dict)
    df = df.with_columns(normalized)

    print("Someone managed to clean up those stadiums!")
    return df

In [None]:
def weather_cleaner(df):
    """
    Collapse the free-text weather descriptions into a few main groups.

    Steps:
      1. If the frame has a "GameWeather" column (concussion dataset), rename
         it to "Weather".
      2. Replace known descriptions with one of: 'Clear', 'Cloudy', 'Rain',
         'Snow', 'Windy', 'Hazy/Fog', 'Indoor'. Descriptions not in the mapping
         are left unchanged.
      3. Fill remaining nulls: 'Indoor' where StadiumType is "Indoor", 'Clear'
         where Temperature is above 70, otherwise 'Cloudy'.

    Expects "StadiumType" to be normalized already (it is compared against
    "Indoor") and a numeric "Temperature" column.

    'Clear and Cool' and 'Clear and cold' are mapped to 'Clear' so that they
    agree with their differently-capitalized twins 'Clear and cool' and
    'Clear and Cold'; they were previously mapped to 'Cloudy'.
    """
    import polars as pl # type: ignore

    # If using the concussion dataset, rename the GameWeather column to Weather
    if "GameWeather" in df.columns:
        df = df.rename({"GameWeather": "Weather"})

    weather_dict = {
        'Clear and warm': 'Clear',
        'Mostly Cloudy': 'Cloudy',
        'Sunny': 'Clear',
        'Clear': 'Clear',
        'Cloudy': 'Cloudy',
        'Cloudy, fog started developing in 2nd quarter': 'Hazy/Fog',
        'Rain': 'Rain',
        'Partly Cloudy': 'Cloudy',
        'Mostly cloudy': 'Cloudy',
        'Cloudy and cold': 'Cloudy',
        'Cloudy and Cool': 'Cloudy',
        'Rain Chance 40%': 'Rain',
        'Controlled Climate': 'Indoor',
        'Sunny and warm': 'Clear',
        'Partly cloudy': 'Cloudy',
        'Clear and Cool': 'Clear',
        'Clear and cold': 'Clear',
        'Sunny and cold': 'Clear',
        'Indoor': 'Indoor',
        'Partly Sunny': 'Clear',
        'N/A (Indoors)': 'Indoor',
        'Mostly Sunny': 'Clear',
        'Indoors': 'Indoor',
        'Clear Skies': 'Clear',
        'Partly sunny': 'Clear',
        'Showers': 'Rain',
        'N/A Indoor': 'Indoor',
        'Sunny and clear': 'Clear',
        'Snow': 'Snow',
        'Scattered Showers': 'Rain',
        'Party Cloudy': 'Cloudy',
        'Clear skies': 'Clear',
        'Rain likely, temps in low 40s.': 'Rain',
        'Hazy': 'Hazy/Fog',
        'Partly Clouidy': 'Cloudy',
        'Sunny Skies': 'Clear',
        'Overcast': 'Cloudy',
        'Cloudy, 50% change of rain': 'Cloudy',
        'Fair': 'Clear',
        'Light Rain': 'Rain',
        'Partly clear': 'Clear',
        'Mostly Coudy': 'Cloudy',
        '10% Chance of Rain': 'Cloudy',
        'Cloudy, chance of rain': 'Cloudy',
        'Heat Index 95': 'Clear',
        'Sunny, highs to upper 80s': 'Clear',
        'Sun & clouds': 'Cloudy',
        'Heavy lake effect snow': 'Snow',
        'Mostly sunny': 'Clear',
        'Cloudy, Rain': 'Rain',
        'Sunny, Windy': 'Windy',
        'Mostly Sunny Skies': 'Clear',
        'Rainy': 'Rain',
        '30% Chance of Rain': 'Rain',
        'Cloudy, light snow accumulating 1-3"': 'Snow',
        'cloudy': 'Cloudy',
        'Clear and Sunny': 'Clear',
        'Coudy': 'Cloudy',
        'Clear and sunny': 'Clear',
        'Clear to Partly Cloudy': 'Clear',
        'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.': 'Windy',
        'Rain shower': 'Rain',
        'Cold': 'Clear',
        'Partly cloudy, lows to upper 50s.': 'Cloudy',
        'Scattered thunderstorms': 'Rain',
        'CLEAR': 'Clear',
        'Partly CLoudy': 'Cloudy',
        'Chance of Showers': 'Rain',
        'Snow showers': 'Snow',
        'Clear and Cold': 'Clear',
        'Cloudy with rain': 'Rain',
        'Sunny intervals': 'Clear',
        'Clear and cool': 'Clear',
        'Cloudy, Humid, Chance of Rain': 'Rain',
        'Cloudy and Cold': 'Cloudy',
        'Cloudy with patches of fog': 'Hazy/Fog',
        'Controlled': 'Indoor',
        'Sunny and Clear': 'Clear',
        'Cloudy with Possible Stray Showers/Thundershowers': 'Rain',
        'Suny': 'Clear',
        'T-Storms': 'Rain',
        'Sunny and cool': 'Clear',
        'Cloudy, steady temps': 'Cloudy',
        'Hazy, hot and humid': 'Hazy/Fog',
        'Sunny Intervals': 'Clear',
        'Partly Cloudy, Chance of Rain 80%': 'Rain',
        'Mostly Clear. Gusting ot 14.': 'Windy',
        'Mostly CLoudy': 'Cloudy',
        'Snow Showers, 3 to 5 inches expected.': 'Snow',
    }

    # Standardizes the weather to a few main types
    df = df.with_columns(pl.col("Weather").replace(weather_dict))

    # Null handling - all null weather conditions for indoor stadiums are filled "Indoor"
    df = df.with_columns(
        pl.when(pl.col("StadiumType") == "Indoor")
        .then(pl.col("Weather").fill_null("Indoor"))
        .otherwise(pl.col("Weather"))
        .alias("Weather")
    )

    # For the non-indoor games with null values for weather, to maintain the
    # percentage of games that were clear/cloudy, temperature was used as a
    # divider, above and below 70 degrees. Indoor rows are unaffected here
    # because their nulls were already filled above.
    df = df.with_columns(
        pl.when(pl.col("Temperature") > 70)
        .then(pl.col("Weather").fill_null("Clear"))
        .otherwise(pl.col("Weather"))
        .alias("Weather")
    )
    df = df.with_columns(pl.col("Weather").fill_null("Cloudy"))

    print("Looks like the weather has been cleared up.")
    return df


In [None]:
def turf_cleaner(df):
    """
    Rename the "Turf" column to "FieldType" and normalize its labels to either
    'Natural' or 'Synthetic'.

    Null is mapped to 'Natural': per the author's note, the only field with
    null values is Miami Gardens, which has a natural surface. Labels that do
    not appear in the mapping are left as they are.
    """
    import polars as pl

    natural_labels = [
        'Grass',
        'Natural Grass',
        'grass',
        'Natural grass',
        'Natural',
        'Natrual Grass',
        'Natural Grass ',
        'Naturall Grass',
        None,  # The only field with null values is Miami Gardens, which has Natural
    ]
    synthetic_labels = [
        'Field Turf',
        'Artificial',
        'FieldTurf',
        'DD GrassMaster',
        'A-Turf Titan',
        'UBU Sports Speed S5-M',
        'UBU Speed Series S5-M',
        'Artifical',
        'UBU Speed Series-S5-M',
        'FieldTurf 360',
        'Field turf',
        'Synthetic',
        'FieldTurf360',
    ]

    # Raw label -> normalized label
    turf_dict = {label: 'Natural' for label in natural_labels}
    turf_dict.update({label: 'Synthetic' for label in synthetic_labels})

    renamed = df.rename({"Turf": "FieldType"})
    return renamed.with_columns(pl.col("FieldType").replace(turf_dict))

In [None]:
def cancellation_cleaner(df):
    """
    Keep only rows that have both a Game_Date and a Position.

    Author's notes: 44 rows have no Game_Date; they correspond to games that
    were canceled, which was verified against the home/visiting team lineups
    for those seasons. A further 10 rows (4 undocumented players) lack
    position, number, and any team identifier; none of them are associated
    with injuries.
    """
    import polars as pl

    has_game_date = pl.col("Game_Date").is_not_null()
    has_position = pl.col("Position").is_not_null()

    # A row survives only when both fields are present.
    return df.filter(has_game_date & has_position)
    

In [None]:
def score_splitter(df):
    ''' 
    Replace the string column Score_Home_Visiting with three Int16 columns:
    Home_Score, Visiting_Score and Score_Difference (home minus visiting).
    '''
    import polars as pl # type: ignore

    # "<home> - <visiting>", with optional whitespace around the dash
    score_pattern = r"(\d+)\s*-\s*(\d+)"
    raw_score = pl.col("Score_Home_Visiting")

    home_score = raw_score.str.extract(score_pattern, 1).cast(pl.Int16).alias("Home_Score")
    visiting_score = raw_score.str.extract(score_pattern, 2).cast(pl.Int16).alias("Visiting_Score")
    with_scores = df.with_columns([home_score, visiting_score])

    # Find difference between scores (needs the two columns created above)
    score_gap = pl.col("Home_Score") - pl.col("Visiting_Score")
    with_difference = with_scores.with_columns([
        score_gap.cast(pl.Int16).alias("Score_Difference")
    ])

    result = with_difference.drop("Score_Home_Visiting")

    print("The scores have been fixed. Just not how Pete Rose would fix them.")
    return result

In [None]:
def csv_writer(df, new_file_name, output_dir='F:/Data/Clean_Data'):
    """
    Write table to local file as temporary until all cleaning and transformation is done.

    Parameters
    ----------
    df : object with a ``write_csv(path)`` method (e.g. a polars DataFrame)
        Table to write.
    new_file_name : str
        File name without extension; ".csv" is appended.
    output_dir : str, optional
        Directory to write into. It is created if it does not exist.
        Defaults to the project's clean-data folder.

    Returns
    -------
    None
        An existing file at the target path is overwritten.
    """
    import os

    # Make sure the target folder exists so the write cannot fail on a fresh machine.
    os.makedirs(output_dir, exist_ok=True)
    full_path = f"{output_dir}/{new_file_name}.csv"

    # write_csv replaces any existing file, so no separate exists/remove step is needed.
    df.write_csv(full_path)
    print(f"New file has been written to {full_path}")

In [None]:
# def create_event_enum():
#     """
#     Create an Enum for known events.
#     """
#     import polars as pl # type: ignore

#     return pl.Enum([
#         "ball_snap"
#       ,  "drop_kick"
#       ,  "end_path"
#       ,  "extra_point"
#       ,  "extra_point_attempt"
#       ,  "extra_point_blocked"
#       ,  "extra_point_fake"
#       ,  "extra_point_missed"
#       ,  "fair_catch"
#       ,  "field_goal"
#       ,  "field_goal_attempt"
#       ,  "field_goal_blocked"
#       ,  "field_goal_fake"
#       ,  "field_goal_missed"
#       ,  "field_goal_play"
#       ,  "first_contact"
#       ,  "free_kick"
#       ,  "free_kick_play"
#       ,  "fumble"
#       ,  "fumble_defense_recovered"
#       ,  "fumble_offense_recovered"
#       ,  "handoff"
#       ,  "huddle_break_offense"
#       ,  "huddle_start_offense"
#       ,  "kick_received"
#       ,  "kick_recovered"
#       ,  "kickoff"
#       ,  "kickoff_land"
#       ,  "kickoff_play"
#       ,  "lateral"
#       ,  "line_set"
#       ,  "man_in_motion"
#       ,  "onside_kick"
#       ,  "out_of_bounds"
#       ,  "pass_arrived"
#       ,  "pass_forward"
#       ,  "pass_lateral"
#       ,  "pass_outcome_caught"
#       ,  "pass_outcome_incomplete"
#       ,  "pass_outcome_interception"
#       ,  "pass_outcome_touchdown"
#       ,  "pass_shovel"
#       ,  "pass_tipped"
#       ,  "penalty_accepted"
#       ,  "penalty_declined"
#       ,  "penalty_flag"
#       ,  "play_action"
#       ,  "play_submit"
#       ,  "punt"
#       ,  "punt_blocked"
#       ,  "punt_downed"
#       ,  "punt_fake"
#       ,  "punt_land"
#       ,  "punt_muffed"
#       ,  "punt_play"
#       ,  "punt_received"
#       ,  "qb_kneel"
#       ,  "qb_sack"
#       ,  "qb_spike"
#       ,  "qb_strip_sack"
#       ,  "run"
#       ,  "run_pass_option"
#       ,  "safety"
#       ,  "shift"
#       ,  "snap_direct"
#       ,  "tackle"
#       ,  "timeout"
#       ,  "timeout_away"
#       ,  "timeout_booth_review"
#       ,  "timeout_halftime"
#       ,  "timeout_home"
#       ,  "timeout_injury"
#       ,  "timeout_quarter"
#       ,  "timeout_tv"
#       ,  "touchback"
#       ,  "touchdown"
#       ,  "two_minute_warning"
#       ,  "two_point_conversion"
#       ,  "two_point_play"
#       ,  "xp_fake"
#     ])

In [None]:
def data_shrinker(df, verbose=True):
    """
    Optimize memory usage of a Polars dataframe for both categorical and numeric data.

    * Integer columns are cast to the narrowest of Int8/Int16/Int32/Int64 that
      holds their observed min and max.
    * Float columns are cast to Float32 when their range fits, else Float64.
      The check is on range only, so Float64 values may lose precision.
    * String columns other than "PlayKey" whose unique-value ratio is below
      50% are cast to an Enum built from their sorted, non-null unique values.
    * Numeric columns with no non-null values, and string columns with no
      non-null values, are left untouched.

    Parameters
    ----------
    df : polars.DataFrame
    verbose : bool, default True
        Print the estimated size before and after, and the percent change.

    Returns
    -------
    polars.DataFrame
        Frame with the same columns and values but narrower dtypes.
    """
    import polars as pl
    import numpy as np

    # Enable string cache to ensure consistent encoding
    pl.enable_string_cache()

    start_mem = df.estimated_size("mb")
    if verbose:
        print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    numeric_types = [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64]
    # Candidate integer widths, narrowest first; Int64 is the fallback.
    integer_targets = [(pl.Int8, np.int8), (pl.Int16, np.int16), (pl.Int32, np.int32)]
    n_rows = len(df)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type in numeric_types:
            c_min = df[col].min()
            c_max = df[col].max()
            # min/max are None for empty or all-null columns: nothing to size against.
            if c_min is None or c_max is None:
                continue

            if col_type.is_integer():
                target = pl.Int64
                for polars_type, numpy_type in integer_targets:
                    limits = np.iinfo(numpy_type)
                    if c_min >= limits.min and c_max <= limits.max:
                        target = polars_type
                        break
            else:
                limits = np.finfo(np.float32)
                if c_min >= limits.min and c_max <= limits.max:
                    target = pl.Float32
                else:
                    target = pl.Float64

            df = df.with_columns(pl.col(col).cast(target))

        elif col_type == pl.Utf8:
            # PlayKey is an identifier; an empty frame has no ratio to compute.
            if col == "PlayKey" or n_rows == 0:
                continue

            if df[col].n_unique() / n_rows < 0.5:  # If less than 50% unique values
                # Enum categories must not contain nulls, and unique() has no
                # stable order, so drop nulls and sort for a reproducible dtype.
                categories = df[col].unique().drop_nulls().sort()
                if len(categories) > 0:
                    df = df.with_columns(pl.col(col).cast(pl.Enum(categories)))

    end_mem = df.estimated_size("mb")
    if verbose:
        print(f'Memory usage after optimization is: {end_mem:.2f} MB')
        # Guard the percentage against a zero-sized starting frame.
        if start_mem > 0:
            print(f'Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%')

    return df


In [None]:
##### Primary Injury Cleaning Function #####
def clean_injuries():
    """
    Run the full cleaning pipeline for the surface injury data and return the
    cleaned dataframe.

    Order of steps: table_joiner -> injury_interpolator -> stadium_cleaner ->
    weather_cleaner -> data_shrinker. The csv_writer call that would save the
    result as 'qualitative_injuries' is commented out below, so nothing is
    written to disk by this function as it stands.
    """
    analysis = "injury"
    df = injury_interpolator(table_joiner(analysis), analysis)

    # Remaining steps each take the frame and hand back the transformed frame.
    for cleaning_step in (stadium_cleaner, weather_cleaner, data_shrinker):
        df = cleaning_step(df)

    # csv_writer(df, "qualitative_injuries")
    # del df
    print('Injuries have been cleaned and dressed.')
    return df

In [None]:
##### Primary Concussion Cleaning Function #####
def clean_concussions(): 
    """
    Run the full cleaning pipeline for the concussion data and return the
    cleaned dataframe.

    Order of steps: table_joiner -> injury_interpolator -> stadium_cleaner ->
    weather_cleaner -> turf_cleaner -> cancellation_cleaner -> score_splitter
    -> data_shrinker. The csv_writer call that would save the result as
    'qualitative_concussions' is commented out below, so nothing is written to
    disk by this function as it stands.
    """
    analysis = "concussion"
    df = injury_interpolator(table_joiner(analysis), analysis)

    # Remaining steps each take the frame and hand back the transformed frame.
    remaining_steps = (
        stadium_cleaner,
        weather_cleaner,
        turf_cleaner,
        cancellation_cleaner,
        score_splitter,
        data_shrinker,
    )
    for cleaning_step in remaining_steps:
        df = cleaning_step(df)

    # csv_writer(df, "qualitative_concussions")
    # del df

    print('Concussions have been assessed and cleared for play.')
    return df

In [1]:
from DataHandler import *
from QualitativeCleaner import clean_concussions, clean_injuries

In [2]:
clean_injuries()

Tables are holding hands. How cute.
Injury columns have been added.
Someone managed to clean up those stadiums!
Looks like the weather has been cleared up.
Memory usage of dataframe is 14.00 MB
Memory usage after optimization is: 10.92 MB
Decreased by 22.0%
New Parquet file has been written to F:/Data/Clean_Data/qualitative_injuries.parquet
Injuries have been cleaned and dressed.


In [3]:
clean_concussions()

Tables are holding hands. How cute.
Injury columns have been added.
Someone managed to clean up those stadiums!
Looks like the weather has been cleared up.
The scores have been fixed. Just not how Pete Rose would fix them.
Memory usage of dataframe is 35.78 MB
Memory usage after optimization is: 22.08 MB
Decreased by 38.3%
New Parquet file has been written to F:/Data/Clean_Data/qualitative_concussions.parquet
Concussions have been assessed and cleared for play.


In [1]:
from DataHandler import *


In [2]:
# NOTE(review): 'qualitative_concussions' is not one of the names accepted by
# the data_loader defined earlier in this notebook (it would raise ValueError);
# presumably the DataHandler module's version supports it — confirm.
data_loader('qualitative_concussions').head()



PlayKey,GSISID,GameKey,PlayID,Position,Number,Role,Game_Date,YardLine,Quarter,Play_Type,Poss_Team,Game_Site,Start_Time,HomeTeamCode,VisitTeamCode,StadiumType,FieldType,Weather,Temperature,Player_Activity_Derived,Primary_Impact_Type,Primary_Partner_Activity_Derived,Primary_Partner_GSISID,OpponentKey,IsInjured,Home_Score,Visiting_Score,Score_Difference
str,i32,i16,i16,enum,enum,enum,enum,enum,i8,enum,enum,enum,enum,enum,enum,enum,enum,enum,f32,enum,enum,enum,i32,enum,i8,i8,i8,i8
"""28454-131-122""",28454,131,122,"""OLB""","""55""","""PDL2""","""10/09/2016""","""CLV 32""",1,"""Punt""","""CLV""","""Cleveland""","""13:00""","""CLV""","""NE""","""Outdoor""","""Natural""","""Clear""",58.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,0,0,0
"""28495-264-272""",28495,264,272,"""DE""","""94""","""PDR3""","""12/11/2016""","""JAX 14""",1,"""Punt""","""JAX""","""Jacksonville""","""13:00""","""JAX""","""MIN""","""Outdoor""","""Natural""","""Clear""",71.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,0,3,-3
"""32190-367-3608""",32190,367,3608,"""TE""","""82""","""PRG""","""08/26/2017""","""ARZ 17""",4,"""Punt""","""ARZ""","""Atlanta""","""19:00""","""ATL""","""ARZ""","""Indoor""","""Synthetic""","""Indoor""",70.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,6,24,-18
"""29529-546-2191""",29529,546,2191,"""CB""","""24""","""VLo""","""11/16/2017""","""TEN 25""",2,"""Punt""","""TEN""","""Pittsburgh""","""20:25""","""PIT""","""TEN""","""Outdoor""","""Natural""","""Cloudy""",40.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,13,7,6
"""30181-93-914""",30181,93,914,"""TE""","""84""","""PLW""","""09/18/2016""","""IND 30""",1,"""Punt""","""IND""","""Denver""","""14:25""","""DEN""","""IND""","""Outdoor""","""Natural""","""Clear""",85.0,"""No_Injury""","""No_Injury""","""No_Injury""",0,"""None""",0,3,3,0


In [3]:
# NOTE(review): 'qualitative_injuries' is not one of the names accepted by the
# data_loader defined earlier in this notebook (it would raise ValueError);
# presumably the DataHandler module's version supports it — confirm.
injuries = data_loader('qualitative_injuries')
injuries.head()

PlayKey,Position,StadiumType,FieldType,Temperature,Weather,PlayType,BodyPart,DM_M1,DM_M7,DM_M28,DM_M42,IsInjured,IsSevere
str,enum,enum,enum,i16,enum,enum,enum,i8,i8,i8,i8,i8,i8
"""26624-1-1""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0
"""26624-1-2""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0
"""26624-1-3""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0
"""26624-1-4""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0
"""26624-1-5""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0


In [5]:
3/3.5

0.8571428571428571