# Debugging Summaries

This notebook walks through the whole process, from Summary Data production through to ML prep.

### Start with Injury Cleaning

In [9]:
def clean_injury_qual():
    """
    Run the qualitative injury cleaning pipeline and return the cleaned frame.

    The steps run in a fixed order: join the tables, add the injury flag
    columns, then normalise stadium, play and weather labels, and finally
    shrink the dtypes with ``data_shrinker``. The parquet write is disabled
    here, so nothing is written to disk; the elapsed time is printed.
    """
    import os
    import time
    from DataHandler import data_shrinker

    began = time.time()
    dataset = "injury"
    # Target of the (currently disabled) parquet write below.
    injury_qual_path = "F:/Data/Processing_data/QualInjuries.parquet"

    cleaned = injury_interpolator(table_joiner(dataset), dataset)
    for cleaning_step in (stadium_cleaner, play_cleaner, weather_cleaner):
        cleaned = cleaning_step(cleaned)
    cleaned, _schema = data_shrinker(cleaned)
    # cleaned.write_parquet(injury_qual_path)

    execution_time = time.time() - began
    print(f'Injuries have been cleaned and dressed. Execution time: {execution_time} seconds.')
    return cleaned

In [3]:
def table_joiner(analysis):
    """
    Build the joined non-NGS table for one analysis.

    'injury' left-joins the injuries table onto plays by PlayKey and casts
    the kept columns. 'concussion' left-joins role, punt, play, game and
    video-review tables, derives PlayKey and OpponentKey strings, and drops
    duplicate rows.

    Raises ValueError for an unknown analysis name. Any other exception
    raised while loading or joining is printed and None is returned.
    """
    import polars as pl # type: ignore
    from DataHandler import data_loader

    valid_analyses = ['injury', 'concussion']
    if analysis not in valid_analyses:
        raise ValueError(f"Invalid dataset name '{analysis}'. Valid options are: {valid_analyses}")

    def _dash_key(leading_column):
        # "<leading>-<GameKey>-<PlayID>" assembled as a string expression.
        return pl.concat_str([
            pl.col(leading_column).cast(pl.Utf8),
            pl.lit("-"),
            pl.col("GameKey").cast(pl.Utf8),
            pl.lit("-"),
            pl.col("PlayID").cast(pl.Utf8),
        ])

    try:
        if analysis == 'injury':
            # Injury data: plays with the injury records attached.
            plays = data_loader('plays')
            injuries = data_loader('injuries')

            # Output columns, in order, with the dtype each one is cast to.
            column_types = [
                ("PlayKey", pl.Utf8),
                ("Position", pl.Utf8),
                ("StadiumType", pl.Utf8),
                ("FieldType", pl.Utf8),
                ("Temperature", pl.Int16),
                ("Weather", pl.Utf8),
                ("PlayType", pl.Utf8),
                ("BodyPart", pl.Utf8),
                ("DM_M1", pl.Int8),
                ("DM_M7", pl.Int8),
                ("DM_M28", pl.Int8),
                ("DM_M42", pl.Int8),
            ]
            joined = plays.join(injuries, on="PlayKey", how='left')
            df = joined.select([pl.col(name).cast(dtype) for name, dtype in column_types])

        else:
            # Concussion data: the name was validated above, so this is 'concussion'.
            role_data = data_loader('role_data')
            punt_data = data_loader('punt_data')
            play_information = data_loader('play_information')
            game_data = data_loader('game_data')
            video_review = data_loader('video_review')

            joined = role_data.join(
                punt_data,
                left_on="GSISID",
                right_on="GSISID",
                how="left",
                suffix="_punt",
            )
            joined = joined.join(
                play_information,
                left_on=["GameKey", "PlayID"],
                right_on=["GameKey", "PlayID"],
                how="left",
                suffix="_play",
            )
            joined = joined.join(
                game_data,
                left_on="GameKey",
                right_on="GameKey",
                how="left",
                suffix="_game",
            )
            joined = joined.join(
                video_review,
                left_on=["GameKey", "PlayID", "GSISID"],
                right_on=["GameKey", "PlayID", "GSISID"],
                how="left",
                suffix="_video",
            )

            # "Unclear" partner ids become 0 so the column can be cast to Int64.
            partner_id = (
                pl.when(pl.col("Primary_Partner_GSISID") == "Unclear")
                .then(pl.lit("00000"))
                .otherwise(pl.col("Primary_Partner_GSISID"))
                .cast(pl.Int64)
                .alias("Primary_Partner_GSISID")
            )
            # All three expressions read the pre-existing columns, so OpponentKey
            # is built from the original (string) partner id.
            keyed = joined.with_columns([
                _dash_key("GSISID").alias("PlayKey"),
                _dash_key("Primary_Partner_GSISID").alias("OpponentKey"),
                partner_id,
            ])

            kept_columns = [
                "PlayKey",
                "GSISID",
                "GameKey",
                "PlayID",
                "Position",
                'Number',
                "Role",
                "Game_Date",
                "YardLine",
                "Quarter",
                "Play_Type",
                "Poss_Team",
                "Score_Home_Visiting",
                "Game_Site",
                "Start_Time",
                "HomeTeamCode",
                "VisitTeamCode",
                "StadiumType",
                "Turf",
                "GameWeather",
                "Temperature",
                "Player_Activity_Derived",
                "Primary_Impact_Type",
                "Primary_Partner_Activity_Derived",
                "Primary_Partner_GSISID",
                "OpponentKey",
            ]
            df = keyed.select(kept_columns).unique()

        print("Tables are holding hands. How cute.")
        return df

    except Exception as e:
        # NOTE(review): callers receive None here and will fail later on the
        # first DataFrame method call.
        print(f"An error occurred while processing the '{analysis}' analysis: {e}.")
        return None

In [4]:
def injury_interpolator(df, analysis):
    """
    Add injury indicator columns and fill the injury-related nulls.

    For 'injury': IsInjured is 1 where DM_M1 == 1 and IsSevere is 1 where
    DM_M28 == 1 (both Int8); rows without a PlayType are dropped; BodyPart
    nulls become "No_Injury" and the DM_* nulls become 0.
    For 'concussion': IsInjured is 1 where Primary_Impact_Type is present,
    and the review columns get placeholder values in place of nulls.
    Any other analysis name leaves the frame untouched.
    """
    import polars as pl # type: ignore

    def _flag_when_one(source, name):
        # 1 where `source` equals 1, otherwise 0 (nulls included), as Int8.
        return pl.when(pl.col(source) == 1).then(1).otherwise(0).cast(pl.Int8).alias(name)

    if analysis == 'injury':
        day_flags = ["DM_M1", "DM_M7", "DM_M28", "DM_M42"]
        df = (
            df.with_columns([
                _flag_when_one("DM_M1", "IsInjured"),
                _flag_when_one("DM_M28", "IsSevere"),
            ])
            # Original author's note: 0.14% of rows had no play type, all of
            # them non-injury plays, so they are removed.
            .filter(pl.col('PlayType').is_not_null())
            .with_columns([
                pl.col("BodyPart").fill_null("No_Injury"),
                pl.col(day_flags).fill_null(0),
            ])
        )

    elif analysis == 'concussion':
        placeholder_columns = [
            "Player_Activity_Derived",
            "Primary_Impact_Type",
            "Primary_Partner_Activity_Derived",
        ]
        df = df.with_columns([
            # Evaluated against the unfilled column, so only reviewed plays get 1.
            pl.when(pl.col("Primary_Impact_Type").is_not_null()).then(1).otherwise(0).alias("IsInjured"),
            *[pl.col(name).fill_null("No_Injury") for name in placeholder_columns],
            pl.col("Primary_Partner_GSISID").fill_null(0),
            pl.col("OpponentKey").fill_null("None"),
        ])

    print("Injury columns have been added.")
    return df

In [5]:
def stadium_cleaner(df):
    """
    Normalize every StadiumType label to either 'Indoor' or 'Outdoor'.

    Raw labels (including misspellings and roof-state variants) are looked up
    in a fixed mapping; null stadium types are mapped to 'Outdoor'. Per the
    original author's note, dome stadiums listed as open or closed for
    different games were accounted for, and the null entries were checked
    against game records to confirm they were outdoor games.
    """
    import polars as pl  # type: ignore

    indoor_labels = [
        'Indoors',
        'Closed Dome',
        'Domed, closed',
        'Dome',
        'Indoor',
        'Domed',
        'Retr. Roof-Closed',
        'Retractable Roof',
        'Indoor, Roof Closed',
        'Retr. Roof - Closed',
        'Dome, closed',
        'Retr. Roof Closed',
        'Indoor, non-retractable roof',
        'Retr. roof - closed',
        'Indoor, fixed roof ',
        'Indoor, Non-Retractable Dome',
        'Indoor, Fixed Roof',
        'Indoor, fixed roof',
        'Indoors (Domed)',
    ]
    outdoor_labels = [
        'Outdoor',
        'Oudoor',
        'Outdoors',
        'Open',
        'Outdoor Retr Roof-Open',
        'Ourdoor',
        'Bowl',
        'Outddors',
        'Retr. Roof-Open',
        'Indoor, Open Roof',
        'Domed, Open',
        'Domed, open',
        'Heinz Field',
        'Cloudy',
        'Retr. Roof - Open',
        'Outdor',
        'Outside',
        'outdoor',
        'Outdoors ',
        None,  # missing stadium type is treated as outdoor
    ]

    stadium_dict = {label: 'Indoor' for label in indoor_labels}
    stadium_dict.update({label: 'Outdoor' for label in outdoor_labels})

    # Labels absent from the mapping pass through unchanged.
    normalized = pl.col("StadiumType").replace(stadium_dict)
    df = df.with_columns(normalized)

    print("Someone managed to clean up those stadiums!")
    return df

In [6]:
def play_cleaner(df):
    """
    Collapse the returned / not-returned kickoff and punt variants in PlayType
    into plain 'Kickoff' and 'Punt', and relabel '0' as 'Unknown'.
    """
    import polars as pl  # type: ignore

    # Target label -> raw labels that should collapse into it.
    collapsed = {
        'Kickoff': ('Kickoff Not Returned', 'Kickoff Returned'),
        'Punt': ('Punt Not Returned', 'Punt Returned'),
        'Unknown': ('0',),
    }
    play_dict = {raw: label for label, raws in collapsed.items() for raw in raws}

    relabelled = pl.col("PlayType").replace(play_dict)
    df = df.with_columns(relabelled)

    print("Plays have been set!")
    return df

In [7]:
def weather_cleaner(df):
    """
    Map the free-text weather descriptions onto a small set of categories.

    A GameWeather column (concussion dataset) is first renamed to Weather.
    Known descriptions are replaced through a fixed mapping; remaining nulls
    are then filled in three passes: 'Indoor' where StadiumType is 'Indoor',
    'Clear' where Temperature is above 70, and 'Cloudy' for everything else.

    Returns the frame with the cleaned Weather column.
    """
    import polars as pl # type: ignore

    # If using the concussion dataset, rename the GameWeather column to Weather
    if "GameWeather" in df.columns:
        df = df.rename({"GameWeather": "Weather"})

    weather_dict = {
        'Clear and warm': 'Clear',
        'Mostly Cloudy': 'Cloudy',
        'Sunny': 'Clear',
        'Clear': 'Clear',
        'Cloudy': 'Cloudy',
        'Cloudy, fog started developing in 2nd quarter': 'Hazy/Fog',
        'Rain': 'Rain',
        'Partly Cloudy': 'Cloudy',
        'Mostly cloudy': 'Cloudy',
        'Cloudy and cold': 'Cloudy',
        'Cloudy and Cool': 'Cloudy',
        'Rain Chance 40%': 'Rain',
        'Controlled Climate': 'Indoor',
        'Sunny and warm': 'Clear',
        'Partly cloudy': 'Cloudy',
        # These two were mapped to 'Cloudy', disagreeing with their other
        # capitalisations ('Clear and Cold', 'Clear and cool') further down.
        'Clear and Cool': 'Clear',
        'Clear and cold': 'Clear',
        'Sunny and cold': 'Clear',
        'Indoor': 'Indoor',
        'Partly Sunny': 'Clear',
        'N/A (Indoors)': 'Indoor',
        'Mostly Sunny': 'Clear',
        'Indoors': 'Indoor',
        'Clear Skies': 'Clear',
        'Partly sunny': 'Clear',
        'Showers': 'Rain',
        'N/A Indoor': 'Indoor',
        'Sunny and clear': 'Clear',
        'Snow': 'Snow',
        'Scattered Showers': 'Rain',
        'Party Cloudy': 'Cloudy',
        'Clear skies': 'Clear',
        'Rain likely, temps in low 40s.': 'Rain',
        'Hazy': 'Hazy/Fog',
        'Partly Clouidy': 'Cloudy',
        'Sunny Skies': 'Clear',
        'Overcast': 'Cloudy',
        'Cloudy, 50% change of rain': 'Cloudy',
        'Fair': 'Clear',
        'Light Rain': 'Rain',
        'Partly clear': 'Clear',
        'Mostly Coudy': 'Cloudy',
        '10% Chance of Rain': 'Cloudy',
        'Cloudy, chance of rain': 'Cloudy',
        'Heat Index 95': 'Clear',
        'Sunny, highs to upper 80s': 'Clear',
        'Sun & clouds': 'Cloudy',
        'Heavy lake effect snow': 'Snow',
        'Mostly sunny': 'Clear',
        'Cloudy, Rain': 'Rain',
        'Sunny, Windy': 'Windy',
        'Mostly Sunny Skies': 'Clear',
        'Rainy': 'Rain',
        '30% Chance of Rain': 'Rain',
        'Cloudy, light snow accumulating 1-3"': 'Snow',
        'cloudy': 'Cloudy',
        'Clear and Sunny': 'Clear',
        'Coudy': 'Cloudy',
        'Clear and sunny': 'Clear',
        'Clear to Partly Cloudy': 'Clear',
        'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.': 'Windy',
        'Rain shower': 'Rain',
        'Cold': 'Clear',
        'Partly cloudy, lows to upper 50s.': 'Cloudy',
        'Scattered thunderstorms': 'Rain',
        'CLEAR': 'Clear',
        'Partly CLoudy': 'Cloudy',
        'Chance of Showers': 'Rain',
        'Snow showers': 'Snow',
        'Clear and Cold': 'Clear',
        'Cloudy with rain': 'Rain',
        'Sunny intervals': 'Clear',
        'Clear and cool': 'Clear',
        'Cloudy, Humid, Chance of Rain': 'Rain',
        'Cloudy and Cold': 'Cloudy',
        'Cloudy with patches of fog': 'Hazy/Fog',
        'Controlled': 'Indoor',
        'Sunny and Clear': 'Clear',
        'Cloudy with Possible Stray Showers/Thundershowers': 'Rain',
        'Suny': 'Clear',
        'T-Storms': 'Rain',
        'Sunny and cool': 'Clear',
        'Cloudy, steady temps': 'Cloudy',
        'Hazy, hot and humid': 'Hazy/Fog',
        'Sunny Intervals': 'Clear',
        'Partly Cloudy, Chance of Rain 80%': 'Rain',
        'Mostly Clear. Gusting ot 14.': 'Windy',
        'Mostly CLoudy': 'Cloudy',
        'Snow Showers, 3 to 5 inches expected.': 'Snow',
    }

    # Standardizes the weather to a few main types
    df = df.with_columns(pl.col("Weather").replace(weather_dict))

    # Null handling - all null weather conditions for indoor stadiums are filled "Indoor"
    df = df.with_columns(
        pl.when(pl.col("StadiumType") == "Indoor")
        .then(pl.col("Weather").fill_null("Indoor"))
        .otherwise(pl.col("Weather"))
        .alias("Weather")
    )

    # For the non-indoor games with null values for weather, to maintain the
    # percentage of games that were clear/cloudy, temperature was used as a
    # divider, above and below 70 degrees
    df = df.with_columns(
        pl.when(pl.col("Temperature") > 70)
        .then(pl.col("Weather").fill_null("Clear"))
        .otherwise(pl.col("Weather"))
        .alias("Weather")
    )
    # Whatever is still null (70 degrees or below, or no temperature) is 'Cloudy'.
    df = df.with_columns(pl.col("Weather").fill_null("Cloudy"))

    print("Looks like the weather has been cleared up.")
    return df

In [8]:
from DataHandler import data_shrinker

In [10]:
# Run the notebook-defined injury cleaning pipeline; its progress messages are printed below.
injury_qual = clean_injury_qual()

Tables are holding hands. How cute.
Injury columns have been added.
Someone managed to clean up those stadiums!
Plays have been set!
Looks like the weather has been cleared up.
Memory usage of dataframe is 13.85 MB
Memory usage after optimization is: 10.92 MB
Decreased by 21.2%
Injuries have been cleaned and dressed. Execution time: 1.1065185070037842 seconds.


In [11]:
# Preview the first rows of the cleaned frame to check column names and dtypes.
injury_qual.head()

PlayKey,Position,StadiumType,FieldType,Temperature,Weather,PlayType,BodyPart,DM_M1,DM_M7,DM_M28,DM_M42,IsInjured,IsSevere
str,cat,cat,cat,i16,cat,cat,cat,i8,i8,i8,i8,i8,i8
"""26624-1-1""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0
"""26624-1-2""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0
"""26624-1-3""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0
"""26624-1-4""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0
"""26624-1-5""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0


## Retest direct from file 

In [1]:
from QualitativeCleaner import *
# Same pipeline, this time using the function imported from QualitativeCleaner.
iq = clean_injury_qual()
iq.head()

Tables are holding hands. How cute.
Injury columns have been added.
Someone managed to clean up those stadiums!
Plays have been set!
Looks like the weather has been cleared up.
Memory usage of dataframe is 13.85 MB
Memory usage after optimization is: 10.92 MB
Decreased by 21.2%
Injuries have been cleaned and dressed.


PlayKey,Position,StadiumType,FieldType,Temperature,Weather,PlayType,BodyPart,DM_M1,DM_M7,DM_M28,DM_M42,IsInjured,IsSevere
str,cat,cat,cat,i16,cat,cat,cat,i8,i8,i8,i8,i8,i8
"""26624-1-1""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0
"""26624-1-2""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0
"""26624-1-3""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0
"""26624-1-4""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0
"""26624-1-5""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0


## Continue with Concussion Cleaning

In [1]:
from QualitativeCleaner import *
# Run the concussion cleaning function imported from QualitativeCleaner and preview the result.
cq = clean_concussions()
cq.head()

Tables are holding hands. How cute.
Injury columns have been added.
Someone managed to clean up those stadiums!
Looks like the weather has been cleared up.
The scores have been fixed. Just not how Pete Rose would fix them.
Memory usage of dataframe is 21.77 MB
Memory usage after optimization is: 15.04 MB
Decreased by 30.9%
Concussions have been assessed and cleared for play.


PlayKey,Position,Role,Quarter,Play_Type,Poss_Team,Game_Site,HomeTeamCode,VisitTeamCode,StadiumType,FieldType,Weather,Temperature,OpponentKey,IsInjured,Score_Difference,PlayerActivity,ImpactType
str,cat,cat,i8,cat,cat,cat,cat,cat,cat,cat,cat,f32,cat,i8,i8,cat,cat
"""29793-256-2699""","""TE""","""PRW""",3,"""Punt""","""SEA""","""Seattle""","""SEA""","""CAR""","""Outdoor""","""Synthetic""","""Cloudy""",37.0,"""None""",0,23,"""No_Injury""","""No_Injury"""
"""32438-526-2647""","""CB""","""PLR""",3,"""Punt""","""TEN""","""Nashville""","""TEN""","""BLT""","""Outdoor""","""Natural""","""Cloudy""",75.0,"""None""",0,10,"""No_Injury""","""No_Injury"""
"""32456-41-3069""","""CB""","""PDL1""",3,"""Punt""","""GB""","""Santa Clara""","""SF""","""GB""","""Outdoor""","""Natural""","""Clear""",64.0,"""None""",0,-7,"""No_Injury""","""No_Injury"""
"""32637-635-3199""","""FS""","""PLL""",3,"""Punt""","""ARZ""","""Glendale""","""ARZ""","""NYG""","""Indoor""","""Natural""","""Clear""",71.0,"""None""",0,16,"""No_Injury""","""No_Injury"""
"""29119-289-339""","""TE""","""PRG""",1,"""Punt""","""WAS""","""Landover""","""WAS""","""CAR""","""Outdoor""","""Natural""","""Clear""",32.0,"""None""",0,-3,"""No_Injury""","""No_Injury"""


# Now for the Summary Calculations
  The process here is somewhat similar to the tracking process, but it aggregates the information from each tracking file and saves it as a temp file. These temp files are then concatenated for each of the summary and injury datasets. 

## Start with Injury Summary 

In [5]:
def summary_calculator(df):
    """
    Reduce a tracking-data chunk (a polars DataFrame) to one row per PlayKey.

    For each play this gives the total path length (sum of the per-row
    Displacement), the straight-line displacement between the first and last
    x/y positions, their difference, and the max and mean of angle difference,
    speed, impulse, torque and internal torque. The result is sorted by
    PlayKey so it can be joined to the qualitative table for machine learning.
    """
    import polars as pl # type: ignore

    play = "PlayKey"

    def _per_play(aggregate):
        # Broadcast an aggregate over all rows that share a PlayKey.
        return aggregate.over(play)

    columns = [
        play,
        pl.col("Position"),
        _per_play(pl.col("Displacement").sum()).alias("Distance"),
        _per_play(pl.col("x").first()).alias("start_x"),
        _per_play(pl.col("y").first()).alias("start_y"),
        _per_play(pl.col("x").last()).alias("end_x"),
        _per_play(pl.col("y").last()).alias("end_y"),
    ]

    # (source column, suffix used in the Max_/Mean_ output names)
    measured = [
        ("Angle_Diff", "Angle_Diff"),
        ("Speed", "Speed"),
        ("J_magnitude", "Impulse"),
        ("torque", "Torque"),
        ("torque_internal", "Int_Torque"),
    ]
    for source, suffix in measured:
        columns.append(_per_play(pl.col(source).max()).alias(f"Max_{suffix}"))
        columns.append(_per_play(pl.col(source).mean()).alias(f"Mean_{suffix}"))

    # Every row of a play now carries the same aggregates; keep one per play.
    one_row_per_play = df.select(columns).unique(subset=[play])

    # Straight-line distance between the first and last recorded positions.
    delta_x = pl.col("end_x") - pl.col("start_x")
    delta_y = pl.col("end_y") - pl.col("start_y")
    straight_line = ((delta_x**2 + delta_y**2)**0.5).alias("Displacement")

    # Path_Diff needs the Displacement column, so it is added in a second step.
    path_excess = (pl.col("Distance") - pl.col("Displacement")).alias("Path_Diff")
    with_paths = one_row_per_play.with_columns([straight_line]).with_columns([path_excess])

    output_columns = [
        'PlayKey',
        'Position',
        'Distance',
        'Displacement',
        'Path_Diff',
        'Max_Angle_Diff',
        'Mean_Angle_Diff',
        'Max_Speed',
        'Mean_Speed',
        'Max_Impulse',
        'Mean_Impulse',
        'Max_Torque',
        'Mean_Torque',
        'Max_Int_Torque',
        'Mean_Int_Torque',
    ]
    return with_paths.select(output_columns).sort("PlayKey")

In [6]:
def collect_summaries(group_dir):
    """
    Summarise every ``group_*`` parquet file in ``group_dir`` and stack the results.

    Each matching file is read with polars, passed through
    ``summary_calculator``, and the per-file summaries are concatenated into
    one DataFrame, which is returned. Nothing is written to disk here.
    """
    import polars as pl # type: ignore
    import os

    group_files = [name for name in os.listdir(group_dir) if name.startswith("group_")]

    per_file_summaries = [
        summary_calculator(pl.read_parquet(os.path.join(group_dir, name)))
        for name in group_files
    ]

    # Saving the combined frame is disabled in this notebook:
    # summary_df.write_parquet(os.path.join(group_dir, "summary_df.parquet"))
    summary_df = pl.concat(per_file_summaries)
    return summary_df

In [10]:
def injury_summary_maker(group_dir, injury_qual, output_dir, save=False):
    """
    Joins the qualitative and quantitative summary data.

    Parameters:
        group_dir: folder holding the ``group_*`` tracking parquet files that
            ``collect_summaries`` aggregates.
        injury_qual: cleaned qualitative injury DataFrame (must have PlayKey).
        output_dir: folder for ``Summary_Injuries.parquet``.
        save: when True, write the joined frame to ``output_dir``. Defaults to
            False, matching this notebook where the write was commented out.

    Returns the inner join of ``injury_qual`` and the quantitative summary on
    PlayKey.
    """
    import polars as pl # type: ignore
    pl.enable_string_cache()
    import os

    summary_path = os.path.join(output_dir, "Summary_Injuries.parquet")

    quant = collect_summaries(group_dir)
    # NOTE(review): both frames carry a Position column, so the joined result
    # also contains Position_right (visible in the recorded notebook output).
    summary_df = injury_qual.join(quant, on="PlayKey", how="inner")

    # Only claim the file was saved when it actually was.
    if save:
        summary_df.write_parquet(summary_path)
        print(f"Saved the full summary with qualitative and quantitative features at {summary_path}")
    else:
        print(
            "Built the full summary with qualitative and quantitative features; "
            f"nothing was written to {summary_path} (pass save=True to write it)"
        )
    return summary_df

In [11]:
from QualitativeCleaner import *
# Rebuild the qualitative injury table, then join it to the tracking summaries.
injury_qual = clean_injury_qual()
# Folder that collect_summaries scans for group_* parquet files.
group_dir = "F:/Data/Clean_Data/injury_output/"
# Folder used to build the summary parquet path.
output_dir = "F:/Data/Clean_Data/"

summary_df = injury_summary_maker(group_dir, injury_qual, output_dir)
summary_df.head()

Tables are holding hands. How cute.
Injury columns have been added.
Someone managed to clean up those stadiums!
Plays have been set!
Looks like the weather has been cleared up.
Memory usage of dataframe is 13.85 MB
Memory usage after optimization is: 10.92 MB
Decreased by 21.2%
Injuries have been cleaned and dressed.
Saved the full summary with qualitative and quantitative features at F:/Data/Clean_Data/Summary_Injuries.parquet


PlayKey,Position,StadiumType,FieldType,Temperature,Weather,PlayType,BodyPart,DM_M1,DM_M7,DM_M28,DM_M42,IsInjured,IsSevere,Position_right,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_Impulse,Mean_Impulse,Max_Torque,Mean_Torque,Max_Int_Torque,Mean_Int_Torque
str,cat,cat,cat,i16,cat,cat,cat,i8,i8,i8,i8,i8,i8,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""26624-1-19""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0,"""QB""",46.350941,30.559803,15.791138,179.899994,81.03978,5.573143,1.093183,262.500061,16.24078,1169.989746,0.062934,817.741455,-0.04599
"""26624-1-22""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0,"""QB""",27.687366,7.580832,20.106533,179.25,101.294319,4.517752,1.936179,72.198616,17.693897,1170.477295,0.091562,819.584412,-0.092134
"""26624-1-23""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0,"""QB""",28.736702,8.434956,20.301746,178.229996,78.788918,4.414752,1.037426,240.314255,15.576145,1187.834473,0.329282,811.438843,-0.190185
"""26624-1-4""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0,"""QB""",7.034525,3.085855,3.94867,137.759995,73.784317,3.465543,0.558296,173.270645,13.67563,838.609009,-0.096732,520.58783,-0.080455
"""26624-1-54""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0,"""QB""",27.86739,14.248103,13.619287,179.490005,105.027611,5.035871,0.774094,300.459412,17.843529,1172.492432,0.029335,815.875793,-0.024084


## Next, concussion Summary

This will require either writing each NGS file's expanded results to disk as it is processed, or appending them to a single dataframe along the way. The second option may be less than ideal, since the whole combined dataframe would be held in memory.

In [None]:
# Folder containing the raw NGS-*.csv tracking files and video_review.csv.
source_dir = "F:/Data/NFL-Punt-Analytics-Competition"
# Folder intended for the processed concussion tracking output.
concussion_output_dir = "F:/Data/Processing_data/concussion_output/"

In [None]:
def process_ngs_summary_files(source_dir, output_dir):
    """
    Transform every NGS file in ``source_dir`` and combine the results.

    Each file whose name starts with 'NGS-' is passed, together with the
    cleaned video-review table, to ``transform_concussion_tracking``. The
    per-file results are concatenated, written to
    ``TrackingConcussions.parquet`` in ``output_dir``, and returned.
    """
    import polars as pl # type: ignore
    import os
    import time

    started = time.time()

    # The review table is loaded once and shared by every file.
    review = clean_review()

    per_file_frames = []
    for entry in os.listdir(source_dir):
        if not entry.startswith('NGS-'):
            continue
        per_file_frames.append(
            transform_concussion_tracking(os.path.join(source_dir, entry), review)
        )

    combined_df = pl.concat(per_file_frames)

    # Persist the combined frame as parquet.
    output_path = os.path.join(output_dir, 'TrackingConcussions.parquet')
    combined_df.write_parquet(output_path)

    execution_time = time.time() - started
    print(f"Combined processed data saved to: {output_path}. Execution time: {execution_time} seconds.")
    return combined_df

In [32]:
def clean_review():
    """
    Load video_review.csv and key it for joining against tracking data.

    Adds PlayKey ("GSISID-GameKey-PlayID"), OpponentKey (the same pattern
    built from Primary_Partner_GSISID) and InjuryKey (a copy of PlayKey),
    then drops the season, id and flag columns that are no longer needed.
    """
    import polars as pl # type: ignore
    pl.enable_string_cache()
    from DataHandler import data_shrinker

    raw_review = pl.read_csv("F:/Data/NFL-Punt-Analytics-Competition/video_review.csv")
    shrunk_review, _schema = data_shrinker(raw_review)

    def _game_play_key(leading_expr):
        # "<leading>-<GameKey>-<PlayID>"
        return pl.concat_str([
            leading_expr,
            pl.lit('-'),
            pl.col('GameKey').cast(pl.Utf8),
            pl.lit('-'),
            pl.col('PlayID').cast(pl.Utf8),
        ])

    player_key = _game_play_key(pl.col('GSISID').cast(pl.Int32).cast(pl.Utf8)).alias('PlayKey')
    partner_key = _game_play_key(pl.col('Primary_Partner_GSISID').cast(pl.Utf8)).alias('OpponentKey')

    unneeded = [
        'Season_Year',
        'GameKey',
        'PlayID',
        'GSISID',
        'Turnover_Related',
        'Friendly_Fire',
    ]

    review = (
        shrunk_review
        .with_columns([player_key, partner_key])
        # InjuryKey copies PlayKey, so it has to be added after PlayKey exists.
        .with_columns([pl.col('PlayKey').alias('InjuryKey')])
        .drop(unneeded)
    )
    return review

In [33]:

def column_corrector(df):
    """
    Prepare a raw NGS tracking frame for the concussion pipeline.

    Builds PlayKey as "GSISID-GameKey-PlayID", sorts by PlayKey and Time,
    parses Time into a Datetime, derives a numeric time column through
    ``create_time_numeric`` and returns only PlayKey, time, x, y, o, dir and
    GSISID.

    Per the original note, the resulting ``time`` column is meant to act like
    the 'time' column of the injury dataset: each PlayKey starts at 0.0 and
    increases by 0.1 for each subsequent record. That behaviour lives in
    ``create_time_numeric``, which is defined elsewhere - confirm there.
    """
    import polars as pl # type: ignore

    # Keep only rows whose GSISID is at most 1,000,000 before the Int32 cast.
    # NOTE(review): this cutoff is far below the Int32 limit (2,147,483,647);
    # presumably it screens out malformed ids - confirm the intended bound.
    df = df.filter(pl.col('GSISID') <= 1000000)

    df = df.with_columns([
        pl.concat_str([
            pl.col('GSISID').cast(pl.Int32).cast(pl.Utf8)
            , pl.lit('-')
            , pl.col('GameKey').cast(pl.Utf8)
            , pl.lit('-')
            , pl.col('PlayID').cast(pl.Utf8)
        ]).alias('PlayKey')
    ])

    # Time is still a string at this point, so this is a lexicographic sort.
    df = df.sort(['PlayKey', 'Time'])

    # strict=False: values that do not match the format become null instead of raising.
    df = df.with_columns([
        pl.col('Time').str.strptime(
            pl.Datetime
            , format="%Y-%m-%d %H:%M:%S.%3f"
            , strict=False
        ).alias('Time')
    ])

    df = create_time_numeric(df)

    df = df.select([
        'PlayKey'
        , 'Time_numeric'
        , 'x'
        , 'y'
        , 'o'
        , 'dir'
        , 'GSISID'
        ]).rename({"Time_numeric":"time"})

    return df

In [36]:

def create_opponent_plays(df, review, method='tracking'):
    """
    Attach video-review details to tracking rows for both players in a collision.

    The tracking frame is joined to ``review`` twice: once where the tracking
    PlayKey matches the reviewed player's PlayKey, and once where it matches
    the review's OpponentKey (the partner). Both halves are reshaped to the
    same column layout and stacked. ``method='summary'`` uses left joins;
    any other value uses inner joins.
    """
    import polars as pl # type: ignore
    pl.enable_string_cache()

    join_type = 'left' if method == 'summary' else 'inner'

    # Shared column layout for both halves of the result.
    column_order = [
        'PlayKey',
        'time',
        'x',
        'y',
        'o',
        'dir',
        'GSISID',
        'PlayerActivity',
        'ImpactType',
        'OpponentKey',
        'InjuryKey',
    ]

    # Half 1: tracking rows of the reviewed player (df.PlayKey = review.PlayKey).
    injured_side = df.join(review, on='PlayKey', how=join_type)
    injured_side = injured_side.drop([
        'Primary_Partner_GSISID',
        'Primary_Partner_Activity_Derived',
    ])
    injured_side = injured_side.rename({
        'Player_Activity_Derived': 'PlayerActivity',
        'Primary_Impact_Type': 'ImpactType',
    })
    injured_side = injured_side.select(column_order)

    # Half 2: tracking rows of the partner (df.PlayKey = review.OpponentKey);
    # here the partner's activity becomes PlayerActivity.
    opponent_side = df.join(
        review,
        left_on='PlayKey',
        right_on='OpponentKey',
        how=join_type,
    )
    opponent_side = opponent_side.drop([
        'PlayKey_right',
        'Player_Activity_Derived',
        'Primary_Partner_GSISID',
    ])
    opponent_side = opponent_side.rename({
        'Primary_Partner_Activity_Derived': 'PlayerActivity',
        'Primary_Impact_Type': 'ImpactType',
    })
    # Partner rows get a null OpponentKey.
    blank_opponent = pl.lit(None).cast(pl.Utf8).alias('OpponentKey')
    opponent_side = opponent_side.with_columns([blank_opponent]).select(column_order)

    # NOTE(review): with method='summary' both joins are left joins, so every
    # row of df appears in both halves of this concat - confirm that the
    # downstream steps expect that.
    return pl.concat([injured_side, opponent_side])


In [34]:
def reduce_float_precision(df):
    """
    Downcast every Float64 column to Float32 and every Int64 column to Int32.

    Columns of any other dtype are left as they are.
    """
    import polars as pl # type: ignore

    narrower = {pl.Float64: pl.Float32, pl.Int64: pl.Int32}

    casts = []
    for name in df.columns:
        current = df[name].dtype
        for wide, narrow in narrower.items():
            if current == wide:
                casts.append(pl.col(name).cast(narrow))
                break

    if casts:
        df = df.with_columns(casts)
    return df

In [50]:
import polars as pl
import os
from TrackingCleaner import *

# Input: raw NGS tracking csv files; output: cleaned concussion summaries
source_dir = "F:/Data/NFL-Punt-Analytics-Competition/"
concussion_output_dir = "F:/Data/Clean_Data/concussion_output/"
# Create the output directory declared above (previously this referenced
# `output_dir`, which this cell never defines)
os.makedirs(concussion_output_dir, exist_ok=True)

file_path = os.path.join(source_dir, 'NGS-2016-pre.csv')

# Ragged lines are truncated and unparsable values are ignored rather than raising
df = pl.read_csv(file_path, truncate_ragged_lines=True, ignore_errors=True)
df = column_corrector(df)
# NOTE(review): the create_opponent_plays defined in this notebook takes
# (df, review, method); with that version 'summary' would bind to `review`.
# This call presumably resolves to the TrackingCleaner star-import - confirm its signature.
df = create_opponent_plays(df, 'summary')
# Remaining steps come from the TrackingCleaner star-import and run in order
df = (df
      .pipe(reduce_float_precision)
      .pipe(angle_corrector)
      .pipe(body_builder_conc)
      .pipe(velocity_calculator)
      .pipe(impulse_calculator)
      .pipe(summary_calculator)
      )



Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 36.9%


PlayKey,Position,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_Impulse,Mean_Impulse,Max_Torque,Mean_Torque,Max_Int_Torque,Mean_Int_Torque
str,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""19714-27-125""","""P""",229.247421,24.448645,204.798782,178.289993,90.046806,244.48645,2.0198,23814.419922,60.470604,1123.374268,-0.001725,785.014526,-0.002376
"""19714-27-3624""","""P""",67.835236,16.160028,51.675209,179.850006,93.000893,161.600281,1.364893,15993.665039,82.250549,1107.757812,0.02222,703.648743,0.001884
"""19714-7-3321""","""P""",161.807785,40.077179,121.730606,179.25,88.342865,400.77179,2.213513,39235.78125,127.547783,1265.90686,0.001318,584.751282,0.000387
"""19714-7-4212""","""P""",158.142441,36.465527,121.67691,178.75,92.96637,364.655273,3.012233,35610.613281,158.262161,1117.133911,0.025359,781.711182,0.000954
"""20712-31-188""","""OLB""",79.647102,25.681313,53.96579,137.809998,102.713303,256.81311,1.758214,28149.753906,144.098984,52.800941,0.013116,861.96283,0.016483
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""33179-20-3310""","""WR""",194.923996,43.338902,151.585098,179.990005,84.884308,433.389008,4.83682,39315.878906,224.697464,1056.621338,-0.003376,725.365051,0.011865
"""33181-28-3023""","""DB""",183.451355,32.081783,151.369568,179.5,108.731277,320.817841,4.463536,28039.476562,782.128967,981.087952,0.0,846.200684,0.0
"""33181-28-3267""","""DB""",232.804138,48.092003,184.712128,179.089996,78.961678,480.920044,2.249318,42032.410156,393.560699,996.613708,0.0,699.012573,0.0
"""33181-28-3984""","""DB""",148.488617,34.279835,114.208786,179.520004,121.596725,342.79834,1.51674,29960.574219,265.397064,999.869324,0.0,717.009338,0.0


In [51]:
# Row count of the summary frame built in the previous cell
len(df)

3644

In [1]:
from TrackingCleaner import *

In [2]:
# Presumably provided by the TrackingCleaner star-import. Per the log printed below,
# it writes one parquet per NGS source file into F:/Data/Clean_Data/concussion_output/.
process_concussion_summary()

Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 36.9%
Processed and saved: F:/Data/Clean_Data/concussion_output/NGS-2016-post.parquet
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 36.9%
Processed and saved: F:/Data/Clean_Data/concussion_output/NGS-2016-pre.parquet
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 36.9%
Processed and saved: F:/Data/Clean_Data/concussion_output/NGS-2016-reg-wk1-6.parquet
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 36.9%
Processed and saved: F:/Data/Clean_Data/concussion_output/NGS-2016-reg-wk13-17.parquet
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 36.9%
Processed and saved: F:/Data/Clean_Data/concussion_output/NGS-2016-reg-wk7-12.parquet
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
De

PlayKey,Position,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_Impulse,Mean_Impulse,Max_Torque,Mean_Torque,Max_Int_Torque,Mean_Int_Torque
str,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""19714-322-1031""","""P""",143.445801,37.63493,105.810867,179.380005,100.349785,376.349304,2.701427,36771.925781,159.612427,1766.650391,0.002577,847.255859,-0.031122
"""19714-322-1145""","""P""",2.827821,0.920219,1.907602,91.739998,88.577499,9.202193,1.885214,1033.901001,166.573563,1116.605957,0.252797,778.842468,-0.274756
"""19714-322-1709""","""P""",127.87104,29.753408,98.11763,179.960007,92.115196,297.534088,0.85304,29061.958984,57.748055,2205.736084,-0.001886,779.038147,0.000711
"""19714-322-1738""","""P""",82.852829,14.178928,68.673904,176.240005,86.465889,141.789276,1.325645,13948.632812,64.533623,1797.106323,0.001592,802.400208,-0.009369
"""19714-322-1859""","""P""",3.898672,0.770065,3.128607,178.960007,105.360947,7.700654,0.378512,842.293396,35.329845,1074.227905,0.113837,771.084045,0.040909
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""33976-572-2643""","""DT""",98.701004,31.217957,67.483047,134.339996,91.402481,312.179565,2.749332,44359.796875,279.681458,336.315308,-0.011853,82.950203,0.00203
"""33976-572-2730""","""DT""",62.771347,20.859648,41.911697,106.169998,89.953728,208.596481,6.214984,30162.939453,633.94458,66.405411,0.023473,20.00296,0.062567
"""33976-572-2915""","""DT""",38.972767,12.842343,26.130424,107.989998,87.736671,128.423431,4.695513,18711.773438,485.832642,51.282524,0.070467,44.872169,-0.034298
"""33976-572-392""","""DT""",49.191399,16.063278,33.12812,109.669998,90.63903,160.632782,4.775865,23083.826172,477.28476,42.208778,0.027881,67.497833,-0.0948


In [3]:
import polars as pl
# Spot-check the first rows of one of the saved concussion summary files
pl.read_parquet("F:/Data/Clean_Data/concussion_output/NGS-2016-post.parquet").head()

PlayKey,Position,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_Impulse,Mean_Impulse,Max_Torque,Mean_Torque,Max_Int_Torque,Mean_Int_Torque
str,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""19714-322-1031""","""P""",143.445801,37.63493,105.810867,179.380005,100.349785,376.349304,2.701427,36771.925781,159.612427,1766.650391,0.002577,847.255859,-0.031122
"""19714-322-1145""","""P""",2.827821,0.920219,1.907602,91.739998,88.577499,9.202193,1.885214,1033.901001,166.573563,1116.605957,0.252797,778.842468,-0.274756
"""19714-322-1709""","""P""",127.87104,29.753408,98.11763,179.960007,92.115196,297.534088,0.85304,29061.958984,57.748055,2205.736084,-0.001886,779.038147,0.000711
"""19714-322-1738""","""P""",82.852829,14.178928,68.673904,176.240005,86.465889,141.789276,1.325645,13948.632812,64.533623,1797.106323,0.001592,802.400208,-0.009369
"""19714-322-1859""","""P""",3.898672,0.770065,3.128607,178.960007,105.360947,7.700654,0.378512,842.293396,35.329845,1074.227905,0.113837,771.084045,0.040909


In [4]:
# NOTE(review): collect_concussion_summaries is not defined in the visible part of this
# notebook; presumably it comes from the TrackingCleaner star-import and gathers the
# per-file summaries in this directory into one frame - confirm.
review_df = collect_concussion_summaries("F:/Data/Clean_Data/concussion_output/")

In [5]:
# Preview the collected concussion summaries
review_df.head()

PlayKey,Position,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_Impulse,Mean_Impulse,Max_Torque,Mean_Torque,Max_Int_Torque,Mean_Int_Torque
str,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""19714-322-1031""","""P""",143.445801,37.63493,105.810867,179.380005,100.349785,376.349304,2.701427,36771.925781,159.612427,1766.650391,0.002577,847.255859,-0.031122
"""19714-322-1145""","""P""",2.827821,0.920219,1.907602,91.739998,88.577499,9.202193,1.885214,1033.901001,166.573563,1116.605957,0.252797,778.842468,-0.274756
"""19714-322-1709""","""P""",127.87104,29.753408,98.11763,179.960007,92.115196,297.534088,0.85304,29061.958984,57.748055,2205.736084,-0.001886,779.038147,0.000711
"""19714-322-1738""","""P""",82.852829,14.178928,68.673904,176.240005,86.465889,141.789276,1.325645,13948.632812,64.533623,1797.106323,0.001592,802.400208,-0.009369
"""19714-322-1859""","""P""",3.898672,0.770065,3.128607,178.960007,105.360947,7.700654,0.378512,842.293396,35.329845,1074.227905,0.113837,771.084045,0.040909


In [6]:
# Persist the collected concussion summaries as a single parquet file
review_df.write_parquet("F:/Data/Clean_Data/SummaryConcussions.parquet")

In [9]:
def collect_injury_summaries(group_dir):
    """
    Reads every file in `group_dir` whose name starts with 'group_', runs
    summary_calculator on each, and returns the results concatenated into one dataframe.

    Files are processed in sorted name order so the row order of the result is
    reproducible (os.listdir returns entries in arbitrary order).

    Raises ValueError when `group_dir` contains no 'group_' files.
    """
    import polars as pl # type: ignore
    import os

    group_files = sorted(
        name for name in os.listdir(group_dir) if name.startswith("group_")
    )
    if not group_files:
        # pl.concat raises on an empty list; fail with a message that names the directory
        raise ValueError(f"No files starting with 'group_' were found in '{group_dir}'.")

    # Summarise each group file independently, then stack the summaries
    summary_dfs = [
        summary_calculator(pl.read_parquet(os.path.join(group_dir, name)))
        for name in group_files
    ]

    return pl.concat(summary_dfs)

In [None]:
def injury_summary_maker(group_dir):
    """
    Joins the qualitative and quantitative summary data

    Quantitative summaries are built from the 'group_' files in `group_dir` via
    collect_injury_summaries, inner-joined to the qualitative parquet on 'PlayKey',
    written to disk, and returned.
    """
    import polars as pl # type: ignore
    pl.enable_string_cache()
    # Read
    # NOTE(review): clean_injury_qual names its (commented-out) output 'QualInjuries.parquet';
    # confirm this is the file that actually gets written.
    qual_path = "F:/Data/Processing_data/QualitativeInjuries.parquet"

    #Write
    qual_quant_path = "F:/Data/Processing_data/Full_Summary_Injuries.parquet"

    # collect_injury_summaries is the collector defined in this notebook
    # (the previous name, collect_summaries, is not defined here)
    quant = collect_injury_summaries(group_dir)
    quals = pl.read_parquet(qual_path)
    # Inner join: only plays present in both tables survive. Columns present in both
    # frames besides 'PlayKey' get the '_right' suffix on the quantitative side.
    qual_quant = quals.join(quant, on="PlayKey", how="inner")

    qual_quant.write_parquet(qual_quant_path)
    print(f"Saved the full summary with qualitative and quantitative features at {qual_quant_path}")

    # Return the joined frame so callers can keep working with it
    return qual_quant

In [7]:
# Inspect one of the 'group_' files that collect_injury_summaries reads from this directory
pl.read_parquet("F:/Data/Clean_Data/injury_output/group_4.parquet").head()

PlayKey,time,x,y,dir,o,Angle_Diff,Displacement,Speed,vx,vy,omega_dir,omega_o,omega_diff,Position,Height_m,Weight_kg,Chest_rad_m,px,py,moment,moment_upper,p_magnitude,L_dir,L_diff,Jx,Jy,J_magnitude,torque,torque_internal
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""26624-1-21""",0.0,85.889999,24.65,-120.0,159.779999,80.220001,,,,,,,,"""QB""",1.91,102.099998,0.191,,,1.862355,1.303649,,,,,,,,
"""26624-1-21""",0.1,85.849998,24.74,-118.480011,161.570007,79.949997,0.098489,0.984891,-0.400009,0.900002,0.265288,0.312414,0.047126,"""QB""",1.91,102.099998,0.191,-40.840935,91.890152,1.862355,1.303649,100.55735,0.494061,0.061436,,,,,
"""26624-1-21""",0.2,85.82,24.82,-117.179993,163.220001,79.599998,0.08544,0.854395,-0.299988,0.799999,0.226896,0.287979,0.061083,"""QB""",1.91,102.099998,0.191,-30.628754,81.679924,1.862355,1.303649,87.233772,0.422561,0.079631,10.212181,-10.210228,14.440825,-0.715005,0.181951
"""26624-1-21""",0.3,85.790001,24.9,-117.519989,164.799988,77.68,0.08544,0.854395,-0.299988,0.799999,-0.059342,0.27576,0.335102,"""QB""",1.91,102.099998,0.191,-30.628754,81.679924,1.862355,1.303649,87.233772,-0.110517,0.436855,0.0,0.0,0.0,-5.330771,3.572248
"""26624-1-21""",0.4,85.760002,24.969999,-117.369995,166.23999,76.389999,0.076157,0.76157,-0.299988,0.699997,0.026181,0.251329,0.225148,"""QB""",1.91,102.099998,0.191,-30.628754,71.469688,1.862355,1.303649,77.756264,0.048758,0.293514,0.0,-10.210236,10.210236,1.592744,-1.433412


In [11]:
# Directory holding the 'group_' parquet files
group_dir = "F:/Data/Clean_Data/injury_output/"


# Summarise every group file into one frame and preview it
summary_df = collect_injury_summaries(group_dir)
summary_df.head()

PlayKey,Position,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_Impulse,Mean_Impulse,Max_Torque,Mean_Torque,Max_Int_Torque,Mean_Int_Torque
str,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""26624-1-19""","""QB""",46.350941,30.559803,15.791138,179.899994,81.03978,5.573143,1.093183,262.500061,16.24078,1169.989746,0.062934,817.741455,-0.04599
"""26624-1-22""","""QB""",27.687366,7.580832,20.106533,179.25,101.294319,4.517752,1.936179,72.198616,17.693897,1170.477295,0.091562,819.584412,-0.092134
"""26624-1-23""","""QB""",28.736702,8.434956,20.301746,178.229996,78.788918,4.414752,1.037426,240.314255,15.576145,1187.834473,0.329282,811.438843,-0.190185
"""26624-1-4""","""QB""",7.034525,3.085855,3.94867,137.759995,73.784317,3.465543,0.558296,173.270645,13.67563,838.609009,-0.096732,520.58783,-0.080455
"""26624-1-54""","""QB""",27.86739,14.248103,13.619287,179.490005,105.027611,5.035871,0.774094,300.459412,17.843529,1172.492432,0.029335,815.875793,-0.024084


In [1]:
from QualitativeCleaner import *
# Build the cleaned qualitative injury table, then set the paths used by the next cell
injury_qual = clean_injury_qual()
group_dir = "F:/Data/Clean_Data/injury_output/"
output_dir = "F:/Data/Clean_Data/"

Tables are holding hands. How cute.
Injury columns have been added.
Someone managed to clean up those stadiums!
Plays have been set!
Looks like the weather has been cleared up.
Memory usage of dataframe is 13.85 MB
Memory usage after optimization is: 10.92 MB
Decreased by 21.2%
Injuries have been cleaned and dressed.


In [2]:
from SummaryCleaner import *

# NOTE(review): this three-argument call does not match the one-argument
# injury_summary_maker defined in this notebook; it presumably resolves to the
# SummaryCleaner star-import. The output below repeats the cleaning log, which suggests
# the cleaning runs again inside this call even though injury_qual is passed in - confirm.
summary_df = injury_summary_maker(group_dir, injury_qual, output_dir)

Tables are holding hands. How cute.
Injury columns have been added.
Someone managed to clean up those stadiums!
Plays have been set!
Looks like the weather has been cleared up.
Memory usage of dataframe is 13.85 MB
Memory usage after optimization is: 10.92 MB
Decreased by 21.2%
Injuries have been cleaned and dressed.
Saved the full summary with qualitative and quantitative features at F:/Data/Clean_Data/Summary_Injuries.parquet


In [4]:
import polars as pl 
# Load the saved qualitative + quantitative injury summary to verify its contents
pl.read_parquet("F:/Data/Clean_Data/Summary_Injuries.parquet")

PlayKey,Position,StadiumType,FieldType,Temperature,Weather,PlayType,BodyPart,DM_M1,DM_M7,DM_M28,DM_M42,IsInjured,IsSevere,Position_right,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_Impulse,Mean_Impulse,Max_Torque,Mean_Torque,Max_Int_Torque,Mean_Int_Torque
str,cat,cat,cat,i16,cat,cat,cat,i8,i8,i8,i8,i8,i8,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""26624-1-19""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0,"""QB""",46.350941,30.559803,15.791138,179.899994,81.03978,5.573143,1.093183,262.500061,16.24078,1169.989746,0.062934,817.741455,-0.04599
"""26624-1-22""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0,"""QB""",27.687366,7.580832,20.106533,179.25,101.294319,4.517752,1.936179,72.198616,17.693897,1170.477295,0.091562,819.584412,-0.092134
"""26624-1-23""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0,"""QB""",28.736702,8.434956,20.301746,178.229996,78.788918,4.414752,1.037426,240.314255,15.576145,1187.834473,0.329282,811.438843,-0.190185
"""26624-1-4""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Rush""","""No_Injury""",0,0,0,0,0,0,"""QB""",7.034525,3.085855,3.94867,137.759995,73.784317,3.465543,0.558296,173.270645,13.67563,838.609009,-0.096732,520.58783,-0.080455
"""26624-1-54""","""QB""","""Outdoor""","""Synthetic""",63,"""Clear""","""Pass""","""No_Injury""",0,0,0,0,0,0,"""QB""",27.86739,14.248103,13.619287,179.490005,105.027611,5.035871,0.774094,300.459412,17.843529,1172.492432,0.029335,815.875793,-0.024084
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""47888-9-18""","""DB""","""Outdoor""","""Synthetic""",53,"""Cloudy""","""Rush""","""No_Injury""",0,0,0,0,0,0,"""DB""",14.697285,6.58827,8.109015,176.779999,83.27549,2.817797,1.194902,53.164562,14.226443,986.93103,0.014596,616.351868,-0.157733
"""47888-9-19""","""DB""","""Outdoor""","""Synthetic""",53,"""Cloudy""","""Pass""","""No_Injury""",0,0,0,0,0,0,"""DB""",22.940378,12.785874,10.154504,179.339996,103.545921,6.989275,1.164486,105.604507,16.580637,1037.794067,4.955154,700.142273,3.47467
"""47888-9-36""","""DB""","""Outdoor""","""Synthetic""",53,"""Cloudy""","""Pass""","""No_Injury""",0,0,0,0,0,0,"""DB""",87.394646,52.987083,34.407562,179.220001,53.967442,9.305894,2.632368,47.075153,14.625926,1066.870361,-0.008742,689.644043,-0.0163
"""47888-9-45""","""DB""","""Outdoor""","""Synthetic""",53,"""Cloudy""","""Pass""","""No_Injury""",0,0,0,0,0,0,"""DB""",44.276279,21.361666,22.914614,178.809998,44.622726,7.815997,2.170408,49.441444,14.331692,1010.999146,-0.10417,702.810608,-0.099112
