# NGS ETL

In [1]:
import polars as pl
import sqlalchemy as db
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import psycopg2

from CleaningFunctions import *
from DataHandler import data_loader, data_shrinker

In [32]:
def calculate_angle_difference(angle1, angle2):
    import numpy as np
    """
    Calculate the smallest angle difference between two angles 
    using trigonometric functions, accounting for edge cases.
    """
    sin_diff = np.sin(np.radians(angle2 - angle1))
    cos_diff = np.cos(np.radians(angle2 - angle1))
    return np.degrees(np.arctan2(sin_diff, cos_diff))

def angle_corrector(df):
    import polars as pl
    """
    Make corrections to angles to reduce fringe errors at 360
    """
    df = df.with_columns([
        ((pl.col("dir") + 180) % 360 - 180).alias("dir")
        , ((pl.col("o") + 180) % 360 - 180).alias("o")
    ]).with_columns(
        (calculate_angle_difference(pl.col("dir"), pl.col("o"))).abs().round(2).alias("Angle_Diff")
        )
    
    return df



def path_calculator(df):
    import polars as pl
    # Calculate total distance and displacement for each PlayKey
    # Calculate total distance and displacement for each PlayKey
    result = df.select([
        "PlayKey"
        , pl.col("dist").sum().over("PlayKey").alias("Distance")
        , pl.col("x").first().over("PlayKey").alias("start_x")
        , pl.col("y").first().over("PlayKey").alias("start_y")
        , pl.col("x").last().over("PlayKey").alias("end_x")
        , pl.col("y").last().over("PlayKey").alias("end_y")
        , pl.col("Angle_Diff").max().over("PlayKey").alias("Max_Angle_Diff")
        , pl.col("Angle_Diff").mean().over("PlayKey").alias("Mean_Angle_Diff")
        , pl.col("Speed").max().over("PlayKey").alias("Max_Speed")
        , pl.col("Speed").mean().over("PlayKey").alias("Mean_Speed")
        , pl.col("omega_dir").max().over("PlayKey").alias("Max_omega_dir")
        , pl.col("omega_dir").mean().over("PlayKey").alias("Mean_omega_dir")
        , pl.col("omega_o").max().over("PlayKey").alias("Max_omega_o")
        , pl.col("omega_o").mean().over("PlayKey").alias("Mean_omega_o")
        , pl.col("d_omega").max().over("PlayKey").alias("Max_d_omega")
        , pl.col("d_omega").mean().over("PlayKey").alias("Mean_d_omega")
        ]).unique(subset=["PlayKey"])


    # Calculate the displacement
    result = result.with_columns([
        (((pl.col("end_x") - pl.col("start_x"))**2 + 
          (pl.col("end_y") - pl.col("start_y"))**2)**0.5)
        .alias("Displacement")
        ]).with_columns([
            (pl.col("Distance") - pl.col("Displacement")).alias("Path_Diff")
        ])

     
    # Select only the required columns
    result = result.select([
        'PlayKey'
        , 'Distance'
        , 'Displacement'
        , 'Path_Diff'
        , 'Max_Angle_Diff'
        , 'Mean_Angle_Diff'
        , 'Max_Speed'
        , 'Mean_Speed'
        , 'Max_omega_dir'
        , 'Mean_omega_dir'
        , 'Max_omega_o'
        , 'Mean_omega_o'
        , 'Max_d_omega'
        , 'Mean_d_omega'
    ]).sort("PlayKey")


    return result

def column_corrector(df):
    import polars as pl
    """
    Add a Play_Time column that acts like the 'time' column did in the injury dataset. 
    Each PlayKey will start at 0.0 and increase by 0.1 for each subsequent record.
    """
    df = df.with_columns([
        pl.concat_str([
            pl.col('gsisid').cast(pl.Int32).cast(pl.Utf8)
            , pl.lit('-')
            , pl.col('gamekey').cast(pl.Utf8)
            , pl.lit('-')
            , pl.col('playid').cast(pl.Utf8)
        ]).alias('PlayKey')
    ])
     
    
    df = df.select([
        'PlayKey'
        , 'time'
        , 'x'
        , 'y'
        , 'o'
        , 'dir'
        , 'gsisid'
        ]).rename({"time":"datetime"})

    df = df.sort(['PlayKey', 'datetime'])

    df = df.with_columns(
        (pl.arange(0, pl.len()) * 0.1).over("PlayKey").alias("time")
        ).with_columns([pl.col('gsisid').cast(pl.Int32)])  
    
    return df


def body_builder():
    body_data = pl.DataFrame({
        "position": ["QB", "RB", "FB", "WR", "TE", "T", "G", "C", "DE", "DT", "NT", "LB", "OLB", "MLB", "CB", "S", "K", "P", "SS", "ILB", "FS", "LS", "DB"]
        # , "Position_Name": ["Quarterback", "Running Back", "Fullback", "Wide Receiver", "Tight End", "Tackle", "Guard", "Center", "Defensive End", "Defensive Tackle", "Nose Tackle", "Linebacker", "Outside Linebacker", "Middle Linebacker", "Cornerback", "Safety", "Kicker", "Punter", "Strong Safety", "Inside Linebacker", "Free Safety", "Long Snapper", "Defensive Back"]
        , "Height_m": [1.91, 1.79, 1.85, 1.88, 1.96, 1.97, 1.90, 1.87, 1.97, 1.92, 1.88, 1.90, 1.90, 1.87, 1.82, 1.84, 1.83, 1.88, 1.84, 1.90, 1.84, 1.88, 1.82]
        , "Weight_kg": [102.1, 95.3, 111.1, 90.7, 114.6, 140.6, 141.8, 136.1, 120.2, 141.8, 152.0, 110.0, 108.9, 113.4, 87.4, 95.9, 92.08, 97.52, 95.9, 110.0, 95.9, 108.86, 87.4]
        , "Chest_rad_m": [0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191, 0.191]
        })

    position = data_loader(dataset='positions', database='nfl_concussion')
    position = position.join(
        body_data
        , left_on='position'
        , right_on='position'
        , how='left'
        )
    
    return position


def weight_gainer(df):
    position = body_builder()
    
    return df.join(
        position
        , on='gsisid'
        , how='left'
        ).drop_nulls(subset=['position'])
    
    
    # .drop(['gsisid'])

In [33]:
track = data_loader(dataset='ngs_data', database='nfl_concussion')
track = data_shrinker(track)
track = column_corrector(track)
track = angle_corrector(track)
track = weight_gainer(track)
track.head()

Memory usage of dataframe is 0.42 MB
Memory usage after optimization is: 0.38 MB
Decreased by 8.7%


PlayKey,datetime,x,y,o,dir,gsisid,time,Angle_Diff,position,Height_m,Weight_kg,Chest_rad_m
str,datetime[ns],f32,f32,f32,f32,i32,f64,f32,str,f64,f64,f64
"""25506-555-1332""",2017-11-19 22:17:17.700,77.699997,-0.44,-91.769989,0.809998,25506,0.0,92.580002,"""G""",1.9,141.8,0.191
"""25506-555-1332""",2017-11-19 22:17:17.800,77.699997,-0.29,-91.470001,0.270004,25506,0.1,91.739998,"""G""",1.9,141.8,0.191
"""25506-555-1332""",2017-11-19 22:17:17.900,77.699997,-0.14,-91.109985,-0.27002,25506,0.2,90.839996,"""G""",1.9,141.8,0.191
"""25506-555-1332""",2017-11-19 22:17:18,77.690002,-0.0,-90.75,-0.719971,25506,0.3,90.029999,"""G""",1.9,141.8,0.191
"""25506-555-1332""",2017-11-19 22:17:18.100,77.699997,0.14,-90.350006,-1.200012,25506,0.4,89.150002,"""G""",1.9,141.8,0.191


In [35]:
def velocity_calculator(df):
    import numpy as np
    import polars as pl
    """
    Using the (X,Y) and time columns, perform calculations based on the 
    difference between two rows to find displacement, speed, direction 
    of motion, velocity in x and y components, and the angular velocities 
    of the direction of motion and orientations 
    """
    
    return df.with_columns([
        # Convert 'o' and 'dir' to radians
        (pl.col("o") * np.pi / 180).alias("o_rad"),
        (pl.col("dir") * np.pi / 180).alias("dir_rad")
    ]).with_columns([
        # Pre-calculate shifted values
        pl.col("x").shift(1).over("PlayKey").alias("prev_x")
        , pl.col("y").shift(1).over("PlayKey").alias("prev_y")
        # , pl.col("time").shift(1).over("PlayKey").alias("prev_time")
        , pl.col("dir_rad").shift(1).over("PlayKey").alias("prev_dir")
        , pl.col("o_rad").shift(1).over("PlayKey").alias("prev_o")
    ]).with_columns([
        # Calculate the component displacements 
          (pl.col("x") - pl.col("prev_x")).alias("dx")
        , (pl.col("y") - pl.col("prev_y")).alias("dy")
    ]).with_columns([
        # Calculate displacement
        ((pl.col("dx")**2 + pl.col("dy")**2)**0.5).alias("Displacement")
    ]).with_columns([
        # Calculate speed
        (pl.col("Displacement") / 0.1).alias("Speed")
        # Calculate direction
        , (np.degrees(np.arctan2(pl.col("dx"), pl.col("dy")))).alias("Direction")
        # Calculate velocity components
        , (pl.col("dx") / 0.1).alias("vx")
        , (pl.col("dy") / 0.1).alias("vy")
        # Calculate angular velocities
        , ((pl.col("dir_rad") - pl.col("prev_dir")) / 0.1).alias("omega_dir")
        , ((pl.col("o_rad") - pl.col("prev_o")) / 0.1).alias("omega_o")
    ]).with_columns([
        ((pl.col("omega_dir") - pl.col("omega_o")).abs()).alias("omega_diff")
    ]).drop([
        "prev_x", "prev_y", "prev_dir", "prev_o", "dx", "dy", "o_rad", "dir_rad"
    ])

    # return df

In [36]:
track = velocity_calculator(track)
track.head()

PlayKey,datetime,x,y,o,dir,gsisid,time,Angle_Diff,position,Height_m,Weight_kg,Chest_rad_m,Displacement,Speed,Direction,vx,vy,omega_dir,omega_o,omega_diff
str,datetime[ns],f32,f32,f32,f32,i32,f64,f32,str,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32
"""25506-555-1332""",2017-11-19 22:17:17.700,77.699997,-0.44,-91.769989,0.809998,25506,0.0,92.580002,"""G""",1.9,141.8,0.191,,,,,,,,
"""25506-555-1332""",2017-11-19 22:17:17.800,77.699997,-0.29,-91.470001,0.270004,25506,0.1,91.739998,"""G""",1.9,141.8,0.191,0.15,1.5,0.0,0.0,1.5,-0.094247,0.052358,0.146605
"""25506-555-1332""",2017-11-19 22:17:17.900,77.699997,-0.14,-91.109985,-0.27002,25506,0.2,90.839996,"""G""",1.9,141.8,0.191,0.15,1.5,0.0,0.0,1.5,-0.094252,0.062834,0.157086
"""25506-555-1332""",2017-11-19 22:17:18,77.690002,-0.0,-90.75,-0.719971,25506,0.3,90.029999,"""G""",1.9,141.8,0.191,0.140356,1.403563,-4.08338,-0.099945,1.4,-0.078531,0.062829,0.141361
"""25506-555-1332""",2017-11-19 22:17:18.100,77.699997,0.14,-90.350006,-1.200012,25506,0.4,89.150002,"""G""",1.9,141.8,0.191,0.140356,1.403563,4.08338,0.099945,1.4,-0.083783,0.069813,0.153596


## Include Momentum and Impulse 
It's not necessary to actually calculate the force, because we can assess these measurements for Impulse, J, as the change in either linear or angular momentum. 
I will need to include the weights of the players to get the momentum of each. 


Measures I would like to consider: Average chest circumference for NFL players seems to be between 46 and 49 inches - so I'll go with 1.2 M (120 cm)


In [59]:

def impulse_calculator(df):
    import numpy as np
    import polars as pl
    """
    Using the (X,Y) and time columns, perform calculations based on the velocities and changes 
    in velocites along with player mass to get the momentum and impulse, a measure that can 
    be assessed along with medical data related to concussions and injuries
    """
    
    return df.with_columns([
        # Calculate the linear momentum for each instant
        (pl.col('vx') * pl.col('Weight_kg')).alias('px')
        , (pl.col('vy') * pl.col('Weight_kg')).alias('py')

        # Calculate the moment of inertia of a rotating upright body (1/12 mr^2)
        , (1/12 * pl.col('Weight_kg') * (pl.col('Chest_rad_m')**2)).alias('moment')
        
        # Calculate the moment of inertia of the upper body turning upright with respect to waist (70% mass)
        , (1/12 * (pl.col('Weight_kg')*0.7) * (pl.col('Chest_rad_m')**2)).alias('moment_upper')
    
    ]).with_columns([
          # Calculate the magnitude of linear momentum
        ((pl.col("px")**2 + pl.col("py")**2)**0.5).alias("p_magnitude")
        
        # Calculate the angular momentum for the direction
        , (pl.col('omega_dir')*pl.col('moment')).alias('L_dir')

        # Calculate the angular momentum of the upper body with respect to lower
        , (pl.col('omega_diff')*pl.col('moment_upper')).alias('L_diff')


    ]).with_columns([
        # Pre-calculate shifted values for linear and angular momenta
        pl.col("px").shift(1).over("PlayKey").alias("prev_px")
        , pl.col("py").shift(1).over("PlayKey").alias("prev_py")
        , pl.col("L_dir").shift(1).over("PlayKey").alias("prev_L_dir")
        , pl.col("L_diff").shift(1).over("PlayKey").alias("prev_L_diff")
        
    ]).with_columns([
        # Calculate impulse, J, which is the change in linear momentum 
        ((pl.col("px") - pl.col("prev_px"))).alias("Jx")
        , ((pl.col("py") - pl.col("prev_py"))).alias("Jy")
        
    ]).with_columns([
          # Calculate the magnitude of linear momentum
        ((pl.col("Jx")**2 + pl.col("Jy")**2)**0.5).alias("J_magnitude")

        # Calculate torque as the change in angular momentum L over the change in time
        , (((pl.col("L_dir") - pl.col("prev_L_dir"))) / 0.1).alias("torque")
        , (((pl.col("L_diff") - pl.col("prev_L_diff"))) / 0.1).alias("torque_internal")

    ]).drop([
        "prev_L_dir", "prev_px", "prev_py", "prev_L_diff"
    ])

    # return df

In [60]:
tracks = impulse_calculator(track)
tracks.head()

PlayKey,datetime,x,y,o,dir,gsisid,time,Angle_Diff,position,Height_m,Weight_kg,Chest_rad_m,Displacement,Speed,Direction,vx,vy,omega_dir,omega_o,omega_diff,px,py,moment,moment_upper,p_magnitude,L_dir,L_diff,Jx,Jy,J_magnitude,torque,torque_internal
str,datetime[ns],f32,f32,f32,f32,i32,f64,f32,str,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""25506-555-1332""",2017-11-19 22:17:17.700,77.699997,-0.44,-91.769989,0.809998,25506,0.0,92.580002,"""G""",1.9,141.8,0.191,,,,,,,,,,,0.431084,0.301759,,,,,,,,
"""25506-555-1332""",2017-11-19 22:17:17.800,77.699997,-0.29,-91.470001,0.270004,25506,0.1,91.739998,"""G""",1.9,141.8,0.191,0.15,1.5,0.0,0.0,1.5,-0.094247,0.052358,0.146605,0.0,212.7,0.431084,0.301759,212.7,-0.040628,0.044239,,,,,
"""25506-555-1332""",2017-11-19 22:17:17.900,77.699997,-0.14,-91.109985,-0.27002,25506,0.2,90.839996,"""G""",1.9,141.8,0.191,0.15,1.5,0.0,0.0,1.5,-0.094252,0.062834,0.157086,0.0,212.699983,0.431084,0.301759,212.699983,-0.04063,0.047402,0.0,-1.7e-05,1.7e-05,-2.3e-05,0.031629
"""25506-555-1332""",2017-11-19 22:17:18,77.690002,-0.0,-90.75,-0.719971,25506,0.3,90.029999,"""G""",1.9,141.8,0.191,0.140356,1.403563,-4.08338,-0.099945,1.4,-0.078531,0.062829,0.141361,-14.172211,198.519997,0.431084,0.301759,199.025226,-0.033854,0.042657,-14.172211,-14.179986,20.048032,0.067769,-0.047453
"""25506-555-1332""",2017-11-19 22:17:18.100,77.699997,0.14,-90.350006,-1.200012,25506,0.4,89.150002,"""G""",1.9,141.8,0.191,0.140356,1.403563,4.08338,0.099945,1.4,-0.083783,0.069813,0.153596,14.172211,198.519997,0.431084,0.301759,199.025226,-0.036118,0.046349,28.344421,0.0,28.344421,-0.022639,0.03692


In [61]:
def path_calculator(df):
    import polars as pl
    # This provides a summary table that can be integrated with the qualitative data

    # Calculate total distance and displacement for each PlayKey
    # Calculate total distance and displacement for each PlayKey
    result = df.select([
        "PlayKey"
        , pl.col("Displacement").sum().over("PlayKey").alias("Distance")
        , pl.col("x").first().over("PlayKey").alias("start_x")
        , pl.col("y").first().over("PlayKey").alias("start_y")
        , pl.col("x").last().over("PlayKey").alias("end_x")
        , pl.col("y").last().over("PlayKey").alias("end_y")
        , pl.col("Angle_Diff").max().over("PlayKey").alias("Max_Angle_Diff")
        , pl.col("Angle_Diff").mean().over("PlayKey").alias("Mean_Angle_Diff")
        , pl.col("Speed").max().over("PlayKey").alias("Max_Speed")
        , pl.col("Speed").mean().over("PlayKey").alias("Mean_Speed")
        , pl.col("J_magnitude").max().over("PlayKey").alias("Max_Impulse")
        , pl.col("J_magnitude").mean().over("PlayKey").alias("Mean_Impulse")
        , pl.col("torque").max().over("PlayKey").alias("Max_Torque")
        , pl.col("torque").mean().over("PlayKey").alias("Mean_Torque")
        , pl.col("torque_internal").max().over("PlayKey").alias("Max_Int_Torque")
        , pl.col("torque_internal").mean().over("PlayKey").alias("Mean_Int_Torque")

        ]).unique(subset=["PlayKey"])


    # Calculate the displacement
    result = result.with_columns([
        (((pl.col("end_x") - pl.col("start_x"))**2 + 
          (pl.col("end_y") - pl.col("start_y"))**2)**0.5)
        .alias("Displacement")
        ]).with_columns([
            (pl.col("Distance") - pl.col("Displacement")).alias("Path_Diff")
        ])

     
    # Select only the required columns
    result = result.select([
        'PlayKey'
        , 'Distance'
        , 'Displacement'
        , 'Path_Diff'
        , 'Max_Angle_Diff'
        , 'Mean_Angle_Diff'
        , 'Max_Speed'
        , 'Mean_Speed'
        , 'Max_Impulse'
        , 'Mean_Impulse'
        , 'Max_Torque'
        , 'Mean_Torque'
        , 'Max_Int_Torque'
        , 'Mean_Int_Torque'
      
    ]).sort("PlayKey")


    return result

In [62]:
summary = path_calculator(tracks)
summary.head()

PlayKey,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_Impulse,Mean_Impulse,Max_Torque,Mean_Torque,Max_Int_Torque,Mean_Int_Torque
str,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64
"""25506-555-1332""",34.27224,28.210428,6.061811,179.190002,77.081902,85.678299,1.73092,12141.829548,317.333409,291.367928,0.018561,180.267993,0.008555
"""25630-555-1332""",112.338295,43.121223,69.217072,167.830002,77.769073,469.932068,3.237413,51244.142605,436.97721,197.702775,-0.005592,145.107755,-0.007841
"""26160-555-1332""",0.481156,0.481042,0.000114,95.489998,93.569992,1.697076,1.603853,12.361504,10.551686,0.26894,0.19709,0.1558,0.136341
"""27044-555-1332""",26.349907,20.820673,5.529234,177.929993,79.466606,60.824429,1.50571,7121.087324,157.403092,228.125634,-0.000477,159.482572,0.000205
"""27103-555-1332""",93.405952,58.289104,35.116848,177.490005,76.825272,370.257599,2.804982,36107.521839,327.801371,215.869078,-0.01264,124.471799,0.005324
