# NGS ETL

In [111]:
import polars as pl
import sqlalchemy as db
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import psycopg2

from CleaningFunctions import *
from DataHandler import data_loader, data_shrinker

In [112]:
def calculate_angle_difference(angle1, angle2):
    import numpy as np
    """
    Calculate the smallest angle difference between two angles 
    using trigonometric functions, accounting for edge cases.
    """
    sin_diff = np.sin(np.radians(angle2 - angle1))
    cos_diff = np.cos(np.radians(angle2 - angle1))
    return np.degrees(np.arctan2(sin_diff, cos_diff))

def angle_corrector(df):
    import polars as pl
    """
    Make corrections to angles to reduce fringe errors at 360
    """
    df = df.with_columns([
        ((pl.col("dir") + 180) % 360 - 180).alias("dir")
        , ((pl.col("o") + 180) % 360 - 180).alias("o")
    ]).with_columns(
        (calculate_angle_difference(pl.col("dir"), pl.col("o"))).abs().round(2).alias("Angle_Diff")
        )
    
    return df



def path_calculator(df):
    import polars as pl
    # Calculate total distance and displacement for each PlayKey
    # Calculate total distance and displacement for each PlayKey
    result = df.select([
        "PlayKey"
        , pl.col("dist").sum().over("PlayKey").alias("Distance")
        , pl.col("x").first().over("PlayKey").alias("start_x")
        , pl.col("y").first().over("PlayKey").alias("start_y")
        , pl.col("x").last().over("PlayKey").alias("end_x")
        , pl.col("y").last().over("PlayKey").alias("end_y")
        , pl.col("Angle_Diff").max().over("PlayKey").alias("Max_Angle_Diff")
        , pl.col("Angle_Diff").mean().over("PlayKey").alias("Mean_Angle_Diff")
        , pl.col("Speed").max().over("PlayKey").alias("Max_Speed")
        , pl.col("Speed").mean().over("PlayKey").alias("Mean_Speed")
        , pl.col("omega_dir").max().over("PlayKey").alias("Max_omega_dir")
        , pl.col("omega_dir").mean().over("PlayKey").alias("Mean_omega_dir")
        , pl.col("omega_o").max().over("PlayKey").alias("Max_omega_o")
        , pl.col("omega_o").mean().over("PlayKey").alias("Mean_omega_o")
        , pl.col("d_omega").max().over("PlayKey").alias("Max_d_omega")
        , pl.col("d_omega").mean().over("PlayKey").alias("Mean_d_omega")
        ]).unique(subset=["PlayKey"])


    # Calculate the displacement
    result = result.with_columns([
        (((pl.col("end_x") - pl.col("start_x"))**2 + 
          (pl.col("end_y") - pl.col("start_y"))**2)**0.5)
        .alias("Displacement")
        ]).with_columns([
            (pl.col("Distance") - pl.col("Displacement")).alias("Path_Diff")
        ])

     
    # Select only the required columns
    result = result.select([
        'PlayKey'
        , 'Distance'
        , 'Displacement'
        , 'Path_Diff'
        , 'Max_Angle_Diff'
        , 'Mean_Angle_Diff'
        , 'Max_Speed'
        , 'Mean_Speed'
        , 'Max_omega_dir'
        , 'Mean_omega_dir'
        , 'Max_omega_o'
        , 'Mean_omega_o'
        , 'Max_d_omega'
        , 'Mean_d_omega'
    ]).sort("PlayKey")


    return result

def column_corrector(df):
    import polars as pl
    """
    Add a Play_Time column that acts like the 'time' column did in the injury dataset. 
    Each PlayKey will start at 0.0 and increase by 0.1 for each subsequent record.
    """
    df = df.with_columns([
        pl.concat_str([
            pl.col('gsisid').cast(pl.Int32).cast(pl.Utf8)
            , pl.lit('-')
            , pl.col('gamekey').cast(pl.Utf8)
            , pl.lit('-')
            , pl.col('playid').cast(pl.Utf8)
        ]).alias('PlayKey')
    ])
     
    
    df = df.select([
        'PlayKey'
        , 'time'
        , 'x'
        , 'y'
        , 'o'
        , 'dir'
        ]).rename({"time":"datetime"})

    df = df.sort(['PlayKey', 'datetime'])

    df = df.with_columns(
        (pl.arange(0, pl.len()) * 0.1).over("PlayKey").alias("time")
        )  
    
    return df

In [None]:
# import numpy as np
# import polars as pl

# def velocity_calculator(df):
#     """
#     Calculate velocity-related metrics using position data recorded every 0.1 seconds.
#     """
#     df.with_columns([
#         # Convert 'o' and 'dir' to radians
#         (pl.col("o") * np.pi / 180).alias("o_rad"),
#         (pl.col("dir") * np.pi / 180).alias("dir_rad")
#     ])
    
#     df.with_columns([
#         # Pre-calculate shifted values
#         pl.col("x").shift(1).over("PlayKey").alias("prev_x"),
#         pl.col("y").shift(1).over("PlayKey").alias("prev_y"),
#         pl.col("dir_rad").shift(1).over("PlayKey").alias("prev_dir_rad"),
#         pl.col("o_rad").shift(1).over("PlayKey").alias("prev_o_rad"),

#         # Calculate x and y differences
#         (pl.col("x") - pl.col("x").shift(1).over("PlayKey")).alias("dx"),
#         (pl.col("y") - pl.col("y").shift(1).over("PlayKey")).alias("dy")
#     ]).with_columns([
#         # Calculate displacement
#         ((pl.col("dx")**2 + pl.col("dy")**2)**0.5).alias("dist"),

#         # Calculate speed
#         (pl.col("dist") / 0.1).alias("Speed"),

#         # Calculate direction
#         (pl.atan2(pl.col("dy"), pl.col("dx")) * 180 / np.pi).alias("Direction"),

#         # Calculate velocity components
#         (pl.col("dx") / 0.1).alias("vx"),
#         (pl.col("dy") / 0.1).alias("vy"),

#         # Calculate angular velocities
#         ((pl.col("dir_rad") - pl.col("prev_dir_rad")) / 0.1).alias("omega_dir"),
#         ((pl.col("o_rad") - pl.col("prev_o_rad")) / 0.1).alias("omega_o")
#     ]).with_columns([
#         (pl.col("omega_dir") - pl.col("omega_o")).abs().alias("omega_diff")
#     ]).drop([
#         "prev_x", "prev_y", "prev_dir_rad", "prev_o_rad", "dx", "dy", "o_rad", "dir_rad"
#     ])

#     return df


In [124]:
track = data_loader(dataset='ngs_data', database='nfl_concussion')
track = data_shrinker(track)
track = column_corrector(track)
track = angle_corrector(track)
len(track)

Memory usage of dataframe is 0.42 MB
Memory usage after optimization is: 0.38 MB
Decreased by 9.2%


10000

In [125]:
def velocity_calculator(df):
    import numpy as np
    import polars as pl
    """
    Using the (X,Y) and time columns, perform calculations based on the 
    difference between two rows to find displacement, speed, direction 
    of motion, velocity in x and y components, and the angular velocities 
    of the direction of motion and orientations 
    """
    
    return df.with_columns([
        # Convert 'o' and 'dir' to radians
        (pl.col("o") * np.pi / 180).alias("o_rad"),
        (pl.col("dir") * np.pi / 180).alias("dir_rad")
    ]).with_columns([
        # Pre-calculate shifted values
        pl.col("x").shift(1).over("PlayKey").alias("prev_x")
        , pl.col("y").shift(1).over("PlayKey").alias("prev_y")
        # , pl.col("time").shift(1).over("PlayKey").alias("prev_time")
        , pl.col("dir_rad").shift(1).over("PlayKey").alias("prev_dir")
        , pl.col("o_rad").shift(1).over("PlayKey").alias("prev_o")
    ]).with_columns([
        # Calculate the component displacements 
          (pl.col("x") - pl.col("prev_x")).alias("dx")
        , (pl.col("y") - pl.col("prev_y")).alias("dy")
    ]).with_columns([
        # Calculate displacement
        ((pl.col("dx")**2 + pl.col("dy")**2)**0.5).alias("Displacement")
    ]).with_columns([
        # Calculate speed
        (pl.col("Displacement") / 0.1).alias("Speed")
        # Calculate direction
        , (np.degrees(np.arctan2(pl.col("dx"), pl.col("dy")))).alias("Direction")
        # Calculate velocity components
        , (pl.col("dx") / 0.1).alias("vx")
        , (pl.col("dy") / 0.1).alias("vy")
        # Calculate angular velocities
        , ((pl.col("dir_rad") - pl.col("prev_dir")) / 0.1).alias("omega_dir")
        , ((pl.col("o_rad") - pl.col("prev_o")) / 0.1).alias("omega_o")
    ]).with_columns([
        ((pl.col("omega_dir") - pl.col("omega_o")).abs()).alias("omega_diff")
    ]).drop([
        "prev_x", "prev_y", "prev_dir", "prev_o", "dx", "dy", "o_rad", "dir_rad"
    ])

    # return df

In [126]:
track = velocity_calculator(track)
track.head()

PlayKey,datetime,x,y,o,dir,time,Angle_Diff,Displacement,Speed,Direction,vx,vy,omega_dir,omega_o,omega_diff
str,datetime[ns],f32,f32,f32,f32,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""22127-516-3021""",2017-10-29 22:50:52.900,67.959999,53.169998,58.290009,156.210022,0.0,97.919998,,,,,,,,
"""22127-516-3021""",2017-10-29 22:50:53,68.010002,53.169998,49.940002,156.25,0.1,106.309998,0.050003,0.500031,90.0,0.500031,0.0,0.006976,-1.457352,1.464328
"""22127-516-3021""",2017-10-29 22:50:53.100,68.07,53.169998,41.570007,156.190002,0.2,114.620003,0.059998,0.599976,90.0,0.599976,0.0,-0.010471,-1.46084,1.450368
"""22127-516-3021""",2017-10-29 22:50:53.200,68.129997,53.169998,34.209991,156.109985,0.3,121.900002,0.059998,0.599976,90.0,0.599976,0.0,-0.013964,-1.284565,1.270601
"""22127-516-3021""",2017-10-29 22:50:53.300,68.239998,53.169998,27.990005,156.190002,0.4,128.199997,0.110001,1.100006,90.0,1.100006,0.0,0.013964,-1.085593,1.099557


In [127]:

def acceleration_calculator(df):
    import numpy as np
    import polars as pl
    """
    Using the (X,Y) and time columns, perform calculations based on the 
    difference between two rows to find displacement, speed, direction 
    of motion, velocity in x and y components, and the angular velocities 
    of the direction of motion and orientations 
    """
    
    return df.with_columns([
        # Pre-calculate shifted values for linear and angular velocities
        pl.col("vx").shift(1).over("PlayKey").alias("prev_vx")
        , pl.col("vy").shift(1).over("PlayKey").alias("prev_vy")
        , pl.col("omega_dir").shift(1).over("PlayKey").alias("prev_omega_dir")
        , pl.col("omega_o").shift(1).over("PlayKey").alias("prev_omega_o")
    
    ]).with_columns([
        # Calculate ax and ay from velocity differences over time
        ((pl.col("vx") - pl.col("prev_vx")) / 0.1).alias("ax")
        , ((pl.col("vy") - pl.col("prev_vy")) / 0.1).alias("ay")
        # Calculate angular accelerations
        , ((pl.col("omega_dir") - pl.col("prev_omega_dir")) / 0.1).alias("alpha_dir")
        , ((pl.col("omega_o") - pl.col("prev_omega_o")) / 0.1).alias("alpha_o")
    
    ]).with_columns([
        # Calculate the magnitude of linear acceleration
        ((pl.col("ax")**2 + pl.col("ay")**2)**0.5).alias("a_magnitude")
    ]).drop([
        "prev_omega_dir", "prev_omega_o", "prev_vx", "prev_vy"
    ])

    # return df

In [128]:
tracks = acceleration_calculator(track)
tracks.head()

PlayKey,datetime,x,y,o,dir,time,Angle_Diff,Displacement,Speed,Direction,vx,vy,omega_dir,omega_o,omega_diff,ax,ay,alpha_dir,alpha_o,a_magnitude
str,datetime[ns],f32,f32,f32,f32,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""22127-516-3021""",2017-10-29 22:50:52.900,67.959999,53.169998,58.290009,156.210022,0.0,97.919998,,,,,,,,,,,,,
"""22127-516-3021""",2017-10-29 22:50:53,68.010002,53.169998,49.940002,156.25,0.1,106.309998,0.050003,0.500031,90.0,0.500031,0.0,0.006976,-1.457352,1.464328,,,,,
"""22127-516-3021""",2017-10-29 22:50:53.100,68.07,53.169998,41.570007,156.190002,0.2,114.620003,0.059998,0.599976,90.0,0.599976,0.0,-0.010471,-1.46084,1.450368,0.999451,0.0,-0.174475,-0.034875,0.999451
"""22127-516-3021""",2017-10-29 22:50:53.200,68.129997,53.169998,34.209991,156.109985,0.3,121.900002,0.059998,0.599976,90.0,0.599976,0.0,-0.013964,-1.284565,1.270601,0.0,0.0,-0.034928,1.762742,0.0
"""22127-516-3021""",2017-10-29 22:50:53.300,68.239998,53.169998,27.990005,156.190002,0.4,128.199997,0.110001,1.100006,90.0,1.100006,0.0,0.013964,-1.085593,1.099557,5.000305,0.0,0.279284,1.989728,5.000305


In [133]:
def path_calculator(df):
    import polars as pl
    # This provides a summary table that can be integrated with the qualitative data

    # Calculate total distance and displacement for each PlayKey
    # Calculate total distance and displacement for each PlayKey
    result = df.select([
        "PlayKey"
        , pl.col("Displacement").sum().over("PlayKey").alias("Distance")
        , pl.col("x").first().over("PlayKey").alias("start_x")
        , pl.col("y").first().over("PlayKey").alias("start_y")
        , pl.col("x").last().over("PlayKey").alias("end_x")
        , pl.col("y").last().over("PlayKey").alias("end_y")
        , pl.col("Angle_Diff").max().over("PlayKey").alias("Max_Angle_Diff")
        , pl.col("Angle_Diff").mean().over("PlayKey").alias("Mean_Angle_Diff")
        , pl.col("Speed").max().over("PlayKey").alias("Max_Speed")
        , pl.col("Speed").mean().over("PlayKey").alias("Mean_Speed")
        , pl.col("a_magnitude").max().over("PlayKey").alias("Max_Accel")
        , pl.col("a_magnitude").mean().over("PlayKey").alias("Mean_Accel")
        , pl.col("omega_dir").max().over("PlayKey").alias("Max_omega_dir")
        , pl.col("omega_dir").mean().over("PlayKey").alias("Mean_omega_dir")
        , pl.col("omega_o").max().over("PlayKey").alias("Max_omega_o")
        , pl.col("omega_o").mean().over("PlayKey").alias("Mean_omega_o")
        , pl.col("omega_diff").max().over("PlayKey").alias("Max_d_omega")
        , pl.col("omega_diff").mean().over("PlayKey").alias("Mean_d_omega")
        , pl.col("alpha_dir").max().over("PlayKey").alias("Max_alpha_dir")
        , pl.col("alpha_dir").mean().over("PlayKey").alias("Mean_alpha_dir")
        , pl.col("alpha_o").max().over("PlayKey").alias("Max_alpha_o")
        , pl.col("alpha_o").mean().over("PlayKey").alias("Mean_alpha_o")
        ]).unique(subset=["PlayKey"])


    # Calculate the displacement
    result = result.with_columns([
        (((pl.col("end_x") - pl.col("start_x"))**2 + 
          (pl.col("end_y") - pl.col("start_y"))**2)**0.5)
        .alias("Displacement")
        ]).with_columns([
            (pl.col("Distance") - pl.col("Displacement")).alias("Path_Diff")
        ])

     
    # Select only the required columns
    result = result.select([
        'PlayKey'
        , 'Distance'
        , 'Displacement'
        , 'Path_Diff'
        , 'Max_Angle_Diff'
        , 'Mean_Angle_Diff'
        , 'Max_Speed'
        , 'Mean_Speed'
        , 'Max_omega_dir'
        , 'Mean_omega_dir'
        , 'Max_omega_o'
        , 'Mean_omega_o'
        , 'Max_d_omega'
        , 'Mean_d_omega'
    ]).sort("PlayKey")


    return result

In [134]:
summary = path_calculator(tracks)
summary.head()

PlayKey,Distance,Displacement,Path_Diff,Max_Angle_Diff,Mean_Angle_Diff,Max_Speed,Mean_Speed,Max_omega_dir,Mean_omega_dir,Max_omega_o,Mean_omega_o,Max_d_omega,Mean_d_omega
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""22127-516-3021""",11.34505,10.990455,0.354595,150.309998,135.056412,4.295342,3.241442,0.069807,-0.008079,0.762708,-0.117585,1.464328,0.453087
"""23449-518-1480""",26.031353,25.446819,0.584534,100.459999,65.97084,130.178711,5.658988,-0.013959,-0.185916,1.893684,-0.017719,2.532476,0.425975
"""26714-516-3021""",50.939556,37.09502,13.844536,179.850006,87.744713,7.566373,2.877941,61.643288,-0.116651,62.217503,0.095904,61.946987,2.903519
"""27008-516-3021""",5.314974,4.958649,0.356325,10.89,3.264782,3.383787,2.415897,0.235615,0.09877,0.801106,0.179292,0.750484,0.216341
"""27442-518-116""",0.01,0.01,0.0,123.120003,119.309998,0.1,0.1,1.272341,1.272341,-0.057592,-0.057592,1.329934,1.329934
