In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import lru_cache

In [31]:
@lru_cache(maxsize=None)
def fetch_data_from_source(parquet_file):
    """Download one parquet file from the course S3 bucket as a DataFrame.

    Args:
        parquet_file: Object name inside the bucket; it is appended to the
            bucket URL as-is.

    Returns:
        The DataFrame produced by ``pd.read_parquet``. Results are memoised
        per file name by ``lru_cache``, so repeated calls with the same name
        return the very same DataFrame object without downloading again --
        in-place changes to it are visible to every later caller.
    """
    url = f"https://data3001-racing.s3.ap-southeast-2.amazonaws.com/{parquet_file}"
    return pd.read_parquet(url)

def fetch_data_csv(parquet_file):
    """Download one CSV file from the course S3 bucket as a DataFrame.

    Args:
        parquet_file: Object name inside the bucket, appended to the bucket
            URL as-is. Despite the parameter name, the object is parsed with
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``. Unlike
        ``fetch_data_from_source`` this function is not cached, so every
        call reads the file again.
    """
    url = f"https://data3001-racing.s3.ap-southeast-2.amazonaws.com/{parquet_file}"
    return pd.read_csv(url)

In [32]:
# Primary-key sanity check: is any "SESSION_IDENTIFIER" shared between the two seasons?

df22 = fetch_data_from_source('f1sim-data-2022.parquet')
df23 = fetch_data_from_source('f1sim-data-2023.parquet')
df = pd.concat([df22, df23], ignore_index=True)

# Distinct identifier counts per season and for the combined frame
# (dropna=False keeps a missing value counted as one distinct entry).
si22 = df22["SESSION_IDENTIFIER"].nunique(dropna=False)
si23 = df23["SESSION_IDENTIFIER"].nunique(dropna=False)
si_final = df["SESSION_IDENTIFIER"].nunique(dropna=False)
print(si_final == si22 + si23)

# The check prints True: the per-season distinct counts add up to the combined
# distinct count, so no "SESSION_IDENTIFIER" value appears in both dfs.

True


In [45]:
# Inputs: N/A
# Outputs: df with both datasets appended to each other (sorted by "SESSION_IDENTIFIER" then "FRAME")

def fetch_and_merge():
    """Load the 2022 and 2023 datasets and return them as one sorted frame.

    Returns:
        A new DataFrame holding the rows of both parquet files, ordered by
        "SESSION_IDENTIFIER" and then "FRAME", with a fresh 0..n-1 index.
    """
    seasons = [
        fetch_data_from_source('f1sim-data-2022.parquet'),
        fetch_data_from_source('f1sim-data-2023.parquet'),
    ]
    merged = (
        pd.concat(seasons, ignore_index=True)
        .sort_values(by=["SESSION_IDENTIFIER", "FRAME"])
        .reset_index(drop=True)
    )
    # Guard against rows being lost or duplicated while combining.
    assert len(merged) == sum(len(season) for season in seasons)
    return merged

In [34]:
# Inputs: Sector {1,2,3}
# Outputs: df of records with data from that sector

def split_sector(s, df=None):
    """Return the rows whose "SECTOR" column equals ``s``.

    Args:
        s: Sector value to keep; compared for equality against "SECTOR".
        df: Optional frame to filter. When omitted, the merged dataset from
            ``fetch_and_merge()`` is used, which matches the original
            behaviour. Passing an already-merged frame avoids repeating the
            concat and sort on every call.

    Returns:
        A DataFrame containing only the matching rows; the input frame is
        not modified and the original index labels are kept.
    """
    if df is None:
        df = fetch_and_merge()
    return df[df["SECTOR"] == s]

In [35]:
# Inputs: df, time
# Outputs: df with records where lap time is under time given
def lap_under(df, t):
    """Keep only the rows whose "LAP_TIME_MS" is strictly below ``t``.

    Args:
        df: DataFrame with a "LAP_TIME_MS" column.
        t: Threshold compared against "LAP_TIME_MS".

    Returns:
        A DataFrame with the matching rows; ``df`` itself is left untouched.
    """
    is_faster = df["LAP_TIME_MS"] < t
    return df[is_faster]

In [41]:
# Inputs: df, sector {1,2,3}, time (in ms)
# Outputs: df where the time set in the sector is less than the time given

def sector_under(df, s, t):
    """Keep only the rows whose time in sector ``s`` is strictly below ``t``.

    Args:
        df: DataFrame containing the sector-time column for the chosen sector.
        s: Sector number; must be 1, 2 or 3.
        t: Threshold compared against the sector-time column.

    Returns:
        A DataFrame with the matching rows; ``df`` itself is left untouched.

    Raises:
        ValueError: If ``s`` is not 1, 2 or 3.
    """
    # NOTE(review): the sector-3 column name has no "_TIME" part, unlike
    # sectors 1 and 2. It is kept exactly as written -- confirm against the
    # dataset schema.
    sector_columns = {
        1: 'SECTOR_1_TIME_MS',
        2: 'SECTOR_2_TIME_MS',
        3: 'SECTOR_3_MS',
    }
    try:
        column = sector_columns[s]
    except (KeyError, TypeError):
        # KeyError: unknown sector; TypeError: unhashable value such as a list.
        raise ValueError(f"s must be 1, 2 or 3, got {s!r}") from None

    return df[df[column] < t]