# Setup: Import Libraries and Define Paths


In [65]:
import os
import pandas as pd
from getpass import getuser
import numpy as np


In [66]:
# Automatically set the user
user = getuser()

# Define paths
base_path = f"C:/Users/{user}/Documents/GitHub/false_start_2/data"
data_in = os.path.join(base_path, 'in')
data_out = os.path.join(base_path, 'out')

# Ensure output directory exists
os.makedirs(data_out, exist_ok=True)

In [67]:

# File path
df_path = os.path.join(data_out, "df.csv")
df_finalists_path = os.path.join(data_out, "df_finalists.csv")

In [68]:
# Load data
df = pd.read_csv(df_path)
df_finalists = pd.read_csv(df_finalists_path)


# Data Filtring


In [69]:
# Filter for 100m
df_hm = df[df['discipline'] == '100m']

# Filter for 100m men
df_hm_men = df[(df['discipline'] == '100m') & (df['men'] == 1)]
df_hm_men_finalists = df_finalists[(df_finalists['discipline'] == '100m') & (df_finalists['men'] == 1)]

# Filter for 100m women
df_hm_women = df[(df['discipline'] == '100m') & (df['men'] == 0)]
df_hm_women_finalists = df_finalists[(df_finalists['discipline'] == '100m') & (df_finalists['men'] == 0)]


## missing values

In [70]:
# Specify the columns to check for missing values
columns_to_check = [
    "reactiontime_ms", "age_at_event", "finishingtime", 
    "semi", "final", "rule", "wind", "country", "lane"
]

# Count missing values for each column of interest
missing_values = df[columns_to_check].isna().sum()
print("Missing values per column:")
display(missing_values)

# Create indicator columns for missing values in each variable
for col in columns_to_check:
    df[f"{col}_missing"] = df[col].isna()

# Count the number of missing values in each row for the specified columns
df['missing_count'] = df[[f"{col}_missing" for col in columns_to_check]].sum(axis=1)

# Filter rows with more than one missing value
observations_with_multiple_missing = df.loc[df['missing_count'] > 1, columns_to_check + [f"{col}_missing" for col in columns_to_check] + ['missing_count']]

if not observations_with_multiple_missing.empty:
    print("Observations with more than one missing value:")
    display(observations_with_multiple_missing)
else:
    print("No observations with more than one missing value.")

# Clean up the temporary columns after the analysis
df = df.drop(columns=[f"{col}_missing" for col in columns_to_check] + ['missing_count'])
display(df.head())
# Remove observations with missing values in any of the specified columns
# df = df.dropna(subset=columns_to_check)


Missing values per column:


reactiontime_ms    4278
age_at_event         31
finishingtime       181
semi                  0
final                 0
rule                  0
wind                 39
country               2
lane                  0
dtype: int64

Observations with more than one missing value:


Unnamed: 0,reactiontime_ms,age_at_event,finishingtime,semi,final,rule,wind,country,lane,reactiontime_ms_missing,age_at_event_missing,finishingtime_missing,semi_missing,final_missing,rule_missing,wind_missing,country_missing,lane_missing,missing_count
2,,25.0,10.46,0,0,rule 1,,RSA,7.0,True,False,False,False,False,False,True,False,False,2
54,,26.0,,0,0,rule 1,-0.5,CAF,3.0,True,False,True,False,False,False,False,False,False,2
60,,24.0,10.34,0,0,rule 1,,FRA,8.0,True,False,False,False,False,False,True,False,False,2
74,,24.0,,0,0,rule 1,-0.3,MHL,3.0,True,False,True,False,False,False,False,False,False,2
87,,25.0,10.03,0,0,rule 1,,BAR,4.0,True,False,False,False,False,False,True,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4243,,23.0,,0,0,rule 3,-1.4,RSA,5.0,True,False,True,False,False,False,False,False,False,2
4245,,29.0,,0,0,rule 3,-1.3,GER,8.0,True,False,True,False,False,False,False,False,False,2
4246,,27.0,,0,0,rule 3,-0.6,TTO,4.0,True,False,True,False,False,False,False,False,False,2
4253,,32.0,,0,0,rule 3,-0.4,HAI,7.0,True,False,True,False,False,False,False,False,False,2


Unnamed: 0,event,100m,110mh,100mh,men,date,time,preliminary,prelnum,heat,...,age_at_event,discipline,stage,reactiontime_ms,rule,rule1,rule2,rule3,athleteID,early_stages
0,1997,1,0,0,1,1997-08-03 00:00:00,18:50:00,0,0,0,...,30.0,100m,semi,,rule 1,1,0,0,surin_1967-07-12,0
1,2001,1,0,0,1,2001-08-04 00:00:00,11:00:00,1,6,0,...,25.0,100m,preliminary,,rule 1,1,0,0,quinn_1976-04-17,1
2,2001,1,0,0,1,2001-08-04 00:00:00,16:11:00,0,0,1,...,25.0,100m,heat,,rule 1,1,0,0,quinn_1976-04-17,1
3,1999,1,0,0,1,1999-08-21 00:00:00,12:24:00,1,10,0,...,23.0,100m,preliminary,,rule 1,1,0,0,quinn_1976-04-17,1
4,1997,1,0,0,0,1997-08-03 00:00:00,21:35:00,0,0,0,...,27.0,100m,final,,rule 1,1,0,0,paschke_1970-06-29,0


In [73]:
# Group by 'rule' and 'men' and calculate summary statistics
table1 = (
    df.groupby(['rule', 'men'])
    .apply(lambda group: pd.Series({
        'Mean_ReactionTime': group['reactiontime'].mean(skipna=True),
        'SD_ReactionTime': group['reactiontime'].std(skipna=True),
        'Num_FalseStarts': (group['falsestart'] == 1).sum(),
        'Num_Observations': len(group)  # Total rows in the group
    }))
    .reset_index()
)

# Print the summary table
print("Table 1:")
print(table1)

# Calculate overall summary statistics
table1_tot = pd.DataFrame({
    'Mean_ReactionTime': [df['reactiontime'].mean(skipna=True)],
    'SD_ReactionTime': [df['reactiontime'].std(skipna=True)],
    'Num_FalseStarts': [(df['falsestart'] == 1).sum()],
    'Num_Observations': [len(df)]
})

# Print the total summary table
print("\nTable 1 Totals:")
print(table1_tot)


Table 1:
     rule  men  Mean_ReactionTime  SD_ReactionTime  Num_FalseStarts  \
0  rule 1    0           0.149000         0.019274             10.0   
1  rule 1    1           0.154714         0.028022             13.0   
2  rule 2    0           0.166000              NaN              3.0   
3  rule 2    1                NaN              NaN              7.0   
4  rule 3    0                NaN              NaN              6.0   
5  rule 3    1                NaN              NaN             14.0   

   Num_Observations  
0             450.0  
1             584.0  
2             734.0  
3             823.0  
4             797.0  
5             907.0  

Table 1 Totals:
   Mean_ReactionTime  SD_ReactionTime  Num_FalseStarts  Num_Observations
0           0.152353         0.022374               53              4295


  .apply(lambda group: pd.Series({
