# Setup and Imports

In [1]:
import numpy as np
import os
import pandas as pd
from getpass import getuser


# Data Input

In [2]:
# Data preparation

# Automatically set the user
user = getuser()

# Set working directory
folder_path = f"C:/Users/{user}/Documents/GitHub/false_start_2/data/in"
os.chdir(folder_path)

# Load the data.xlsx file
file_name = "data.xlsx"
file_path = os.path.join(folder_path, file_name)
df = pd.read_excel(file_path)

# Display the first few rows to verify the data
display(df.head())


Unnamed: 0,Event,100m,110m H,100m H,Men,Date,Time,preliminary,prel num,heat,...,race false start,false start,season best,WR (at time of event),CR record (at time of event),indiv personal best,indiv season best,height,weight,debut
0,2011,1,0,0,1,2011-08-27 00:00:00,13:16:00,1,4,0,...,0,0,9.78,9.58,9.58,10.4,10.4,,,
1,2011,1,0,0,1,2011-08-27 00:00:00,22:10:00,0,0,1,...,0,0,9.78,9.58,9.58,10.4,10.4,,,
2,2011,0,1,0,1,2011-08-28 00:00:00,10:11:00,0,0,1,...,0,0,12.94,12.87,12.91,13.57,13.57,,,
3,2007,1,0,0,1,2007-08-25 00:00:00,12:52:00,1,6,0,...,0,0,9.84,9.76,9.8,10.28,10.28,,,
4,1999,1,0,0,1,1999-08-21 00:00:00,12:00:00,1,6,0,...,0,0,9.79,9.79,9.86,,,,,


# Data Cleaning

In [3]:
# Change all variable names to lower case
df.columns = df.columns.str.lower()

# Remove all spaces in variable names
df.columns = df.columns.str.replace(" ", "", regex=False)

# Create a variable for conditional false start for event 1997
df['cond_fs'] = np.where(df['reactiontime'].str.contains("#", na=False), 1, 0)
df['racefalsestart'] = np.where(df['cond_fs'] == 1, 1, df['racefalsestart'])
df['falsestart'] = np.where(df['cond_fs'] == 1, 1, df['falsestart'])

# Clear reaction time with symbols but keep negative values
df['reactiontime'] = pd.to_numeric(df['reactiontime'].str.replace("[^0-9.-]", "", regex=True), errors='coerce')


# Create running time as the difference between finishing time and reaction time
df['runningtime'] = df['finishingtime'] - df['reactiontime']

# Use the strip method for removing leading/trailing spaces
df['country'] = df['country'].str.strip()

# Replace "TRI" with "TTO" in the country column
df['country'] = df['country'].replace("TRI", "TTO")


In [4]:
# Verify that the maximum value has been removed
print(df['reactiontime'].describe())

count    28.000000
mean      0.150071
std       0.021401
min       0.122000
25%       0.133750
50%       0.145000
75%       0.166500
max       0.211000
Name: reactiontime, dtype: float64


In [5]:
# Ensure dob and date columns are in datetime format
df['dob_date'] = pd.to_datetime(df['dob'], errors='coerce')  # Convert dob to datetime
df['event_date'] = pd.to_datetime(df['date'], errors='coerce')  # Convert date to datetime

# Calculate age at the time of the event
df['age_at_event'] = (df['event_date'] - df['dob_date']).dt.days / 365.25

# Convert to nullable Int64, ensuring NaNs are handled properly
df['age_at_event'] = df['age_at_event'].round(0).astype('Int64')

# View the first few rows to verify
print(df[['dob_date', 'event_date', 'age_at_event']].head())


    dob_date event_date  age_at_event
0 1985-05-31 2011-08-27            26
1 1985-05-31 2011-08-27            26
2 1985-07-08 2011-08-28            26
3 1986-06-24 2007-08-25            21
4 1978-11-17 1999-08-21            21


# Feature Engineering

In [6]:

# Convert specific columns to numeric
cols_num = ["reactiontime", "indivpersonalbest", "indivseasonbest"]
df[cols_num] = df[cols_num].apply(pd.to_numeric, errors='coerce')

# Create categorical variable for the type of competition
df['discipline'] = df[['100m', '110mh', '100mh']].idxmax(axis=1)
df['stage'] = df[['preliminary', 'heat', 'semi', 'final']].idxmax(axis=1)

# Order factor stage variable
stage_order = ["preliminary", "heat", "semi", "final"]
df['stage'] = pd.Categorical(df['stage'], categories=stage_order, ordered=True)

# New variables
df['reactiontime_ms'] = df['reactiontime'] * 1000

# Create rules based on the event year
df['rule'] = np.select(
    [df['event'] < 2003, df['event'] < 2010],
    ["rule 1", "rule 2"],
    default="rule 3"
)

df['rule1'] = np.where(df['rule'] == "rule 1", 1, 0)
df['rule2'] = np.where(df['rule'] == "rule 2", 1, 0)
df['rule3'] = np.where(df['rule'] == "rule 3", 1, 0)

# Round all numerical columns to 4 decimals
df = df.round(decimals=4)

# Remove everything after the first hyphen "-" in firstname
df['firstname'] = df['firstname'].str.split('-').str[0]

# Remove everything after the first hyphen "-" in surname
df['surname'] = df['surname'].str.split('-').str[0]

# Convert firstname and surname to lowercase
df['firstname'] = df['firstname'].str.lower()
df['surname'] = df['surname'].str.lower()

# Convert wind from character to numeric
df['wind'] = pd.to_numeric(df['wind'], errors='coerce')

# Convert 'lane' from numeric to factor (categorical)
df['lane'] = df['lane'].astype('category')

# Remove all observations where information on lane is missing
df = df[~df['lane'].isna()]

# Ensure 'dob' is a string
df['dob'] = df['dob'].astype(str)

# Create the athleteID variable by concatenating surname and dob
df['athleteID'] = df['surname'] + "_" + df['dob'].str.replace("/", "", regex=True)

# Create the early_stages variable for preliminary + heat
df['early_stages'] = np.where((df['preliminary'] == 1) | (df['heat'] == 1), 1, 0)


In [7]:
# Sort the DataFrame by the 'rule' column
df = df.sort_values(by='rule')

# Filter the DataFrame to include only finalists for the "100m" discipline
finalists_df = df[(df['final'] == 1) & (df['discipline'] == "100m")]

# Group by 'event' and 'men' and count occurrences
finalists_event = finalists_df.groupby(['event', 'men']).size().reset_index(name='count')

# View the resulting DataFrame
display(finalists_event)


Unnamed: 0,event,men,count
0,1997,0,8
1,1997,1,8
2,1999,0,8
3,1999,1,8
4,2001,0,8
5,2001,1,8
6,2003,0,8
7,2003,1,8
8,2005,0,8
9,2005,1,8


In [8]:
# Perform a semi-join to keep all observations in df that match athleteID, event, men, and discipline in finalists_df
df_finalists = df.merge(finalists_df[['athleteID', 'event', 'men', 'discipline']], 
                        on=['athleteID', 'event', 'men', 'discipline'], 
                        how='inner')

# Group by athleteID, event, discipline, and men to count observations
event_counts = (
    df_finalists
    .groupby(['athleteID', 'event', 'discipline', 'men'])
    .size()
    .reset_index(name='observations')
)

# View the resulting DataFrame
display(event_counts)


Unnamed: 0,athleteID,event,discipline,men,observations
0,ahouré_1987-08-23,2013,100m,0,3
1,ahouré_1987-08-23,2017,100m,0,3
2,ahouré_1997-08-23,2019,100m,0,1
3,ahye_1992-04-10,2015,100m,0,3
4,ahye_1992-04-10,2017,100m,0,3
...,...,...,...,...,...
188,williams4_1983-09-11,2005,100m,0,4
189,williams4_1983-09-11,2007,100m,0,4
190,williams4_1983-09-11,2009,100m,0,4
191,zakari_1976-09-02,2001,100m,1,4


In [9]:
# Ensure the 'date' column is in datetime format
df_finalists['date'] = pd.to_datetime(df_finalists['date'], errors='coerce')

# Calculate the date difference
df2_hm_finalists = (
    df_finalists
    .sort_values(['event', 'athleteID', 'date'])  # Ensure sorting before calculating lag
    .assign(date_difference=lambda x: x.groupby(['event', 'athleteID'])['date'].diff().dt.days)  # Date difference in days
)

# Create variables for reaction time differences
df2_hm_finalists = (
    df2_hm_finalists[df2_hm_finalists['discipline'] == "100m"]  # Filter for "100m"
    .groupby(['event', 'athleteID'], as_index=False)
    .agg(
        max_reactiontime=('reactiontime_ms', 'max'),
        min_reactiontime=('reactiontime_ms', 'min')
    )
    .assign(difference=lambda x: x['max_reactiontime'] - x['min_reactiontime'])
)

# Filter for finalists and non-missing differences
df2_hm_finalists = df2_hm_finalists.merge(
    df_finalists[df_finalists['final'] == 1][['event', 'athleteID']],
    on=['event', 'athleteID']
).dropna(subset=['difference'])

# Display the resulting DataFrame
print(df2_hm_finalists)


   event         athleteID  max_reactiontime  min_reactiontime  difference
6   1997  fynes_1974-10-17             184.0             184.0         0.0


# Filtering and Final Adjustments

In [10]:
# Check if 'rule' exists
if 'rule' not in df2_hm_finalists.columns:
    # Merge 'rule' from the original DataFrame if it exists there
    df2_hm_finalists = df2_hm_finalists.merge(
        df_finalists[['event', 'athleteID', 'rule']],  # Adjust based on available columns
        on=['event', 'athleteID'],
        how='left'
    )

# Create dummy variables for consistency
df2_hm_finalists = df2_hm_finalists.assign(
    consistent=lambda x: (x['difference'] < 20).astype(int),
    intermediate=lambda x: ((x['difference'] >= 20) & (x['difference'] <= 50)).astype(int),
    inconsistent=lambda x: (x['difference'] > 50).astype(int)
)

# Create dummy variables for the "rule" variable
df2_hm_finalists = df2_hm_finalists.assign(
    rule1=lambda x: (x['rule'] == "rule 1").astype(int),
    rule2=lambda x: (x['rule'] == "rule 2").astype(int),
    rule3=lambda x: (x['rule'] == "rule 3").astype(int)
)

# Display the resulting DataFrame
display(df2_hm_finalists.head())


Unnamed: 0,event,athleteID,max_reactiontime,min_reactiontime,difference,rule,consistent,intermediate,inconsistent,rule1,rule2,rule3
0,1997,fynes_1974-10-17,184.0,184.0,0.0,rule 1,1,0,0,1,0,0
1,1997,fynes_1974-10-17,184.0,184.0,0.0,rule 1,1,0,0,1,0,0
2,1997,fynes_1974-10-17,184.0,184.0,0.0,rule 1,1,0,0,1,0,0


# Exporting final datasets

In [11]:
# Define the output folder path
folder_path = f"C:/Users/{user}/Documents/GitHub/false_start_2/data/out"

# Ensure the directory exists
os.makedirs(folder_path, exist_ok=True)

# File paths for each DataFrame
df_file_path = os.path.join(folder_path, "df.csv")
df_finalists_file_path = os.path.join(folder_path, "df_finalists.csv")
finalists_df_file_path = os.path.join(folder_path, "finalists_df.csv")

# Export DataFrames to CSV
df.to_csv(df_file_path, index=False)
df_finalists.to_csv(df_finalists_file_path, index=False)
finalists_df.to_csv(finalists_df_file_path, index=False)

print(f"DataFrames exported to {folder_path}")


DataFrames exported to C:/Users/aldi/Documents/GitHub/false_start_2/data/out
