In [1]:
import pandas as pd
import numpy as np
import matplotlib
import glob
import os
import re
from datetime import datetime, timedelta

### Load data

In [2]:
def load_and_concat_csv(folder_path, chunksize=None):
    """Read every ``*.csv`` file in ``folder_path`` and stack them into one DataFrame.

    Each file's rows get a ``source_file`` column holding the file's base name.
    Files that cannot be read are reported with ``print`` and skipped.

    :param folder_path: Directory that is searched (non-recursively) for ``*.csv`` files
    :param chunksize: Rows per chunk when reading; ``None`` reads each file in one go
    :return: The concatenated DataFrame with a fresh 0..n-1 index
    :raises ValueError: If no file could be loaded
    """
    # Sort so the row order of the result does not depend on directory listing order.
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    df_list = []

    for filename in all_files:
        try:
            if chunksize:
                # With a chunksize, read_csv returns an iterator of DataFrames.
                with pd.read_csv(filename, chunksize=chunksize,
                                 low_memory=False, encoding='utf-8') as reader:
                    df = pd.concat(reader, ignore_index=True)
            else:
                # Without a chunksize, read_csv already returns a DataFrame
                # (it is not an iterator, so next() must not be called on it).
                df = pd.read_csv(filename, low_memory=False, encoding='utf-8')

            df['source_file'] = os.path.basename(filename)
            df_list.append(df)
        except Exception as e:
            # Best effort: report the broken file and keep loading the others.
            print(f"Error reading file {filename}: {str(e)}")

    if not df_list:
        raise ValueError(f"No CSV files could be loaded from {folder_path!r}")

    return pd.concat(df_list, ignore_index=True, sort=False)

In [3]:
# Load every scraped results file from the output folder into one frame.
folder_path = "../output/"
result_df = load_and_concat_csv(folder_path, chunksize=100000)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8009673 entries, 0 to 8009672
Data columns (total 23 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   Rank                               object 
 1   Performance                        object 
 2   Surname, first name                object 
 3   Club                               object 
 4   Nat.                               object 
 5   YOB                                object 
 6   M/F                                object 
 7   Rank M/F                           float64
 8   Cat                                object 
 9   Cat. Rank                          float64
 10  Avg.Speed km/h                     object 
 11  Age graded performance             object 
 12  Runner ID                          object 
 13  Event                              object 
 14  Date                               object 
 15  Distance                           object 
 16  Finishers         

In [21]:
def sample_csv(input_file, output_file, n, random_state=None):
    """
    Write a random subset of a CSV file's rows to another CSV file.

    :param input_file: Path of the CSV file to draw rows from
    :param output_file: Path the sampled rows are written to (without the index)
    :param n: How many rows to draw
    :param random_state: Optional seed forwarded to ``DataFrame.sample``
    """
    # Load, sample and export in a single pipeline.
    (
        pd.read_csv(input_file)
        .sample(n=n, random_state=random_state)
        .to_csv(output_file, index=False)
    )

    print(f"Sampled {n} rows from {input_file} and saved to {output_file}")

# Usage
# Draw a small sample of the 2024 results for quick inspection.
input_file = '../output/all_events_data_2024.csv'
output_file = '2024_sample.csv'
sample_size = 500
sample_seed = 42  # fixed seed so re-running the cell reproduces the same sample

sample_csv(input_file, output_file, sample_size, random_state=sample_seed)

Sampled 500 rows from /Users/riz/Projects/duvdata/output/all_events_data_2024.csv and saved to 2024_sample.csv


### Clean data

In [52]:
# Load the working extract that the cleaning cells below operate on.
df = pd.read_csv("../tmp.csv")
# Debug helper (disabled): show the rows of a single event by its ID.
#df[df['Event ID']==105226]

In [39]:
def runner_name(data):
    """Split the combined ``'Surname, first name'`` column into two columns.

    Adds ``'Surname'`` and ``'First Name'`` to ``data`` (in place). The name is
    split on the first comma only, so extra commas stay in the first name, and
    both parts are stripped of surrounding whitespace. A name without a comma
    yields a missing first name. If the original-name column is present, it is
    used to fill any missing surname / first name.

    :param data: DataFrame with a ``'Surname, first name'`` column
    :return: The same DataFrame with the two new columns
    """
    def _clean(value):
        # Only strings can be stripped; missing values pass through untouched.
        return value.strip() if isinstance(value, str) else value

    # n=1 caps the split at two parts; reindex guarantees both parts exist
    # even when no name in the column contains a comma.
    parts = (
        data['Surname, first name']
        .str.split(',', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    data['Surname'] = parts[0].map(_clean)
    data['First Name'] = parts[1].map(_clean)

    original_name_column = 'Original name\nSurname, first name'
    if original_name_column in data.columns:
        data['First Name'] = data['First Name'].fillna(data[original_name_column])
        data['Surname'] = data['Surname'].fillna(data[original_name_column])

    return data

# Patterns shared by both performance parsers.
# Time looks like '6:20:12 h' or '1d 02:03:04 h'; the hours group accepts any
# number of digits so a value such as '123:45:06 h' is not silently truncated.
_PERF_TIME_RE = re.compile(r'(?:(\d+)d )?(\d+):(\d{2}):(\d{2}) h')
_PERF_DISTANCE_RE = re.compile(r'(\d+\.?\d*) km')


def _parse_performance_entry(entry):
    """Return ``(seconds, kilometres)`` for one performance string.

    Exactly one of the two is set for a recognised entry; both are ``None``
    for missing or unrecognised input.
    """
    if not isinstance(entry, str):
        return None, None  # NaN / None / non-text cell

    if 'km' in entry:  # distance result
        match = _PERF_DISTANCE_RE.search(entry)
        return (None, float(match.group(1))) if match else (None, None)

    match = _PERF_TIME_RE.search(entry)  # time result
    if not match:
        return None, None
    # A missing day component defaults to 0.
    days, hours, minutes, seconds = (int(part) for part in match.groups(default='0'))
    elapsed = timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
    return elapsed.total_seconds(), None


def _add_performance_columns(data, source_column, time_column, distance_column):
    """Parse ``source_column`` and store the results as two float columns."""
    parsed = [_parse_performance_entry(entry) for entry in data[source_column]]
    # An explicit float dtype keeps the columns numeric even when every value is missing.
    data[time_column] = pd.Series(
        [seconds for seconds, _ in parsed], index=data.index, dtype='float64')
    data[distance_column] = pd.Series(
        [km for _, km in parsed], index=data.index, dtype='float64')
    return data


def parse_performance(data):
    """Parse the ``'Performance'`` column into numeric columns.

    Adds ``'Time Seconds Finish'`` (total seconds, for time results) and
    ``'Distance Finish'`` (kilometres, for distance results) to ``data`` in
    place; the column that does not apply to a row is NaN.

    :param data: DataFrame with a ``'Performance'`` column
    :return: The same DataFrame with the two new columns
    """
    return _add_performance_columns(
        data, 'Performance', 'Time Seconds Finish', 'Distance Finish')


def parse_performance_winner(data):
    """Parse the ``'Winner Time'`` column into numeric columns.

    Adds ``'Time Seconds Winner'`` and ``'Distance Winner'`` to ``data`` in
    place, using the same rules as ``parse_performance``.

    :param data: DataFrame with a ``'Winner Time'`` column
    :return: The same DataFrame with the two new columns
    """
    return _add_performance_columns(
        data, 'Winner Time', 'Time Seconds Winner', 'Distance Winner')

def split_distance_column(df):
    """Split the ``'Distance'`` column into a distance/time part and a description.

    Adds two columns to ``df`` in place:

    * ``'Distance/Time'`` - the leading number plus unit, e.g. ``'50 km'``,
      ``'100mi'`` or ``'24h'``
    * ``'Terrain'`` - whatever text follows that part on the same line, stripped

    Rows whose value is missing, not a string, or has no number+unit part get
    ``None`` in both columns.

    :param df: DataFrame with a ``'Distance'`` column
    :return: The same DataFrame with the two new columns
    """
    # Group 1: number + unit (km, mi or h); group 2: the remainder of the line.
    pattern = re.compile(r'(\d+\.?\d*\s*(?:km|mi|h))\s*(.*)')

    distances = []
    race_types = []

    for entry in df['Distance']:
        # Missing cells arrive as NaN (a float), which a regex cannot search.
        match = pattern.search(entry) if isinstance(entry, str) else None
        if match:
            distances.append(match.group(1))
            race_types.append(match.group(2).strip())
        else:
            distances.append(None)
            race_types.append(None)

    df['Distance/Time'] = distances
    df['Terrain'] = race_types
    return df

def convert_miles_to_km(entry):
    """Convert a distance string such as ``'50km'``, ``'50 km'`` or ``'100mi'`` to kilometres.

    :param entry: Distance text starting with a number followed by ``km`` or ``mi``
        (whitespace between number and unit is allowed)
    :return: The distance in kilometres as a float, or ``None`` for missing,
        non-string or unrecognised input (e.g. time-based values like ``'24h'``)
    """
    km_per_mile = 1.609344  # exact length of the international mile in km

    if not isinstance(entry, str):
        return None  # covers NaN / None as well as any other non-text value

    # Optional whitespace before the unit, because the distance text may be
    # written as '50 km' as well as '50km'.
    match = re.match(r'\s*(\d+\.?\d*)\s*(mi|km)', entry)
    if not match:
        return None

    distance = float(match.group(1))
    if match.group(2) == 'mi':
        return distance * km_per_mile
    return distance

def convert_seconds(seconds):
    """Render a duration in seconds as ``timedelta`` text, e.g. ``3661`` -> ``'1:01:01'``."""
    return str(timedelta(seconds=seconds))

def standardize_terrain(terrain):
    """Map a free-text race description to ``'trail'``, ``'road'``, ``'track'`` or ``'other'``.

    The check is case-insensitive and the first keyword found in the order
    trail, road, track wins.

    :param terrain: Race description text; may be ``None``/NaN
    :return: One of ``'trail'``, ``'road'``, ``'track'``, ``'other'``
    """
    # Rows without a description arrive as None/NaN, which has no .lower().
    if not isinstance(terrain, str):
        return 'other'

    lowered = terrain.lower()
    for label in ('trail', 'road', 'track'):
        if label in lowered:
            return label
    return 'other'
    
def parse_date_range(date_str):
    """Reduce a date range such as ``'31.08.-01.09.2024'`` to its start date.

    Components missing from the start of the range are borrowed from the end:
    ``'05.-06.10.2024'`` -> ``'05.10.2024'``. A year given in the start date is
    kept (``'31.12.2023-01.01.2024'`` -> ``'31.12.2023'``). If neither side
    carries a year, the current year is used.

    :param date_str: A single date or a ``start-end`` range
    :return: The start date as ``DD.MM.YYYY``; the input is returned unchanged
        when it is not a string, is not a single ``start-end`` range, or lacks
        a day or month
    """
    # Single dates, missing values and anything with more than one dash are
    # passed through untouched instead of raising.
    if not isinstance(date_str, str) or date_str.count('-') != 1:
        return date_str

    start, end = (part.strip() for part in date_str.split('-'))
    start_parts = re.findall(r'\d+', start)
    end_parts = re.findall(r'\d+', end)

    if not start_parts:
        return date_str  # no start day at all
    day = start_parts[0]

    # Month: from the start date if present, otherwise from the end date.
    if len(start_parts) >= 2:
        month = start_parts[1]
    elif len(end_parts) >= 2:
        month = end_parts[1]
    else:
        return date_str

    # Year: prefer the start date's own year so ranges that cross New Year
    # keep the correct start year.
    if len(start_parts) >= 3:
        year = start_parts[2]
    elif len(end_parts) == 3:
        year = end_parts[-1]
    else:
        year = pd.Timestamp.now().year

    return f"{day.zfill(2)}.{month.zfill(2)}.{year}"

def extract_location(event):
    """Return the code in the trailing parentheses of an event name.

    For example ``'Some Race (FRA)'`` -> ``'FRA'``.

    :param event: Event name text; may be ``None``/NaN
    :return: The word inside the final ``(...)``, or ``'Unknown'`` when the
        value is not a string or does not end with such a group
    """
    if not isinstance(event, str):
        return 'Unknown'  # missing cells cannot be searched
    # Tolerate whitespace after the closing parenthesis.
    match = re.search(r'\((\w+)\)\s*$', event)
    return match.group(1) if match else 'Unknown'


In [58]:
# Derive name, distance and performance columns from the raw text columns.
df = runner_name(df)
df = split_distance_column(df)
df = parse_performance(df)
df = parse_performance_winner(df)

df['Terrain'] = df['Terrain'].apply(standardize_terrain)

# Race distance in whole kilometres. to_numeric keeps the column float even if
# no row has a distance, and 0 is treated as missing (np.nan keeps the float
# dtype, whereas pd.NA would turn the column into object).
df['Distance KM'] = (
    pd.to_numeric(df['Distance/Time'].apply(convert_miles_to_km), errors='coerce')
    .round(0)
    .replace(0, np.nan)
)

# Leading number of e.g. "128 (120 M, 8 F)". Nullable Int64 so rows without a
# finisher count do not make the conversion fail.
df['Total Finishers'] = pd.to_numeric(
    df['Finishers'].str.extract(r'^(\d+)', expand=False), errors='coerce'
).astype('Int64')

# Overall rank as a fraction of the field. Rank is coerced because it may hold
# non-numeric text. The former `.round(2)` applied only to the denominator
# (operator precedence) and had no effect, so it is dropped here and below.
df['Finish Percentage'] = (
    pd.to_numeric(df['Rank'], errors='coerce') / df['Total Finishers'].astype('float64')
)

# NOTE(review): the saved preview of this frame shows 1.0 for race winners,
# which matches winner/finish rather than 1 - winner/finish - confirm which
# definition is intended.
df['Winner Time Percentage'] = 1 - df['Time Seconds Winner'] / df['Time Seconds Finish']

# NOTE: despite the column name this is a pace in seconds per kilometre.
df['Average Speed'] = df['Time Seconds Finish'] / df['Distance KM']
df['Race Location'] = df['Event'].apply(extract_location)

In [59]:
# Preview the first rows to sanity-check the derived columns.
df.head()

Unnamed: 0,Rank,Performance,"Surname, first name",Club,Nat.,YOB,M/F,Rank M/F,Cat,Cat. Rank,...,Time Seconds Finish,Distance Finish,Time Seconds Winner,Distance Winner,Distance KM,Total Finishers,Finish Percentage,Winner Time Percentage,Average Speed,Race Location
0,1,6:20:12 h,"Duong, Tam",,AUS,1983,M,1,M40,1,...,22812.0,,22812.0,,50.0,1,1.0,1.0,456.24,AUS
1,1,7:53:07 h,"Saji, Alen",,IND,1997,M,1,M23,1,...,28387.0,,28387.0,,63.0,1,1.0,1.0,450.587302,AUS
2,1,5:08:53 h,"Chatton, Guillaume",SL CSL NeufBrisach,FRA,1984,M,1,M40,1,...,18533.0,,18533.0,,59.0,281,0.003559,1.0,314.118644,FRA
3,2,5:12:04 h,"Eneros Rilling, Sebastian",,CHI,1988,M,2,M35,1,...,18724.0,,18533.0,,59.0,281,0.007117,0.989799,317.355932,FRA
4,3,5:16:04 h,"Stritt, Karim-Henri",,FRA,1983,M,3,M40,2,...,18964.0,,18533.0,,59.0,281,0.010676,0.977273,321.423729,FRA


In [53]:
# Convert 'Date' to datetime
# Convert 'Date' to datetime (ranges are reduced to their start date first)
df['Date'] = pd.to_datetime(df['Date'].apply(parse_date_range), format='%d.%m.%Y')

# Convert 'YOB' to integer, handling any non-numeric values
df['YOB'] = pd.to_numeric(df['YOB'], errors='coerce').astype('Int64')

# Convert 'Avg.Speed km/h' to float; unparseable values become NaN instead of
# aborting the cell, matching how the other numeric columns are handled
df['Avg.Speed km/h'] = pd.to_numeric(df['Avg.Speed km/h'], errors='coerce')

# Convert 'Elevation Gain' to numeric, removing 'm' and 'Hm' together with any
# whitespace around the unit
df['Elevation Gain'] = df['Elevation Gain'].replace(r'\s*(?:Hm|m)\s*', '', regex=True)
df['Elevation Gain'] = pd.to_numeric(df['Elevation Gain'], errors='coerce')

# Handle any missing values
df = df.fillna({'M/F': 'Unknown', 'Cat': 'Unknown'})

# Normalise text columns: drop punctuation from club names, upper-case nation codes
df['Club'] = df['Club'].str.strip().str.replace(r'[^\w\s]', '', regex=True)
df['Nat.'] = df['Nat.'].str.strip().str.upper()

# Age in the year of the race (year difference only)
df['Age'] = df['Date'].dt.year - df['YOB']

In [55]:
# NOTE(review): repeats the numeric conversion already applied to
# 'Elevation Gain' in the previous cell; values that are not numeric become NaN.
df['Elevation Gain'] = pd.to_numeric(df['Elevation Gain'], errors='coerce')


Unnamed: 0,Rank,Performance,"Surname, first name",Club,Nat.,YOB,M/F,Rank M/F,Cat,Cat. Rank,...,Event,Date,Distance,Finishers,Winner Time,Elevation Gain,Event ID,"Original name\nSurname, first name",hours,Age
345,1,11:58:06 h,"Klipfel, Gilles",SL US Palaiseau,FRA,1983,M,1,M40,1,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,41
346,2,12:04:21 h,"Knittel, Stephane",,FRA,1985,M,2,M35,1,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,39
347,3,12:08:34 h,"Abas, Frederic",,FRA,1979,M,3,M40,2,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,45
348,4,12:24:52 h,"Huck, Thibaut",,FRA,1990,M,4,M23,1,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,34
349,5,13:13:43 h,"Garcia, Esteban",,FRA,1992,M,5,M23,2,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,124,20:31:28 h,"Zanna, David",,FRA,1981,M,116,M40,18,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,43
469,125,20:35:31 h,"Baiz, Taoufik",,MAR,1991,M,117,M23,40,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,33
470,126,20:37:05 h,"Fourcault, Damien",,FRA,1997,M,118,M23,41,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,27
471,127,20:38:57 h,"Cisse, Jonathan",,FRA,1998,M,119,M23,42,...,10ème Ultra Trail du Haut-Koenigsbourg (FRA),2024-08-31,102km trail race,"128 (120 M, 8 F)",11:58:06 h,4610.0,105226,,,26


In [60]:
# Final schema, grouped by theme; list order is the column order of the result.
columns_to_keep = [
    # runner
    'Runner ID', 'First Name', 'Surname', 'Nat.', 'M/F', 'Age', 'Cat', 'YOB',
    # event
    'Event', 'Date', 'Race Location', 'Elevation Gain', 'Finishers', 'Total Finishers',
    # placing
    'Rank', 'Rank M/F', 'Cat. Rank', 'Finish Percentage', 'Winner Time Percentage',
    # distance and performance
    'Distance/Time', 'Distance KM', 'Terrain', 'Time Seconds Finish', 'Distance Finish',
    'Average Speed', 'Avg.Speed km/h',
]
df = df[columns_to_keep]

In [61]:
# Write the cleaned frame to disk without the index column.
df.to_csv('../tmp_clean.csv',index=False)