In [15]:
import pandas as pd
import numpy as np
import matplotlib
import glob
import os
import re
from datetime import datetime, timedelta

### Load data

In [11]:
def load_and_concat_csv(folder_path, chunksize=None):
    """
    Load every ``*.csv`` file directly inside a folder into one DataFrame.

    Each file is read as UTF-8 and gets a ``source_file`` column holding the
    file's base name. Files that fail to load are reported on stdout and
    skipped, so one bad file does not abort the whole load.

    :param folder_path: Directory searched (non-recursively) for ``*.csv`` files
    :param chunksize: Rows per chunk while reading each file; ``None`` (default)
                      reads each file in a single call
    :return: Combined DataFrame with a fresh integer index
    :raises ValueError: If no file could be loaded (folder empty, wrong path,
                        or every file failed)
    """
    # Sort so the row order of the result does not depend on the order in
    # which the operating system happens to list the directory.
    csv_paths = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    frames = []

    for csv_path in csv_paths:
        try:
            if chunksize:
                # With a chunksize, read_csv returns an iterator of chunks;
                # the context manager closes the file even if a chunk fails.
                with pd.read_csv(csv_path, chunksize=chunksize,
                                 low_memory=False, encoding='utf-8') as reader:
                    frame = pd.concat(reader, ignore_index=True)
            else:
                # Without a chunksize, read_csv returns the DataFrame itself
                # (it is not an iterator, so next() must not be used on it).
                frame = pd.read_csv(csv_path, low_memory=False, encoding='utf-8')

            frame['source_file'] = os.path.basename(csv_path)
            frames.append(frame)
        except Exception as e:
            # Best effort: report the failing file and keep loading the rest.
            print(f"Error reading file {csv_path}: {str(e)}")

    if not frames:
        # pd.concat([]) would raise a ValueError with an opaque message;
        # raise the same exception type with the actual cause instead.
        raise ValueError(f"No CSV files could be loaded from {folder_path!r}")

    # sort=False keeps columns in first-seen order when files differ in schema.
    return pd.concat(frames, ignore_index=True, sort=False)


In [13]:
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere until this points at the local copy of the data.
folder_path = "/Users/riz/Projects/duvdata/output/"
result_df = load_and_concat_csv(folder_path, chunksize=100000)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8009673 entries, 0 to 8009672
Data columns (total 23 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   Rank                               object 
 1   Performance                        object 
 2   Surname, first name                object 
 3   Club                               object 
 4   Nat.                               object 
 5   YOB                                object 
 6   M/F                                object 
 7   Rank M/F                           float64
 8   Cat                                object 
 9   Cat. Rank                          float64
 10  Avg.Speed km/h                     object 
 11  Age graded performance             object 
 12  Runner ID                          object 
 13  Event                              object 
 14  Date                               object 
 15  Distance                           object 
 16  Finishers         

In [14]:
# Preview the first rows of the combined frame as a quick check of the load.
result_df.head()

Unnamed: 0,Rank,Performance,"Surname, first name",Club,Nat.,YOB,M/F,Rank M/F,Cat,Cat. Rank,...,Event,Date,Distance,Finishers,Winner Time,Elevation Gain,Event ID,"Original name\nSurname, first name",source_file,hours
0,1,8:22:27 h,"Trason, Ann",,USA,1960.0,F,1.0,W40,1.0,...,"18th Across the Years, 24h - 100km Split (USA)",31.12.2000,100km track,"1 (0 M, 1 F)",8:22:27 h,,14091.0,,all_events_data_2000.csv,
1,1,6:25:16 h,"Trason, Ann",,USA,1960.0,F,1.0,W40,1.0,...,"18th Across the Years, 24h - 50mi Split (USA)",31.12.2000,50mi track,"1 (0 M, 1 F)",6:25:16 h,,14092.0,,all_events_data_2000.csv,
2,1,3:47:13 h,"Trason, Ann",,USA,1960.0,F,1.0,W40,1.0,...,"18th Across the Years, 24h - 50km Split (USA)",31.12.2000,50km track,"1 (0 M, 1 F)",3:47:13 h,,14093.0,,all_events_data_2000.csv,
3,1,217.200 km,"Bakwin, Peter",,USA,1962.0,M,1.0,M35,1.0,...,"18th Across the Years, 24h (USA)",31.12.2000-01.01.2001,24h track,"42 (27 M, 15 F)",217.200 km,,5356.0,,all_events_data_2000.csv,
4,2,191.600 km,"Runyan, Janet",,USA,1962.0,F,1.0,W35,1.0,...,"18th Across the Years, 24h (USA)",31.12.2000-01.01.2001,24h track,"42 (27 M, 15 F)",217.200 km,,5356.0,,all_events_data_2000.csv,


### Clean data

In [21]:
import pandas as pd

def sample_csv(input_file, output_file, n, random_state=None):
    """
    Write a random subset of a CSV file's rows to a second CSV file.

    The input is loaded fully into memory, ``n`` rows are drawn with
    ``DataFrame.sample``, and the subset is written without the index
    column. A one-line confirmation is printed when the file is saved.

    :param input_file: Path of the CSV file to draw rows from
    :param output_file: Path the sampled rows are written to
    :param n: How many rows to draw
    :param random_state: Optional seed forwarded to ``DataFrame.sample``
    """
    # Load and draw in one expression; the full frame is not needed afterwards.
    subset = pd.read_csv(input_file).sample(n=n, random_state=random_state)

    # index=False keeps the original row labels out of the output file.
    subset.to_csv(output_file, index=False)

    print(f"Sampled {n} rows from {input_file} and saved to {output_file}")

# Usage
# NOTE(review): absolute, machine-specific input path; adjust before running
# on another machine.
input_file = '/Users/riz/Projects/duvdata/output/all_events_data_2024.csv'
output_file = '2024_sample.csv'
sample_size = 500
# Fixed seed so re-running the cell regenerates the same sample file.
sample_seed = 42

sample_csv(input_file, output_file, sample_size, random_state=sample_seed)

Sampled 500 rows from /Users/riz/Projects/duvdata/output/all_events_data_2024.csv and saved to 2024_sample.csv
