In [11]:
import pandas as pd
import os
import re

# Function to sanitize filenames by replacing invalid characters with underscores
def sanitize_filename(name):
    """
    Make a string safe to embed in a filename.

    Each character that is neither a regex word character (alphanumeric or
    underscore) nor a hyphen is swapped for a single underscore, so the
    result has the same length as the input.

    Parameters:
        name (str): Text to clean up, e.g. a country or sport name.

    Returns:
        str: ``name`` with every disallowed character replaced by "_".
    """
    disallowed = re.compile(r'[^\w\-]')
    return disallowed.sub('_', name)

# Country-name standardization table: alias -> canonical name.
# Written grouped by canonical name and flattened into a plain dict, so adding
# another alias only means appending it to the relevant tuple.
country_mapping = {
    alias: canonical
    for canonical, aliases in (
        ("United States", ("USA", "U.S.A.")),
        ("United Kingdom", ("Great Britain",)),
        ("Russia", ("Soviet Union", "Ain")),
    )
    for alias in aliases
}

# Load the data into a DataFrame with explicit column names.
# NOTE(review): header=None makes pandas treat the first line of the file as a
# data record and label the columns with `names`. If the CSV actually starts
# with a header row, that row ends up in the data as a bogus record -- confirm
# against the file, and switch to header=0 if so.
file_path = "summerOly_athletes.csv"  # Replace with your actual file path
try:
    data = pd.read_csv(file_path, header=None, names=[
        "Athlete", "Gender", "Country", "Country Code", "Year", "City",
        "Sport", "Event", "Medal"
    ])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' does not exist.")
    # Raise SystemExit directly instead of calling exit(): the exit() helper
    # is installed by the `site` module for interactive sessions and is not
    # guaranteed to be available in every runtime.
    raise SystemExit(1)

# Normalize and map country names to ensure consistency.
# Names are stripped and title-cased first, so the mapping keys are passed
# through the same normalization before the lookup. Without this, an all-caps
# key such as "USA" could never match, because the data value has already
# become "Usa" by the time the mapping is applied.
data["Country"] = data["Country"].str.strip().str.title()
data["Country"] = data["Country"].replace(
    {alias.strip().title(): canonical for alias, canonical in country_mapping.items()}
)

# Create a directory to save the output files
output_dir = "processed_olympics_data"
os.makedirs(output_dir, exist_ok=True)

# Paths that could not be written, so the closing message reflects reality.
failed_paths = []

# Write one CSV per (country, sport, gender) combination.
# NOTE: groupby leaves out rows whose key columns are missing (NaN) by default.
for (country, sport, gender), group in data.groupby(["Country", "Sport", "Gender"]):
    # Sanitize country and sport names to avoid invalid characters
    country_sanitized = sanitize_filename(country)
    sport_sanitized = sanitize_filename(sport)
    # NOTE(review): every Gender value other than "M" is labeled "womens";
    # confirm the column only ever holds "M" and one other value.
    gender_label = "mens" if gender == "M" else "womens"

    # Construct the filename
    filename = f"{country_sanitized}_{sport_sanitized}_{gender_label}.csv"
    filepath = os.path.join(output_dir, filename)

    # Flag athletes that appear on more than one row of this group. assign()
    # builds a new frame instead of writing into the sub-frame handed out by
    # groupby, which can raise SettingWithCopyWarning on some pandas versions.
    labeled = group.assign(
        **{"Multiple Events": group["Athlete"].duplicated(keep=False)}
    )

    try:
        # Save the group to its CSV file
        labeled.to_csv(filepath, index=False)
    except OSError as e:
        print(f"Error saving file '{filepath}': {e}")
        failed_paths.append(filepath)

# Only claim success when every file was actually written.
if failed_paths:
    print(
        f"Finished with errors: {len(failed_paths)} file(s) could not be "
        f"saved to the '{output_dir}' directory."
    )
else:
    print(f"Processed data has been saved to the '{output_dir}' directory.")


  data = pd.read_csv(file_path, header=None, names=[


Processed data has been saved to the 'processed_olympics_data' directory.
