In [6]:
import pandas as pd
import os
import re

# Function to sanitize filenames by replacing invalid characters with underscores
def sanitize_filename(name):
    """
    Replace any character that is not a word character, hyphen, or underscore with an underscore.
    """
    return re.sub(r'[^\w\-]', '_', name)

# Define a mapping for country name standardization
country_mapping = {
    "USA": "United States",
    "U.S.A.": "United States",
    "Great Britain": "United Kingdom",
    "Soviet Union": "Russia",
    "Ain": "Russia"
}

# Define sport-discipline-code mapping
sport_discipline_code_mapping = {
    ("Aquatics", "Artistic Swimming"): "SWA",
    ("Aquatics", "Diving"): "DIV",
    ("Aquatics", "Marathon Swimming"): "OWS",
    ("Aquatics", "Swimming"): "SWM",
    ("Aquatics", "Water Polo"): "WPO",
    ("Archery", "Archery"): "ARC",
    ("Athletics", "Athletics"): "ATH",
    ("Badminton", "Badminton"): "BDM",
    ("Baseball", "Baseball"): "BSB",
    ("Softball", "Softball"): "SBL",
    ("Basketball", "3x3"): "BK3",
    ("Basketball", "Basketball"): "BKB",
    ("Basque Pelota", "Basque Pelota"): "PEL",
    ("Boxing", "Boxing"): "BOX",
    ("Breaking", "Breaking"): "BKG",
    ("Canoeing", "Sprint"): "CSP",
    ("Canoeing", "Slalom"): "CSL",
    ("Cricket", "Cricket"): "CKT",
    ("Croquet", "Croquet"): "CQT",
    ("Cycling", "BMX Freestyle"): "BMF",
    ("Cycling", "BMX Racing"): "BMX",
    ("Cycling", "Mountain Bike"): "MTB",
    ("Cycling", "Road"): "CRD",
    ("Cycling", "Track"): "CTR",
    ("Equestrian", "Dressage"): "EDR",
    ("Equestrian", "Eventing"): "EVE",
    ("Equestrian", "Jumping"): "EJP",
    ("Equestrian", "Vaulting"): "EVL",
    ("Equestrian", "Driving"): "EDV",
    ("Fencing", "Fencing"): "FEN",
    ("Field Hockey", "Field Hockey"): "HOC",
    ("Flag Football", "Flag Football"): "AFB",
    ("Football", "Football"): "FBL",
    ("Golf", "Golf"): "GLF",
    ("Gymnastics", "Artistic"): "GAR",
    ("Gymnastics", "Rhythmic"): "GRY",
    ("Gymnastics", "Trampoline"): "GTR",
    ("Handball", "Indoor"): "HBL",
    ("Handball", "Field"): "HBL",
    ("Judo", "Judo"): "JUD",
    ("Karate", "Karate"): "KTE",
    ("Lacrosse", "Sixes"): "LAX",
    ("Lacrosse", "Field"): "LAX",
    ("Modern Pentathlon", "Modern Pentathlon"): "MPN",
    ("Polo", "Polo"): "POL",
    ("Rackets", "Racquets"): "RQT",
    ("Roque", "Roque"): "ROQ",
    ("Rowing", "Coastal"): "ROC",
    ("Rowing", "Rowing"): "ROW",
    ("Rugby", "Sevens"): "RU7",
    ("Rugby", "Union"): "RUG",
    ("Sailing", "Sailing"): "SAL",
    ("Shooting", "Shooting"): "SHO",
    ("Skateboarding", "Skateboarding"): "SKB",
    ("Sport Climbing", "Sport Climbing"): "CLB",
    ("Squash", "Squash"): "SQU",
    ("Surfing", "Surfing"): "SRF",
    ("Table Tennis", "Table Tennis"): "TTE",
    ("Taekwondo", "Taekwondo"): "TKW",
    ("Tennis", "Tennis"): "TEN",
    ("Triathlon", "Triathlon"): "TRI",
    ("Tug of War", "Tug of War"): "TOW",
    ("Beach Volleyball", "Beach Volleyball"): "VBV",
    ("Indoor Volleyball", "Indoor Volleyball"): "VVO",
    ("Power Boat Racing", "Power Boat Racing"): "PBT",
    ("Water Motorsports", "Power Boat Racing"): "PBT",
    ("Weightlifting", "Weightlifting"): "WLF",
    ("Wrestling", "Freestyle"): "WRF",
    ("Wrestling", "Greco-Roman"): "WRG",
    ("Skating", "Figure"): "FSK",
    ("Figure Skating", "Figure Skating"): "FSK",
    ("Ice Hockey", "Ice Hockey"): "IHO",
    ("Jeu De Paume", "Jeu De Paume"): "JDP",
    ("Art Competitions", "Art Competitions"): "ART",
    ("Alpinism", "Alpinism"): "ALP",
    ("Aeronautics", "Aeronautics"): "AER",
    ('3x3 Basketball', 'Men'): "BK3",
    ('3x3 Basketball', 'Women'): "BK3",
    ('3x3 Basketball, Basketball', 'Women'): "BK3",
    ('Archery', "Men's"): "ARC",
    ('Archery', 'Mixed'): "ARC",
    ('Archery', "Women's"): "ARC",
    ('Art Competitions', 'Art'): "ART",
    ('Artistic Gymnastics', 'Men'): "GAR",
    ('Artistic Gymnastics', "Men's"): "GAR",
    ('Artistic Gymnastics', 'Women'): "GAR",
    ('Artistic Gymnastics', "Women's"): "GAR",
    ('Artistic Swimming', 'Duet'): "SWA",
    ('Artistic Swimming', 'Team'): "SWA",
    ('Athletics', '4'): "ATH",
    ('Athletics', 'Marathon'): "ATH",
    ('Athletics', "Men's"): "ATH",
    ('Athletics', "Women's"): "ATH",
    ('Badminton', "Men's"): "BDM",
    ('Badminton', 'Mixed'): "BDM",
    ('Badminton', "Women's"): "BDM",
    ('Baseball/Softball', 'Baseball'): "BSB",
    ('Baseball/Softball', 'Softball'): "SBL",
    ('Basketball', 'Men'): "BKB",
    ('Basketball', 'Women'): "BKB",
    ('Basque Pelota', 'Basque'): "PEL",
    ('Beach Volleyball', 'Beach'): "VBV",
    ('Beach Volleyball', 'Men'): "VBV",
    ('Beach Volleyball', 'Women'): "VBV",
    ('Boxing', "Men's"): "BOX",
    ('Boxing', "Women's"): "BOX",
    ('Breaking', 'B-Boys'): "BKG",
    ('Breaking', 'B-Girls'): "BKG",
    ('Canoe Slalom', "Men's"): "CSL",
    ('Canoe Slalom', "Women's"): "CSL",
    ('Canoe Sprint', "Men's"): "CSP",
    ('Canoe Sprint', "Women's"): "CSP",
    ('Canoeing', 'Canoeing'): "CSL",  # Defaulting to Slalom
    ('Cycling', 'Cycling'): "CTR",  # Defaulting to Track
    ('Cycling BMX Freestyle', "Men's"): "BMF",
    ('Cycling BMX Freestyle', "Women's"): "BMF",
    ('Cycling BMX Racing', 'Men'): "BMX",
    ('Cycling BMX Racing', 'Women'): "BMX",
    ('Cycling Mountain Bike', "Men's"): "MTB",
    ('Cycling Mountain Bike', "Women's"): "MTB",
    ('Cycling Road', "Men's"): "CRD",
    ('Cycling Road', "Women's"): "CRD",
    ('Cycling Road, Cycling Mountain Bike', "Men's"): "MTB",
    ('Cycling Road, Cycling Mountain Bike', "Women's"): "MTB",
    ('Cycling Road, Cycling Track', "Men's"): "CTR",
    ('Cycling Road, Cycling Track', "Women's"): "CTR",
    ('Cycling Road, Triathlon', "Women's"): "TRI",
    ('Cycling Track', "Men's"): "CTR",
    ('Cycling Track', "Women's"): "CTR",
    ('Diving', 'Diving'): "DIV",
    ('Diving', "Men's"): "DIV",
    ('Diving', "Women's"): "DIV",
    ('Equestrianism', 'Equestrianism'): "EVE",  # Defaulting to Eventing
    ('Fencing', "Men's"): "FEN",
    ('Fencing', "Women's"): "FEN",
    ('Figure Skating', 'Figure'): "FSK",
    ('Football', 'Men'): "FBL",
    ('Football', 'Women'): "FBL",
    ('Golf', "Men's"): "GLF",
    ('Golf', "Women's"): "GLF",
    ('Gymnastics', 'Gymnastics'): "GAR",  # Defaulting to Artistic
    ('Handball', 'Handball'): "HBL",
    ('Handball', 'Men'): "HBL",
    ('Handball', 'Women'): "HBL",
    ('Hockey', 'Hockey'): "HOC",
    ('Hockey', 'Men'): "HOC",
    ('Hockey', 'Women'): "HOC",
    ('Ice Hockey', 'Ice'): "IHO",
    ('Jeu De Paume', 'Jeu'): "JDP",
    ('Judo', 'Men'): "JUD",
    ('Judo', 'Mixed'): "JUD",
    ('Judo', 'Women'): "JUD",
    ('Karate', "Men's"): "KTE",
    ('Karate', "Women's"): "KTE",
    ('Lacrosse', 'Lacrosse'): "LAX",
    ('Marathon Swimming', "Men's"): "OWS",
    ('Marathon Swimming', "Women's"): "OWS",
    ('Marathon Swimming, Swimming', "Men's"): "SWM",
    ('Marathon Swimming, Swimming', "Women's"): "SWM",
    ('Modern Pentathlon', "Men's"): "MPN",
    ('Modern Pentathlon', 'Modern'): "MPN",
    ('Modern Pentathlon', "Women's"): "MPN",
    ('Motorboating', 'Motorboating'): "PBT",
    ('Racquets', 'Racquets'): "RQT",
    ('Rhythmic Gymnastics', 'Group'): "GRY",
    ('Rhythmic Gymnastics', 'Individual'): "GRY",
    ('Rhythmic Gymnastics', 'Rhythmic'): "GRY",
    ('Rowing', 'Lightweight'): "ROW",
    ('Rowing', "Men's"): "ROW",
    ('Rowing', "Women's"): "ROW",
    ('Rugby', 'Rugby'): "RUG",
    ('Rugby Sevens', 'Men'): "RU7",
    ('Rugby Sevens', 'Rugby'): "RU7",
    ('Rugby Sevens', 'Women'): "RU7",
    ('Sailing', "Men's"): "SAL",
    ('Sailing', 'Mixed'): "SAL",
    ('Sailing', "Women's"): "SAL",
    ('Shooting', '10m'): "SHO",
    ('Shooting', '25m'): "SHO",
    ('Shooting', '50m'): "SHO",
    ('Shooting', 'Skeet'): "SHO",
    ('Shooting', 'Trap'): "SHO",
    ('Skateboarding', "Men's"): "SKB",
    ('Skateboarding', "Women's"): "SKB",
    ('Sport', 'Event'): "UNK",
    ('Sport Climbing', "Men's"): "CLB",
    ('Sport Climbing', "Women's"): "CLB",
    ('Surfing', 'Men'): "SRF",
    ('Surfing', 'Women'): "SRF",
    ('Swimming', "Men's"): "SWM",
    ('Swimming', 'Mixed'): "SWM",
    ('Swimming', 'Relay'): "SWM",
    ('Swimming', 'Swimming'): "SWM",
    ('Swimming', "Women's"): "SWM",
    ('Synchronized Swimming', 'Synchronized'): "SWA",
    ('Table Tennis', "Men's"): "TTE",
    ('Table Tennis', 'Mixed'): "TTE",
    ('Table Tennis', 'Table'): "TTE",
    ('Table Tennis', "Women's"): "TTE",
    ('Taekwondo', 'Men'): "TKW",
    ('Taekwondo', 'Women'): "TKW",
    ('Tennis', "Men's"): "TEN",
    ('Tennis', 'Mixed'): "TEN",
    ('Tennis', "Women's"): "TEN",
    ('Trampoline Gymnastics', 'Men'): "GTR",
    ('Trampoline Gymnastics', 'Women'): "GTR",
    ('Trampolining', 'Trampolining'): "GTR",
    ('Triathlon', "Men's"): "TRI",
    ('Triathlon', 'Mixed'): "TRI",
    ('Triathlon', "Women's"): "TRI",
    ('Tug-Of-War', 'Tug-Of-War'): "TOW",
    ('Volleyball', 'Men'): "VVO",
    ('Volleyball', 'Volleyball'): "VVO",
    ('Volleyball', 'Women'): "VVO",
    ('Water Polo', 'Men'): "WPO",
    ('Water Polo', 'Water'): "WPO",
    ('Water Polo', 'Women'): "WPO",
    ('Weightlifting', "Men's"): "WLF",
    ('Weightlifting', "Women's"): "WLF",
    ('Wrestling', "Men's"): "WRF",
    ('Wrestling', "Women's"): "WRF",
    ('Wrestling', 'Wrestling'): "WRF"
}

# Load the data
file_path = "summerOly_athletes.csv" 
try:
    data = pd.read_csv(file_path, header=None, names=[
        "Athlete", "Gender", "Country", "Country Code", "Year", "City", 
        "Sport", "Event", "Medal"
    ])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' does not exist.")
    exit(1)

# Extract Discipline from Event (simplified assumption: first word of Event is the discipline)
data["Discipline"] = data["Event"].str.split(' ').str[0].str.strip()

# Normalize and map country names to ensure consistency
data["Country"] = data["Country"].str.strip().str.title()
data["Country"] = data["Country"].replace(country_mapping)

# Set to track missing sport-discipline combinations
missing_combinations = set()

# Map sport and discipline to codes
def map_event_code(row):
    key = (row["Sport"], row["Discipline"])
    if key not in sport_discipline_code_mapping:
        missing_combinations.add(key)
        return None
    return sport_discipline_code_mapping[key]

data["Event Code"] = data.apply(map_event_code, axis=1)

# Create a directory to save the output files
output_dir = "processed_olympics_data"
os.makedirs(output_dir, exist_ok=True)

# Process data by grouping
for (country, event_code, gender), group in data.groupby(["Country", "Event Code", "Gender"]):
    if pd.isnull(event_code):  # Skip groups with missing event codes
        continue

    # Sanitize country and event code for filenames
    country_sanitized = sanitize_filename(country)
    event_code_sanitized = sanitize_filename(event_code)
    gender_label = "mens" if gender == "M" else "womens"

    # Construct the filename
    filename = f"{country_sanitized}_{event_code_sanitized}_{gender_label}.csv"
    filepath = os.path.join(output_dir, filename)

    # Add meta-label for athletes competing multiple times
    group["Multiple Events"] = group["Athlete"].duplicated(keep=False)

    try:
        # Save the group to its CSV file
        group.to_csv(filepath, index=False)
    except OSError as e:
        print(f"Error saving file '{filepath}': {e}")

# Print missing combinations
if missing_combinations:
    print("The following sport-discipline combinations are missing from the mapping:")
    for combination in sorted(missing_combinations):
        print(combination)

print(f"Processed data has been saved to the '{output_dir}' directory.")


  data = pd.read_csv(file_path, header=None, names=[


Processed data has been saved to the 'processed_olympics_data' directory.
