In [1]:
import pandas as pd
import numpy as np
import pandera.pandas as pa 

import re
import ast




In [36]:
# Load your data
bios_df = pd.read_csv("../clean_data/cleaned_biodata.csv")
results_df = pd.read_csv('../clean_data/cleaned_results.csv')

affiliation_df = pd.read_csv("../clean_data/dim_affiliation.csv")

In [3]:
bios_df

Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country
0,['Competed in Olympic Games'],Male,['france'],2,Arnaud Boetsch,183.0,76.0,1969-04-01,,True,Meulan,Yvelines,FRA
1,['Competed in Olympic Games'],Male,['france'],1,Jean-François Blanchy,,,1886-12-12,1960-10-02,False,Bordeaux,Gironde,FRA
2,"['Competed in Olympic Games', 'Administrator']",Male,['france'],3,Jean Borotra,183.0,76.0,1898-08-13,1994-07-17,False,Biarritz,Pyrénées-Atlantiques,FRA
3,['Competed in Olympic Games'],Male,['france'],5,Albert Canet,,,1878-04-17,1930-07-25,False,Wandsworth,England,GBR
4,['Competed in Olympic Games'],Male,['france'],4,Jacques Brugnon,168.0,64.0,1895-05-11,1978-03-20,False,Paris VIIIe,Paris,FRA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145560,['Competed in Olympic Games'],Male,['republic of korea'],149217,Sin Ye-Chan,,,1995-06-13,,True,,,
145561,['Competed in Olympic Games'],Male,['cuba'],149219,Carlos García-Ordóñez,,,1927-04-24,2019-11-24,False,La Habana (Havana),Ciudad de La Habana,CUB
145562,['Competed in Olympic Games'],Male,['france'],149225,André Foussard,166.0,,1899-05-19,1986-03-18,False,Niort,Deux-Sèvres,FRA
145563,['Competed in Olympic Games'],Female,['roc'],149223,Valeriya Merkusheva,168.0,65.0,1999-09-20,,True,Moskva (Moscow),Moskva,RUS


In [4]:
bios_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145565 entries, 0 to 145564
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Roles         145565 non-null  object 
 1   Sex           145565 non-null  object 
 2   NOC           145565 non-null  object 
 3   Athlete_Id    145565 non-null  int64  
 4   Name          145565 non-null  object 
 5   Height (cm)   106716 non-null  float64
 6   Weight (kg)   102128 non-null  float64
 7   Born_Date     143763 non-null  object 
 8   Died_Date     33965 non-null   object 
 9   Is_Alive      145565 non-null  bool   
 10  Born_City     112574 non-null  object 
 11  Born_Region   112574 non-null  object 
 12  Born_Country  112574 non-null  object 
dtypes: bool(1), float64(2), int64(1), object(9)
memory usage: 13.5+ MB


In [5]:
bios_df.describe(include='all')

Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country
count,145565,145565,145565,145565.0,145565,106716.0,102128.0,143763,33965,145565,112574,112574,112574
unique,69,2,490,,142845,,,41281,20009,2,24126,2436,224
top,['Competed in Olympic Games'],Male,['united states'],,Ivan Ivanov,,,1931-01-01,2002-01-01,True,?,England,USA
freq,134667,106361,10117,,13,,,63,47,111600,2347,5012,10005
mean,,,,73706.774863,,176.333746,71.889707,,,,,,
std,,,,42870.703583,,10.382117,14.466176,,,,,,
min,,,,1.0,,127.0,25.0,,,,,,
25%,,,,36680.0,,170.0,62.0,,,,,,
50%,,,,73335.0,,176.0,70.0,,,,,,
75%,,,,110354.0,,183.0,80.0,,,,,,


In [6]:
def df_nan_percentage(df):
    return df.isna().mean().mul(100).round(2).sort_values(ascending=False).map(lambda x: f"{x}%")
df_nan_percentage(bios_df)

Died_Date       76.67%
Weight (kg)     29.84%
Height (cm)     26.69%
Born_City       22.66%
Born_Country    22.66%
Born_Region     22.66%
Born_Date        1.24%
Name              0.0%
Sex               0.0%
Roles             0.0%
Athlete_Id        0.0%
NOC               0.0%
Is_Alive          0.0%
dtype: object

In [7]:
# Count how many disciplines per athlete
discipline_counts = results_df.groupby('Athlete_Id')['Discipline'].nunique()

# See if any have more than 1
multi_discipline = discipline_counts[discipline_counts > 1]

print(f"Number of athletes with multiple disciplines: {len(multi_discipline)}")
print(f"Total athletes in results: {len(discipline_counts)}")
print(f"Percentage: {len(multi_discipline) / len(discipline_counts) * 100:.2f}%")

Number of athletes with multiple disciplines: 1032
Total athletes in results: 145561
Percentage: 0.71%


In [8]:
print("Duplicate Athlete_Id in bio_df:", bios_df['Athlete_Id'].duplicated().sum())
print("Total rows in bio_df:", len(bios_df))
print("Unique Athlete_Id:", bios_df['Athlete_Id'].nunique())

Duplicate Athlete_Id in bio_df: 0
Total rows in bio_df: 145565
Unique Athlete_Id: 145565


In [9]:
def impute_height_weight_by_discipline(bio_df, results_df):
    """
    Impute Height (cm) and Weight (kg) using the median of
    Sex + Discipline (primary) and Sex only (fallback).

    Parameters
    ----------
    bio_df      : DataFrame with columns ['Athlete_Id', 'Sex', 'Height (cm)', 'Weight (kg)', ...]
    results_df  : DataFrame with columns ['Athlete_Id', 'Discipline', ...]

    Returns
    -------
    bio_imputed : DataFrame (same shape as bio_df) with imputed values and two flag columns
    """
    bio = bio_df.copy()

    # -------------------------------------------------
    # 1. ONE DISCIPLINE PER ATHLETE (most frequent)
    # -------------------------------------------------
    discipline_map = (
        results_df.groupby('Athlete_Id')['Discipline']
        .apply(lambda s: s.mode().iloc[0] if not s.mode().empty else np.nan)
        .reset_index()
    )
    bio = bio.merge(discipline_map, on='Athlete_Id', how='left')

    # -------------------------------------------------
    # 2. IMPUTATION FLAGS (before any filling)
    # -------------------------------------------------
    bio['Height_Imputed'] = bio['Height (cm)'].isna()
    bio['Weight_Imputed'] = bio['Weight (kg)'].isna()

    # -------------------------------------------------
    # 3. MEDIANS BY Sex + Discipline
    # -------------------------------------------------
    grp = bio.groupby(['Sex', 'Discipline'])
    height_med_sex_disc = grp['Height (cm)'].transform('median')
    weight_med_sex_disc = grp['Weight (kg)'].transform('median')

    # Fill with Sex+Discipline median where possible
    bio['Height (cm)'] = bio['Height (cm)'].fillna(height_med_sex_disc)
    bio['Weight (kg)'] = bio['Weight (kg)'].fillna(weight_med_sex_disc)

    # -------------------------------------------------
    # 4. FALLBACK: MEDIAN BY Sex ONLY
    # -------------------------------------------------
    bio['Height (cm)'] = bio.groupby('Sex')['Height (cm)'].transform(
        lambda x: x.fillna(x.median())
    )
    bio['Weight (kg)'] = bio.groupby('Sex')['Weight (kg)'].transform(
        lambda x: x.fillna(x.median())
    )

    # -------------------------------------------------
    # 5. UPDATE FLAGS FOR FALLBACK IMPUTATIONS
    # -------------------------------------------------
    # Original missing values that are now filled
    orig_height_na = bio_df['Height (cm)'].isna()
    orig_weight_na = bio_df['Weight (kg)'].isna()

    bio['Height_Imputed'] = bio['Height_Imputed'] | (orig_height_na & bio['Height (cm)'].notna())
    bio['Weight_Imputed'] = bio['Weight_Imputed'] | (orig_weight_na & bio['Weight (kg)'].notna())

    # -------------------------------------------------
    # 6. CLEAN-UP
    # -------------------------------------------------
    bio = bio.drop(columns=['Discipline'])

    return bio

In [10]:


# Impute!
bios_df = impute_height_weight_by_discipline(bios_df, results_df)



In [11]:
bios_df.isna().mean().mul(100).round(2).sort_values(ascending=False).map(lambda x: f"{x}%")

Died_Date         76.67%
Born_City         22.66%
Born_Country      22.66%
Born_Region       22.66%
Born_Date          1.24%
Sex                 0.0%
Roles               0.0%
Weight (kg)         0.0%
Height (cm)         0.0%
Name                0.0%
Athlete_Id          0.0%
NOC                 0.0%
Is_Alive            0.0%
Height_Imputed      0.0%
Weight_Imputed      0.0%
dtype: object

In [12]:
def impute_Born_Country_by_NOC(bios_df):
    # clean_bio_df = pd.read_csv('your_bio_df.csv')  # assuming it's already loaded
    bios_df = bios_df.copy()
    # Load the ISO country codes CSV
    iso_df = pd.read_csv('../data/wikipedia-iso-country-codes.csv')

    # Create a mapping from lowercase English short name to Alpha-3 code
    country_to_code = dict(zip(iso_df['English short name lower case'].str.lower(), iso_df['Alpha-3 code']))

    # Function to get the first NOC country code
    def get_noc_code(noc_list):
        noc_list = ast.literal_eval(noc_list)
        if len(noc_list) > 0:
            return country_to_code.get(noc_list[0])
        return np.nan

    # Create a flag column to indicate when Born_Country is derived from NOC
    bios_df['Born_Country_From_NOC'] = False

    # Apply the function only where Born_Country is missing
    mask = bios_df['Born_Country'].isna()
    bios_df.loc[mask, 'Born_Country'] = bios_df.loc[mask, 'NOC'].apply(get_noc_code)
    bios_df.loc[mask, 'Born_Country_From_NOC'] = True
    
    return bios_df

In [13]:
bios_df = impute_Born_Country_by_NOC(bios_df)

country_to_code = dict(zip(iso_df['English short name lower case'].str.lower(), iso_df['Alpha-3 code']))
country_to_code.get('france')


In [14]:
count_none = len(bios_df[(bios_df['Born_Country_From_NOC']==True) & (bios_df['Born_Country'].isna())])
count_not_none = len(bios_df[(bios_df['Born_Country_From_NOC']==True) & (bios_df['Born_Country'].notna())])

print(f"Count of None in Born_Country where derived from NOC: {count_none}")
print(f"Count of Not None in Born_Country where derived from NOC: {count_not_none}")

Count of None in Born_Country where derived from NOC: 6630
Count of Not None in Born_Country where derived from NOC: 26361


In [15]:
bios_df.isna().mean().mul(100).round(2).sort_values(ascending=False).map(lambda x: f"{x}%")

Died_Date                76.67%
Born_City                22.66%
Born_Region              22.66%
Born_Country              4.55%
Born_Date                 1.24%
Roles                      0.0%
NOC                        0.0%
Sex                        0.0%
Weight (kg)                0.0%
Height (cm)                0.0%
Name                       0.0%
Athlete_Id                 0.0%
Is_Alive                   0.0%
Height_Imputed             0.0%
Weight_Imputed             0.0%
Born_Country_From_NOC      0.0%
dtype: object

In [16]:
# bios_df[bios_df['Athlete_Id'].isin([1812, 3997, 6237, 6292, 6317, 6468, 6679, 6759, 6775, 6863, 7015, 7158, 7250, 7673, 8270, 8928, 11272, 11572, 11706, 11779, 12191, 13017, 15259, 17449, 17613, 17635, 19553, 19621, 19626, 20180, 21166, 21750, 22025, 22362, 22368, 23530, 23541, 24096, 24429, 24722, 24749, 24755, 24779, 24787, 24790, 24809, 24817, 25804, 26222, 28041, 30715, 31008, 31912, 38083, 39441, 39442, 39444, 39452, 39457, 39798, 39814, 40973, 40972, 41015, 41298, 42210, 42240, 42660, 46418, 47918, 47978, 50179, 53725, 54682, 54710, 55563, 58663, 58795, 58818, 58907, 60619, 61190, 68201, 72894, 79628, 79765, 79824, 79881, 79900, 79957, 128839])]
bios_df[bios_df['Name'].isin(['Ahmed Salem', 'Nikolaos Kaloudis', 'Georges Dubois'])]



Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country,Height_Imputed,Weight_Imputed,Born_Country_From_NOC
16309,['Competed in Olympic Games'],Male,['egypt'],16427,Ahmed Salem,177.0,70.0,,,True,,,EGY,True,True,True
24613,['Competed in Olympic Games'],Male,['egypt'],24803,Ahmed Salem,178.0,73.0,,,True,,,EGY,True,True,True
25396,['Competed in Olympic Games'],Male,['greece'],25584,Nikolaos Kaloudis,178.0,73.0,1899-01-01,,True,,,GRC,True,True,True
53599,['Competed in Olympic Games'],Male,['greece'],54085,Nikolaos Kaloudis,186.0,86.0,1899-01-01,,True,,,GRC,True,True,True
67689,['Competed in Olympic Games'],Male,['france'],68201,Georges Dubois,180.0,71.0,,1934-01-01,False,,,FRA,True,True,True
79675,['Competed in Olympic Games'],Male,['france'],80315,Georges Dubois,168.0,63.0,,,True,,,FRA,True,True,True
84635,['Competed in Olympic Games'],Male,['switzerland'],85309,Georges Dubois,177.0,77.0,1935-05-19,2018-09-08,False,La Chaux-de-Fonds,Neuchâtel,SUI,False,False,False


In [17]:

# Exact duplicates
dupes = bios_df[bios_df.duplicated()]
print(f"Exact duplicates: {len(dupes)}")

# Potential duplicates (same Name + Born_Date or Athlete_Id)
possible_dupes = bios_df[bios_df.duplicated(subset=['Name', 'Born_Date'], keep=False)]
print(f"possible duplicates: {len(possible_dupes)}")


Exact duplicates: 0
possible duplicates: 8


In [18]:
def is_list_of_strings(x):
    return all(isinstance(r, str) for r in x)

no_duplicate_name_birth = pa.Check(
    lambda df: ~df.duplicated(subset=["Name", "Born_Date"]),
    element_wise=False,
    error="Duplicate athlete records found with same Name and Born_Date."
)

date_logic = pa.Check(
    lambda df: (df["Born_Date"].isna() | df["Died_Date"].isna()) | (df["Died_Date"] >= df["Born_Date"]),
    element_wise=False,
    error="Died_Date earlier than Born_Date."
)


height_weight_logic = pa.Check(
    lambda df: (
        (df["Weight (kg)"] / ((df["Height (cm)"] / 100) ** 2)).between(15, 45)
    ),
    error="Unrealistic height-to-weight ratio."
)

In [19]:


bio_schema = pa.DataFrameSchema(
    { 
        "Athlete_Id": pa.Column(int, pa.Check.ge(1), nullable=False, unique=True), 
        "Name": pa.Column(str, nullable=False), 
        "Sex": pa.Column(str, pa.Check.isin(["Male", "Female"]), nullable=False), 
        "NOC": pa.Column(str, pa.Check(is_list_of_strings, element_wise=False ), nullable=True),

        "Height (cm)": pa.Column(float, pa.Check.between(100,250), nullable=True), 
        "Weight (kg)": pa.Column(float, pa.Check.between(25,200), nullable=True), 
        "Height_Imputed": pa.Column(bool, nullable=False),
        "Weight_Imputed": pa.Column(bool, nullable=False),

        "Born_Date": pa.Column("datetime64[ns]", nullable=True), 
        "Died_Date": pa.Column("datetime64[ns]", nullable=True),
        'Is_Deceased': pa.Column(bool, nullable=False),

        "Born_City": pa.Column(str, nullable=True), 
        "Born_Region": pa.Column(str, nullable=True), 
        "Born_Country": pa.Column(str, pa.Check.str_length(3, 3), nullable=True), 
        "Born_Country_From_NOC": pa.Column(bool, nullable=False),

        "Roles": pa.Column(object,pa.Check(is_list_of_strings, element_wise=False ), nullable=True) 
        
        }, 
        strict=True, 
        coerce=True,
        
        checks=[

            no_duplicate_name_birth,
            date_logic,
            height_weight_logic

            ]

        ) 



try: 
    bio_schema.validate(bios_df, lazy=True) 
    print("Validation PASSED!") 
    
except pa.errors.SchemaErrors as exc: 
    print("Validation FAILED!") 
    error_df = exc.failure_cases
    print(exc)

Validation FAILED!
{
    "SCHEMA": {
        "COLUMN_NOT_IN_SCHEMA": [
            {
                "schema": null,
                "column": null,
                "check": "column_in_schema",
                "error": "column 'Is_Alive' not in DataFrameSchema {'Athlete_Id': <Schema Column(name=Athlete_Id, type=DataType(int64))>, 'Name': <Schema Column(name=Name, type=DataType(str))>, 'Sex': <Schema Column(name=Sex, type=DataType(str))>, 'NOC': <Schema Column(name=NOC, type=DataType(str))>, 'Height (cm)': <Schema Column(name=Height (cm), type=DataType(float64))>, 'Weight (kg)': <Schema Column(name=Weight (kg), type=DataType(float64))>, 'Height_Imputed': <Schema Column(name=Height_Imputed, type=DataType(bool))>, 'Weight_Imputed': <Schema Column(name=Weight_Imputed, type=DataType(bool))>, 'Born_Date': <Schema Column(name=Born_Date, type=DataType(datetime64[ns]))>, 'Died_Date': <Schema Column(name=Died_Date, type=DataType(datetime64[ns]))>, 'Is_Deceased': <Schema Column(name=Is_Deceased, 

In [20]:
error_df

Unnamed: 0,schema_context,column,check,check_number,failure_case,index
0,DataFrameSchema,Roles,Duplicate athlete records found with same Name...,0,['Competed in Olympic Games'],24613
807,DataFrameSchema,Born_City,Unrealistic height-to-weight ratio.,2,?,115052
805,DataFrameSchema,Born_City,Unrealistic height-to-weight ratio.,2,Wuhan,114682
804,DataFrameSchema,Born_City,Unrealistic height-to-weight ratio.,2,Cape Town,106747
803,DataFrameSchema,Born_City,Unrealistic height-to-weight ratio.,2,Moskva (Moscow),106742
...,...,...,...,...,...,...
400,DataFrameSchema,Name,Unrealistic height-to-weight ratio.,2,Carolina Malchair,90571
399,DataFrameSchema,Name,Unrealistic height-to-weight ratio.,2,Emilie Livingston,90534
398,DataFrameSchema,Name,Unrealistic height-to-weight ratio.,2,Miguel Núñez,66381
397,DataFrameSchema,Name,Unrealistic height-to-weight ratio.,2,Hoche Yaya Aden,66354


In [21]:


affiliation_schema = pa.DataFrameSchema(
    { 
        "Affiliation_Id": pa.Column(int, pa.Check.ge(0), nullable=False, unique=True), 

        "Affiliation_Club": pa.Column(str, nullable=True), 
        "Affiliation_City": pa.Column(str, nullable=True), 
        "Affiliation_Country": pa.Column(str, pa.Check.str_length(3, 3), nullable=True), 

        }, 
        strict=True, 
        coerce=True) 


try: 
    
    affiliation_schema.validate(affiliation_df, lazy=True) 
    print("Validation PASSED!") 
    
except pa.errors.SchemaErrors as exc: 
    print("Validation FAILED!") 
    error_df = exc.failure_cases
    print(exc)

Validation FAILED!
{
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": null,
                "column": "Affiliation_Country",
                "check": "str_length(3, 3)",
                "error": "Column 'Affiliation_Country' failed element-wise validator number 0: str_length(3, 3) failure cases: MN, MO, Central Sport Klub Army, HCAW, Oder) (GER, Saale) (GER, Saale) (GER, Oder), Frankfurt (Oder) (GER, Oder) (GER, Alma-Ata, Salamanca, Campinas, CSKA, Tr\u00eas Coroas, Young Men's Christian Association, Vassoras, RJ, Steinfeld) (GER, Westfalen) (GER, Ohm) (GER, SP, Leningrad, 08, Leningrad, Durazno, SCHC, HHIJC, BHMC, EMHC, Tameside, Young Men's Christian Association, Campinas, Cairo, Saale) (GER, Young Men's Christian Association, Young Men's Christian Association, Young Men's Christian Association, Oder) (GER, Flanders, NJ, NJJK, Cairo, Young Men's Christian Association, 96, Central Sport Klub Navy, Oder) (GER, Baden) (GER, 92, 00, DZ&PC, AZ 1870, Centr

In [22]:
bios_df[bios_df['Height_Imputed']==True][['Height (cm)','Height_Imputed','Sex','Name']].sample(10)

Unnamed: 0,Height (cm),Height_Imputed,Sex,Name
23325,180.0,True,Male,Václav Rais
14510,178.0,True,Male,Eduard Gusev
41612,175.0,True,Male,Emil Collan
4810,172.0,True,Male,Gunnar Hansen
18647,177.0,True,Male,Harry Freeman
50560,170.0,True,Female,Eleanor Garatti-Saville
14281,177.0,True,Male,Kristian Frisch
85258,178.0,True,Male,Nigel Gardner
67749,180.0,True,Male,Jean Gérault
31098,168.0,True,Male,P. Gussmann


In [23]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308905 entries, 0 to 308904
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Event       308905 non-null  object 
 1   Team        121932 non-null  object 
 2   Medal       44199 non-null   object 
 3   As          308905 non-null  object 
 4   NOC         308905 non-null  object 
 5   Discipline  308905 non-null  object 
 6   Athlete_Id  308905 non-null  int64  
 7   Game Year   308904 non-null  float64
 8   Game Type   308904 non-null  object 
 9   Position    283633 non-null  float64
 10  Tied        307080 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 25.9+ MB


In [24]:
results_df

Unnamed: 0,Event,Team,Medal,As,NOC,Discipline,Athlete_Id,Game Year,Game Type,Position,Tied
0,"Singles, Men (Olympic)",,,Arnaud Boetsch,FRA,Tennis,2,1996.0,Summer Olympics,17.0,True
1,"Doubles, Men (Olympic)",Guillaume Raoux,,Arnaud Boetsch,FRA,Tennis,2,1996.0,Summer Olympics,17.0,True
2,"Singles, Men (Olympic)",,,Jean-François Blanchy,FRA,Tennis,1,1912.0,Summer Olympics,17.0,True
3,"Doubles, Men (Olympic)",Jean Montariol,,Jean-François Blanchy,FRA,Tennis,1,1912.0,Summer Olympics,,False
4,"Singles, Men (Olympic)",,,Jean-François Blanchy,FRA,Tennis,1,1920.0,Summer Olympics,32.0,True
...,...,...,...,...,...,...,...,...,...,...,...
308900,"Four, Open (Olympic)",Republic of Korea 2,,Shin Ye-Chan,KOR,Bobsleigh (Bobsleigh),149217,2022.0,Winter Olympics,25.0,False
308901,"Basketball, Men (Olympic)",Cuba,,Carlos García-Ordóñez,CUB,Basketball (Basketball),149219,1952.0,Summer Olympics,13.0,True
308902,"400 metres Hurdles, Men (Olympic)",,,André Foussard,FRA,Athletics,149225,1924.0,Summer Olympics,5.0,False
308903,"Ice Hockey, Women (Olympic)",ROC,,Valeriya Merkusheva,ROC,Ice Hockey (Ice Hockey),149223,2022.0,Winter Olympics,5.0,False


In [25]:
df_nan_percentage(results_df)

Medal         85.69%
Team          60.53%
Position       8.18%
Tied           0.59%
Event           0.0%
As              0.0%
NOC             0.0%
Athlete_Id      0.0%
Discipline      0.0%
Game Type       0.0%
Game Year       0.0%
dtype: object

In [None]:
# --- Schema Definition ---
fact_events_schema = pa.DataFrameSchema(
    {

        "Athlete_Id": pa.Column(int, pa.Check.ge(1), nullable=False),
        "As":   pa.Column(str, nullable=False),
        "NOC": pa.Column(str, pa.Check.str_length(3, 3), nullable=False),
        "Discipline": pa.Column(str, nullable=False),
        "Game Type": pa.Column(str, nullable=True),
        "Game Year": pa.Column(float, pa.Check.between(1896, 2024), nullable=True),


        "Event": pa.Column(str, nullable=True),
        "Team": pa.Column(str, nullable=True),
        "Tied": pa.Column(bool, nullable=True),


        "Position": pa.Column(float,pa.Check.ge(1), nullable=True),
        "Medal": pa.Column(
            str,
            pa.Check.isin(["Gold", "Silver", "Bronze"]),
            nullable=True
        ),

    },
    strict=True,
    coerce=True,
    # checks=[

    # ],
    name="fact_events_schema"
)

# --- Validation ---
try:
    fact_events_schema.validate(results_df, lazy=True)
    print("Validation PASSED!")
except pa.errors.SchemaErrors as exc:
    print("Validation FAILED!")
    error_df = exc.failure_cases
    print(exc)

Validation PASSED!


In [None]:
def rename_df_columns(df, names_dict):
    df = df.copy()
    df = df.rename(columns= names_dict)
    return df

In [28]:
events_fact_table_columns_names =\
{
    "Athlete_Id": "athlete_id",
    "As": "dim_as",
    "NOC": "dim_noc",
    "Discipline": "dim_discipline",
    "Game Type": "dim_game_type",
    "Game Year": "dim_game_year",
    "Event": "d_event_name",
    "Team": "d_team_name",
    "Tied": "m_tied_flag",
    "Position":"m_position",
    "Medal":"m_medal"

}

athlete_dim_table_columns_names =\
{
    "Athlete_Id": "athlete_id",
    "Roles": "athlete_roles",
    "Sex": "athlete_sex",
    "NOC": "athlete_NOC",
    "Name": "athlete_name",
    "Height (cm)": "athlete_height_cm",
    "Weight (kg)": "athlete_weight_kg",
    "Born_Date": "athlete_born_date",
    "Died_Date":"athlete_died_date",
    "Is_Alive":"athlete_is_alive",
    "Born_City":"athlete_born_city",
    "Born_Region":"athlete_born_region",
    "Born_Country":"athlete_born_country",
    "Height_Imputed":"athlete_is_height_imputed",
    "Weight_Imputed":"athlete_is_weight_imputed",
    "Born_Country_From_NOC":"athlete_is_born_country_from_NOC"

}

In [None]:


rename_df_columns(results_df, events_fact_table_columns_names)

Unnamed: 0,d_event_name,d_team_name,m_medal,dim_as,dim_noc,dim_discipline,athlete_id,dim_game_year,dim_game_type,m_position,m_tied_flag
0,"Singles, Men (Olympic)",,,Arnaud Boetsch,FRA,Tennis,2,1996.0,Summer Olympics,17.0,True
1,"Doubles, Men (Olympic)",Guillaume Raoux,,Arnaud Boetsch,FRA,Tennis,2,1996.0,Summer Olympics,17.0,True
2,"Singles, Men (Olympic)",,,Jean-François Blanchy,FRA,Tennis,1,1912.0,Summer Olympics,17.0,True
3,"Doubles, Men (Olympic)",Jean Montariol,,Jean-François Blanchy,FRA,Tennis,1,1912.0,Summer Olympics,,False
4,"Singles, Men (Olympic)",,,Jean-François Blanchy,FRA,Tennis,1,1920.0,Summer Olympics,32.0,True
...,...,...,...,...,...,...,...,...,...,...,...
308900,"Four, Open (Olympic)",Republic of Korea 2,,Shin Ye-Chan,KOR,Bobsleigh (Bobsleigh),149217,2022.0,Winter Olympics,25.0,False
308901,"Basketball, Men (Olympic)",Cuba,,Carlos García-Ordóñez,CUB,Basketball (Basketball),149219,1952.0,Summer Olympics,13.0,True
308902,"400 metres Hurdles, Men (Olympic)",,,André Foussard,FRA,Athletics,149225,1924.0,Summer Olympics,5.0,False
308903,"Ice Hockey, Women (Olympic)",ROC,,Valeriya Merkusheva,ROC,Ice Hockey (Ice Hockey),149223,2022.0,Winter Olympics,5.0,False


In [30]:

rename_df_columns(bios_df, athlete_dim_table_columns_names)

Unnamed: 0,athlete_roles,athlete_sex,athlete_NOC,athlete_id,athlete_name,athlete_height_cm,athlete_weight_kg,athlete_born_date,athlete_died_date,athlete_is_alive,athlete_born_city,athlete_born_region,athlete_born_country,athlete_is_height_imputed,athlete_is_weight_imputed,athlete_is_born_country_from_NOC
0,['Competed in Olympic Games'],Male,['france'],2,Arnaud Boetsch,183.0,76.0,1969-04-01,,True,Meulan,Yvelines,FRA,False,False,False
1,['Competed in Olympic Games'],Male,['france'],1,Jean-François Blanchy,185.0,79.0,1886-12-12,1960-10-02,False,Bordeaux,Gironde,FRA,True,True,False
2,"['Competed in Olympic Games', 'Administrator']",Male,['france'],3,Jean Borotra,183.0,76.0,1898-08-13,1994-07-17,False,Biarritz,Pyrénées-Atlantiques,FRA,False,False,False
3,['Competed in Olympic Games'],Male,['france'],5,Albert Canet,185.0,79.0,1878-04-17,1930-07-25,False,Wandsworth,England,GBR,True,True,False
4,['Competed in Olympic Games'],Male,['france'],4,Jacques Brugnon,168.0,64.0,1895-05-11,1978-03-20,False,Paris VIIIe,Paris,FRA,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145560,['Competed in Olympic Games'],Male,['republic of korea'],149217,Sin Ye-Chan,182.0,90.0,1995-06-13,,True,,,,True,True,True
145561,['Competed in Olympic Games'],Male,['cuba'],149219,Carlos García-Ordóñez,194.0,90.0,1927-04-24,2019-11-24,False,La Habana (Havana),Ciudad de La Habana,CUB,True,True,False
145562,['Competed in Olympic Games'],Male,['france'],149225,André Foussard,166.0,71.0,1899-05-19,1986-03-18,False,Niort,Deux-Sèvres,FRA,False,True,False
145563,['Competed in Olympic Games'],Female,['roc'],149223,Valeriya Merkusheva,168.0,65.0,1999-09-20,,True,Moskva (Moscow),Moskva,RUS,False,False,False


Validation PASSED!
