In [1]:
import pandas as pd
import pandera.pandas as pa

In [None]:

def df_nan_percentage(df):
    return df.isna().mean().mul(100).round(2).sort_values(ascending=False).map(lambda x: f"{x}%")

def is_list_of_strings(x):
    return all(isinstance(r, str) for r in x)

no_duplicate_name_birth = pa.Check(
    lambda df: ~df.duplicated(subset=["Name", "Born_Date"]),
    element_wise=False,
    error="Duplicate athlete records found with same Name and Born_Date."
)

date_logic = pa.Check(
    lambda df: (df["Born_Date"].isna() | df["Died_Date"].isna()) | (df["Died_Date"] >= df["Born_Date"]),
    element_wise=False,
    error="Died_Date earlier than Born_Date."
)


height_weight_logic = pa.Check(
    lambda df: (
        (df["Weight (kg)"] / ((df["Height (cm)"] / 100) ** 2)).between(15, 45)
    ),
    error="Unrealistic height-to-weight ratio."
)

# --- Schema Definition ---

bios_schema = pa.DataFrameSchema(
    { 
        "Athlete_Id": pa.Column(pd.Int64Dtype, pa.Check.ge(1), nullable=False, unique=True), 
        "Name": pa.Column(str, nullable=False), 
        "Sex": pa.Column(str, pa.Check.isin(["Male", "Female"]), nullable=False), 
        "NOC": pa.Column(str, pa.Check(is_list_of_strings, element_wise=False ), nullable=True),

        "Height (cm)": pa.Column(float, pa.Check.between(100,250), nullable=True), 
        "Weight (kg)": pa.Column(float, pa.Check.between(25,200), nullable=True), 
        "Height_Imputed": pa.Column(bool, nullable=False),
        "Weight_Imputed": pa.Column(bool, nullable=False),

        "Born_Date": pa.Column("datetime64[ns]", nullable=True), 
        "Died_Date": pa.Column("datetime64[ns]", nullable=True),
        'Is_Alive': pa.Column(bool, nullable=False),

        "Born_City": pa.Column(str, nullable=True), 
        "Born_Region": pa.Column(str, nullable=True), 
        "Born_Country": pa.Column(str, pa.Check.str_length(3, 3), nullable=True), 
        "Born_Country_From_NOC": pa.Column(bool, nullable=False),

        "Roles": pa.Column(object,pa.Check(is_list_of_strings, element_wise=False ), nullable=True) 
        
        }, 
        strict=True, 
        coerce=True,
        
        checks=[

            no_duplicate_name_birth,
            date_logic,
            height_weight_logic

            ]

        ) 

In [None]:

# duplicate_affiliation_content_check = pa.Check(
#     lambda df: not df.duplicated(subset=["Affiliation_Club", "Affiliation_City", "Affiliation_Country"]).any(),
#     element_wise=False,
#     error="Duplicate affiliations found with different Affiliation_Ids (same club, city, and country)."
# )

duplicate_affiliation_content_check = pa.Check(
    lambda df: ~df.duplicated(subset=["Affiliation_Club"]),
    element_wise=False,
    error="Duplicate affiliations found with different Affiliation_Ids (same club, city, and country)."
)

affiliations_schema = pa.DataFrameSchema(
    { 
        "Affiliation_Id": pa.Column(pd.Int64Dtype, pa.Check.ge(0), nullable=False, unique=True), 

        "Affiliation_Club": pa.Column(str, nullable=True), 
        "Affiliation_City": pa.Column(str, nullable=True), 
        "Affiliation_Country": pa.Column(str, pa.Check.str_length(3, 3), nullable=True), 

        }, 

        strict=True, 
        coerce=True,
        
        checks=[
            duplicate_affiliation_content_check
            ]
        ) 



medal_position_logic_check = pa.Check(
    lambda df: (
        df["Medal"].isna() | df["Position"].isna() | df["Position"] < 3
    ),
    error="Medal assigned to invalid position (must be ≤ 3)."
)

position_medal_match_check = pa.Check(
    lambda df: (
        df["Position"].isna()
        | (
            ((df["Position"] == 1) & (df["Medal"] == "Gold"))
            | ((df["Position"] == 2) & (df["Medal"] == "Silver"))
            | ((df["Position"] == 3) & (df["Medal"] == "Bronze"))
            | (df["Position"] > 3) & (df["Medal"].isna())
        )
    ),
    error="Position–Medal mismatch: check if medal corresponds to rank."
)

# --- Schema Definition ---

results_schema = pa.DataFrameSchema(
    {

        "Athlete_Id": pa.Column(pd.Int64Dtype, pa.Check.ge(1), nullable=False),
        "As":   pa.Column(str, nullable=False),
        "NOC": pa.Column(str, pa.Check.str_length(3, 3), nullable=False),
        "Discipline": pa.Column(str, nullable=False),
        "Game Type": pa.Column(str, nullable=True),
        "Game Year": pa.Column(pd.Int64Dtype, pa.Check.between(1850, 2024), nullable=True),

        "Event": pa.Column(str, nullable=True),
        "Team": pa.Column(str, nullable=True),
        "Tied": pa.Column(bool, nullable=True),

        "Position": pa.Column(pd.Int64Dtype,pa.Check.ge(1), nullable=True),
        "Medal": pa.Column(
            str,
            pa.Check.isin(["Gold", "Silver", "Bronze"]),
            nullable=True
        ),

    },
    strict=True,
    coerce=True,
    checks=[
        medal_position_logic_check,
        position_medal_match_check

    ],

)


In [None]:

game_types_list= ['Olympic Games', 'Intercalated Games', 'Youth Olympic Games','Forerunners to the Olympic Games']
edition_names_list=['Summer', 'Winter', 'Equestrian']


# no_duplicate_games_check = pa.Check(
#     lambda df: not df.duplicated(subset=["Year", "Edition_Name", "Game_Type"]).any(),
#     element_wise=False,
#     error="Duplicate game editions detected based on Year, Edition_Name, and Game_Type."
# )

no_duplicate_games_check = pa.Check(
    lambda df: ~df.duplicated(subset=["Year", "Edition_Name", "Game_Type"]),
    element_wise=False,
    error="Duplicate game editions detected based on Year, Edition_Name, and Game_Type."
)

edition_date_check = pa.Check(
    lambda df: (
        # Opened <= Closed  (or either missing)
        df["Opened"].isna() | df["Closed"].isna() | (df["Opened"] <= df["Closed"])
    ),
    
    error="Chronological order violated: check Opened, Closed edition dates."
)

Competition_date_check = pa.Check(
    lambda df: (

        # Competition_Start <= Competition_End  (or either missing)
        df["Competition_Start"].isna() | df["Competition_End"].isna() | (df["Competition_Start"] <= df["Competition_End"])
        
    ),

    error="Chronological order violated:check Start, End Competition dates."
)

# edition_Competition_date_check = pa.Check(
#     lambda df: (

#         # Opened <= Competition_Start  (or either missing)
#         df["Opened"].isna() | df["Competition_Start"].isna() | (df["Opened"] <= df["Competition_Start"])
#     ),
#     element_wise=False,
#     error="Chronological order violated: check Opened, Competition Start dates."
# )


# --- Schema Definition ---
editions_schema = pa.DataFrameSchema(
    {   
        "game_id": pa.Column(pd.Int64Dtype, pa.Check.ge(1), nullable=False),
        "Year": pa.Column(pd.Int64Dtype, pa.Check.between(1850, 2024), nullable=False),
        "Game_Type": pa.Column(str,pa.Check.isin(game_types_list), nullable=False),
        "Edition_Name": pa.Column(str,pa.Check.isin(edition_names_list), nullable=True),

        "City": pa.Column(str, nullable=False),
        "Country": pa.Column(str, nullable=False),

        "Opened": pa.Column("datetime64[ns]", nullable=True),
        "Closed": pa.Column("datetime64[ns]", nullable=True),

        "Competition_Start": pa.Column("datetime64[ns]", nullable=True),
        "Competition_End": pa.Column("datetime64[ns]", nullable=True),

        "Comments": pa.Column(str, nullable=True),
        
        "Opened_Imputed": pa.Column(bool, nullable=False),	
        "Closed_Imputed": pa.Column(bool, nullable=False),
        "Competition_Start_Imputed": pa.Column(bool, nullable=False),
        "Competition_End_Imputed": pa.Column(bool, nullable=False),

    },
    strict=True,
    coerce=True,
    checks=[
        no_duplicate_games_check,
        edition_date_check,
        Competition_date_check,
        # edition_Competition_date_check      # ensure chronological order

    ],
    name="games_schema"
)




In [5]:
# Load your data
bios_df = pd.read_csv("../clean_data_II/cleaned_biodata.csv")
results_df = pd.read_csv('../clean_data/cleaned_results.csv')
editions_df = pd.read_csv('../clean_data_II/cleaned_editions.csv')
affiliation_df = pd.read_csv("../clean_data/dim_affiliation.csv")



In [None]:


try: 
    bios_schema.validate(bios_df, lazy=True) 
    print("Validation PASSED!") 
    
except pa.errors.SchemaErrors as exc: 
    print("Validation FAILED!") 
    bios_error_df = exc.failure_cases
    print(exc)




Validation FAILED!
{
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": null,
                "column": null,
                "check": "Duplicate athlete records found with same Name and Born_Date.",
                "error": "DataFrameSchema 'None' failed element-wise validator number 0: <Check <lambda>: Duplicate athlete records found with same Name and Born_Date.> failure cases: ['Competed in Olympic Games'], ['Competed in Olympic Games'], ['Competed in Olympic Games'], ['Competed in Olympic Games'], Male, Male, Male, Male, ['egypt'], ['greece'], ['france'], ['france'], 24803, 54085, 80315, 80349, Ahmed Salem, Nikolaos Kaloudis, Georges Dubois,  Favier, 178.0, 186.0, 168.0, 168.0, 73.0, 86.0, 63.0, 63.0, 1899-01-01 00:00:00, True, True, True, True, EGY, GRC, FRA, FRA, True, True, True, True, True, True, True, True, True, True, True, True"
            },
            {
                "schema": null,
                "column": null,
                "check

In [7]:


def get_error_df(df, original_df):
    # Example df
    # df has columns ['failure_case', 'column', 'check']

    # Step 1: Find max duplication count
    max_count = df['column'].value_counts().max()

    # Step 2: Keep only columns with max_count occurrences
    valid_columns = df['column'].value_counts()[df['column'].value_counts() == max_count].index
    filtered = df[df['column'].isin(valid_columns)]

    # Step 3: Get all unique checks
    all_checks = filtered['check'].unique()

    # Step 4: Transform each check separately and store results
    dfs = []

    for chk in all_checks:
        chk_filtered = filtered[filtered['check'] == chk]
        
        # Group by column, collect failure_case
        grouped = chk_filtered.groupby('column')['failure_case'].apply(list)
        
        # Create wide DataFrame
        temp_df = pd.DataFrame({col: vals for col, vals in grouped.items()})
        
        # Add failed_check column
        temp_df['failed_check'] = chk
        
        # Append to list
        dfs.append(temp_df)

    # Step 5: Concatenate all check-specific DataFrames
    wide_df = pd.concat(dfs, ignore_index=True)

    # Step 1: Identify missing columns
    # Exclude 'failed_check' if it already exists in wide_df
    missing_cols = [c for c in original_df.columns if c not in wide_df.columns and c != 'failed_check']

    # Step 2: Merge missing columns back
    # We'll use the columns that exist in wide_df (except 'failed_check') as keys
    merge_cols = [c for c in wide_df.columns if c != 'failed_check']

    # Step 3: Merge wide_df with original_df to get missing columns
    # Using left join to keep all rows in wide_df
    final_df = pd.merge(
        wide_df,
        original_df[merge_cols + missing_cols].drop_duplicates(),
        on=merge_cols,
        how='left'
    )

    # Step 4: Optional: check the result

    return final_df.drop_duplicates()



In [None]:
bios_error_df = bios_error_df[["failure_case", "column",'check']].sort_index()
get_error_df(bios_error_df,bios_df)

Unnamed: 0,Athlete_Id,Born_Country_From_NOC,Height (cm),Height_Imputed,Is_Alive,NOC,Name,Roles,Sex,Weight (kg),Weight_Imputed,failed_check,Born_Date,Died_Date,Born_City,Born_Region,Born_Country
0,24803,True,178.0,True,True,['egypt'],Ahmed Salem,['Competed in Olympic Games'],Male,73.0,True,Duplicate athlete records found with same Name...,,,,,EGY
1,54085,True,186.0,True,True,['greece'],Nikolaos Kaloudis,['Competed in Olympic Games'],Male,86.0,True,Duplicate athlete records found with same Name...,1899-01-01,,,,GRC
2,80315,True,168.0,True,True,['france'],Georges Dubois,['Competed in Olympic Games'],Male,63.0,True,Duplicate athlete records found with same Name...,,,,,FRA
3,80349,True,168.0,True,True,['france'],Favier,['Competed in Olympic Games'],Male,63.0,True,Duplicate athlete records found with same Name...,,,,,FRA
4,4531,True,128.0,False,True,['malawi'],Helman Palije,['Competed in Olympic Games'],Male,74.0,False,Unrealistic height-to-weight ratio.,1967-06-24,,,,MWI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,129927,True,160.0,False,True,['egypt'],Shaimaa Haridy,['Competed in Olympic Games'],Female,124.0,False,Unrealistic height-to-weight ratio.,1991-01-01,,,,EGY
78,131458,False,179.0,False,True,['puerto rico'],Yarimar Mercado,['Competed in Olympic Games'],Female,48.0,False,Unrealistic height-to-weight ratio.,1995-03-12,,Yauco,Puerto Rico,PUR
79,132440,False,180.0,False,True,['armenia'],Ruben Aleksanyan,['Competed in Olympic Games'],Male,152.0,False,Unrealistic height-to-weight ratio.,1990-03-14,,Ararat,Ararat,ARM
80,133553,False,186.0,False,True,['ecuador'],Fernando Salas,['Competed in Olympic Games'],Male,163.0,False,Unrealistic height-to-weight ratio.,1988-02-10,,Ambato,Tungurahua,ECU


In [None]:

try: 
    affiliations_schema.validate(affiliation_df, lazy=True) 
    print("Validation PASSED!") 
    
except pa.errors.SchemaErrors as exc: 
    print("Validation FAILED!") 
    affiliations_error_df = exc.failure_cases
    print(exc)




Validation FAILED!
{
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": null,
                "column": "Affiliation_Country",
                "check": "str_length(3, 3)",
                "error": "Column 'Affiliation_Country' failed element-wise validator number 0: str_length(3, 3) failure cases: MN, MO, Central Sport Klub Army, HCAW, Oder) (GER, Saale) (GER, Saale) (GER, Oder), Frankfurt (Oder) (GER, Oder) (GER, Alma-Ata, Salamanca, Campinas, CSKA, Tr\u00eas Coroas, Young Men's Christian Association, Vassoras, RJ, Steinfeld) (GER, Westfalen) (GER, Ohm) (GER, SP, Leningrad, 08, Leningrad, Durazno, SCHC, HHIJC, BHMC, EMHC, Tameside, Young Men's Christian Association, Campinas, Cairo, Saale) (GER, Young Men's Christian Association, Young Men's Christian Association, Young Men's Christian Association, Oder) (GER, Flanders, NJ, NJJK, Cairo, Young Men's Christian Association, 96, Central Sport Klub Navy, Oder) (GER, Baden) (GER, 92, 00, DZ&PC, AZ 1870, Centr

In [None]:
affiliations_error_df = affiliations_error_df[["failure_case", "column",'check']].sort_index()
get_error_df(affiliations_error_df,affiliation_df)


Unnamed: 0,Affiliation_Id,failed_check,Affiliation_Club,Affiliation_City,Affiliation_Country
0,49,Duplicate affiliations found with different Af...,TSG Dülmen,,
1,66,Duplicate affiliations found with different Af...,Blau-Weiß Neuss,,
2,76,Duplicate affiliations found with different Af...,Akademischer SV Dresden,Bremer Tennisverein 1896,
3,133,Duplicate affiliations found with different Af...,?,Schaerbeek,BEL
4,257,Duplicate affiliations found with different Af...,Royal Toxophilite Society,London,GBR
...,...,...,...,...,...
2844,38812,Duplicate affiliations found with different Af...,Sundbybergs CK,,
2845,38813,Duplicate affiliations found with different Af...,Norrköpings KK,,
2846,38818,Duplicate affiliations found with different Af...,Metallurg Magnitogorsk,,
2847,38821,Duplicate affiliations found with different Af...,Sun Valley Ski Education Foundation,,


In [None]:
try:
    results_schema.validate(results_df, lazy=True)
    print("Validation PASSED!")
except pa.errors.SchemaErrors as exc:
    print("Validation FAILED!")
    results_error_df = exc.failure_cases
    print(exc)



Validation FAILED!
{
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": null,
                "column": null,
                "check": "Position\u2013Medal mismatch: check if medal corresponds to rank.",
                "error": "DataFrameSchema 'None' failed element-wise validator number 1: <Check <lambda>: Position\u2013Medal mismatch: check if medal corresponds to rank.> failure cases: Doubles, Handicap, Men (Olympic (non-medal)), \u00c9p\u00e9e, Individual, Men (Olympic), Doubles, Handicap, Men (Olympic (non-medal)), Doubles, Handicap, Men (Olympic (non-medal)), Doubles, Handicap, Men (Olympic (non-medal)), Doubles, Handicap, Mixed (Olympic (non-medal)), Doubles, Handicap, Men (Olympic (non-medal)), Doubles, Handicap, Men (Olympic (non-medal)), Doubles, Handicap, Men (Olympic (non-medal)), Polo, Men (Olympic (non-medal)), Polo, Men (Olympic (non-medal)), Polo, Men (Olympic (non-medal)), Polo, Men (Olympic (non-medal)), Polo, Men (Olympic (non-medal))

In [None]:
results_error_df = results_error_df[["failure_case", "column",'check']].sort_index()
df = get_error_df(results_error_df,results_df)


In [13]:
df

Unnamed: 0,As,Athlete_Id,Discipline,Event,NOC,Position,Tied,failed_check,Team,Medal,Game Year,Game Type
0,Guy de la Chapelle,10,Tennis,"Doubles, Handicap, Men (Olympic (non-medal))",FRA,3,True,Position–Medal mismatch: check if medal corres...,Gonzalo de Candamo,,1900.0,Summer Olympics
1,"Élie, Comte de Lastours",11,Fencing,"Épée, Individual, Men (Olympic)",FRA,1,False,Position–Medal mismatch: check if medal corres...,,,1900.0,Summer Olympics
2,Max Decugis,12,Tennis,"Doubles, Handicap, Men (Olympic (non-medal))",FRA,1,False,Position–Medal mismatch: check if medal corres...,Spalding de Garmendia,,1900.0,Summer Olympics
3,Étienne Durand,14,Tennis,"Doubles, Handicap, Men (Olympic (non-medal))",FRA,3,True,Position–Medal mismatch: check if medal corres...,Adrien Fauchier-Magnan,,1900.0,Summer Olympics
4,Adrien Fauchier-Magnan,16,Tennis,"Doubles, Handicap, Men (Olympic (non-medal))",FRA,3,True,Position–Medal mismatch: check if medal corres...,Étienne Durand,,1900.0,Summer Olympics
...,...,...,...,...,...,...,...,...,...,...,...,...
9481,Morgann LeLeux,147121,Athletics,"Pole Vault, Women (Olympic)",USA,2,False,Position–Medal mismatch: check if medal corres...,,,2020.0,Summer Olympics
9482,David Kendziera,147323,Athletics,"400 metres Hurdles, Men (Olympic)",USA,3,False,Position–Medal mismatch: check if medal corres...,,,2020.0,Summer Olympics
9483,Michael Shuey,147343,Athletics,"Javelin Throw, Men (Olympic)",USA,1,False,Position–Medal mismatch: check if medal corres...,,,2020.0,Summer Olympics
9484,Raphael de Ligne,147805,Rowing,"Coxed Fours, Men (Olympic)",BEL,2,False,Position–Medal mismatch: check if medal corres...,Belgium,,1920.0,Summer Olympics


In [None]:
try:
    editions_schema.validate(editions_df, lazy=True)
    print("Validation PASSED!")
except pa.errors.SchemaErrors as exc:
    print("Validation FAILED!")
    editions_error_df = exc.failure_cases
    print(exc)

Validation FAILED!
{
    "SCHEMA": {
        "COLUMN_NOT_IN_DATAFRAME": [
            {
                "schema": "games_schema",
                "column": "games_schema",
                "check": "column_in_dataframe",
                "error": "column 'game_id' not in dataframe. Columns in dataframe: ['Year', 'City', 'Country', 'Opened', 'Closed', 'Comments', 'Game_Type', 'Edition_Name', 'Competition_Start', 'Competition_End', 'Opened_Imputed', 'Closed_Imputed', 'Competition_Start_Imputed', 'Competition_End_Imputed']"
            }
        ]
    },
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": "games_schema",
                "column": "Year",
                "check": "in_range(1850, 2024)",
                "error": "Column 'Year' failed element-wise validator number 0: in_range(1850, 2024) failure cases: 2028, 2032, 2026, 2026"
            },
            {
                "schema": "games_schema",
                "column": "games_schema",
         

In [15]:
pd.set_option('display.max_colwidth', None)

In [None]:
editions_error_df = editions_error_df[["failure_case", "column",'check']].sort_index()
get_error_df(editions_error_df,editions_df)

Unnamed: 0,Year,failed_check,City,Country,Opened,Closed,Comments,Game_Type,Edition_Name,Competition_Start,Competition_End,Opened_Imputed,Closed_Imputed,Competition_Start_Imputed,Competition_End_Imputed
0,2028,"in_range(1850, 2024)",Los Angeles,USA,,,,Olympic Games,Summer,,,False,False,False,False
1,2032,"in_range(1850, 2024)",Brisbane,AUS,,,,Olympic Games,Summer,,,False,False,False,False
2,2026,"in_range(1850, 2024)",Milano-Cortina d'Ampezzo,ITA,2026-02-06,2026-02-22,,Olympic Games,Winter,2026-02-04,2026-02-22,False,False,False,False
3,2026,"in_range(1850, 2024)",Dakar,SEN,2026-10-31,2026-11-13,,Youth Olympic Games,Summer,2026-10-29,2026-11-13,False,False,False,False
6,1889,"Chronological order violated: check Opened, Closed edition dates.",Athina,GRE,1889-12-01,1889-04-30,,Forerunners to the Olympic Games,,1889-12-01,1889-04-30,True,True,False,False
7,1889,"Chronological order violated:check Start, End Competition dates.",Athina,GRE,1889-12-01,1889-04-30,,Forerunners to the Olympic Games,,1889-12-01,1889-04-30,True,True,False,False
