In [1]:
import os

import pandas as pd

In [2]:
# Connection settings for the S3-compatible object store that holds the
# bronze, silver and gold buckets. Each value can be overridden through an
# environment variable so real credentials never have to be committed with
# the notebook; the fallbacks are the original local-development values.
s3_endpoint = os.environ.get("S3_ENDPOINT_URL", "http://localhost:9000")

access_key = os.environ.get("S3_ACCESS_KEY", "accesskey")
secret_key = os.environ.get("S3_SECRET_KEY", "secretkey")

# Bucket names, one per data layer.
bronze_bucket = "bronze"
silver_bucket = "silver"
gold_bucket = "gold"

# Passed as `storage_options=` to the `pd.read_parquet` calls in this notebook.
s3fs_opts = {
    "key": access_key,
    "secret": secret_key,
    "client_kwargs": {"endpoint_url": s3_endpoint},
}




In [3]:
def df_nan_percentage(df):
    """Summarise how much of each column in ``df`` is missing.

    Returns a Series indexed by column name, ordered from the most-missing
    column to the least, whose values are strings such as ``"12.5%"``
    (percentages rounded to two decimal places).
    """
    missing_fraction = df.isna().mean()
    missing_percent = missing_fraction.mul(100).round(2)
    ranked = missing_percent.sort_values(ascending=False)
    return ranked.map(lambda pct: f"{pct}%")


# Raw Data Sample

In [4]:
# Read the three raw tables (biodata, editions, results) from the bronze bucket.
bios_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/biodata.parquet",
    storage_options=s3fs_opts,
)
editions_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/editions.parquet",
    storage_options=s3fs_opts,
)
results_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/results.parquet",
    storage_options=s3fs_opts,
)

In [5]:
# Preview five random raw biodata rows; the fixed seed keeps the preview
# identical across kernel restarts.
bios_raw_data_df.sample(5, random_state=42)

Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,Athlete_Id,Affiliations,Nick/petnames,Measurements,Title(s),Nationality,Other names,Original name,Name order
48913,Competed in Olympic Games,Female,Luanne•Maurice,Luanne•Maurice,3 August 1972,,Mauritius,49285,,,,,,,,
76612,Competed in Olympic Games,Female,Tamara Aleksandrovna•Kazachkova (-Sorokina),Tamara•Kazachkova,"15 August 1950 in Kazanovka, Chelyabinsk (RUS)",,Soviet Union,77218,,,164 cm / 52 kg,,Russian Federation,,Тамара Александровна•Казачкова (-Сорокина),
6578,Competed in Olympic Games,Male,Francisco•Martínez Cordero,Francisco•Martínez,"20 June 1912 in Ciudad Juárez, Chihuahua (MEX)","1 December 1993 in El Paso, Texas (USA)",Mexico,6622,,El Quico,,,,,,
125635,Competed in Olympic Games,Female,Silvia•Bertagna,Silvia•Bertagna,"30 November 1986 in Bressanone, Bolzano-Bozen ...",,Italy,127934,"SC Gardena, Santa Cristina Valgardena (ITA)",,170 cm / 50 kg,,,,,
11392,Competed in Olympic Games,Female,Brigitte•Schockaert,Brigitte•Schockaert,"23 June 1933 in Zottegem, Oost-Vlaanderen (BEL)",,Belgium,11454,,,168 cm / 61 kg,,,,,


In [6]:

# Preview five random raw result rows; the fixed seed keeps the preview
# identical across kernel restarts.
results_raw_data_df.sample(5, random_state=42)

Unnamed: 0,Games,Event,Team,Pos,Medal,As,NOC,Discipline,Athlete_Id,Nationality,Unnamed: 7
249894,2008 Summer Olympics,"Olympic Distance, Women (Olympic)",,45,,Lisa Mensink,NED,Triathlon,116205,,
7704,2000 Summer Olympics,"Bantamweight, Men (Olympic)",,=9,,Kazumasa Tsujimoto,JPN,Boxing,4271,,
129908,1996 Summer Olympics,"One Person Dinghy, Open (Olympic)",,32,,Cao Xiaobo,CHN,Sailing,61543,,
56137,1960 Summer Olympics,"Individual All-Around, Women (Olympic)",,21,,Atanasia Ionescu,ROU,Artistic Gymnastics (Gymnastics),29016,,
267150,2010 Summer Youth Olympics,"4 × 100 metres Medley Relay, Girls (YOG)",Canada,4,,Tera Van Beilen,CAN,Swimming (Aquatics),124758,,


In [7]:
# Show the first five rows of the raw editions table.
editions_raw_data_df.head(n=5)

Unnamed: 0,#,Year,City,Country,Opened,Closed,Competition,Unnamed: 7,Game_Type,Edition_Name
0,I,1896,Athina,GRE,6 April,15 April,6 – 13 April,,Olympic Games,Summer
1,II,1900,Paris,FRA,,,14 May – 28 October,,Olympic Games,Summer
2,III,1904,St. Louis,USA,14 May,,1 July – 26 November,,Olympic Games,Summer
3,IV,1908,London,GBR,13 July,25 July,27 April – 31 October,,Olympic Games,Summer
4,V,1912,Stockholm,SWE,6 July,15 July,5 May – 27 July,,Olympic Games,Summer


In [8]:
# Read the log of athlete pages that could not be scraped (bronze bucket).
scrape_failures_df = pd.read_parquet(
    f"s3://{bronze_bucket}/scrape_failures/failed_athletes.parquet",
    storage_options=s3fs_opts,
)

In [9]:
# Preview five random scrape failures; the fixed seed keeps the preview
# identical across kernel restarts.
scrape_failures_df.sample(5, random_state=42)

Unnamed: 0,failed_athlete_id,error_message
1572,119871,Status 404 for athlete 119871
2175,126043,Status 404 for athlete 126043
3356,142153,Status 404 for athlete 142153
3168,139951,Status 404 for athlete 139951
576,76087,Status 404 for athlete 76087


# Cleaned Data Stage I

In [10]:
# Read the stage-I cleaned tables from the silver bucket's `clean_data` prefix.
cleaned_biodata_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_biodata.parquet",
    storage_options=s3fs_opts,
)
cleaned_affiliations_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/dim_affiliation.parquet",
    storage_options=s3fs_opts,
)
bridge_athlete_affiliation_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/bridge_athlete_affiliation.parquet",
    storage_options=s3fs_opts,
)
cleaned_editions_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_editions.parquet",
    storage_options=s3fs_opts,
)
cleaned_results_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_results.parquet",
    storage_options=s3fs_opts,
)

In [11]:
# Preview five random cleaned biodata rows; the fixed seed keeps the preview
# identical across kernel restarts.
cleaned_biodata_df.sample(5, random_state=42)

Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country
127315,Competed in Olympic Games,Female,china,129709,Zhang Weiwei,182.0,66.0,1990-10-07,NaT,True,Chengdu,Sichuan,CHN
118138,Competed in Olympic Games,Female,germany,119675,Isabell Ost,164.0,68.0,1988-10-21,NaT,True,Ost-Berlin (East Berlin),Berlin,GER
62817,Competed in Olympic Games,Male,fiji,63300,"Colin Philp, Sr.",181.0,85.0,1947-11-04,NaT,True,Hobart,Tasmania,AUS
121440,Competed in Olympic Games,Male,argentina,123377,Gonzalo Carou,190.0,100.0,1979-08-15,NaT,True,Buenos Aires,Ciudad Autónoma de Buenos Aires,ARG
50502,Competed in Olympic Games,Male,soviet union,50878,Endel Press,178.0,72.0,1929-02-16,1982-05-06,False,Tallinn,Harjumaa,EST


In [12]:
# Preview five random affiliation rows; the fixed seed keeps the preview
# identical across kernel restarts.
cleaned_affiliations_df.sample(5, random_state=42)

Unnamed: 0,Affiliation_Id,Affiliation_Club,Affiliation_City,Affiliation_Country
30411,30411,Limerick Swimming Club,Limerick,IRL
38054,38054,Clapham Chasers,Clapham,GBR
32842,32842,AOO Specialized [Brazil],,
25407,25407,Kyoto Club,,
5165,5165,HTC Stuttgarter Kickers,Stuttgart,GER


In [13]:
# Preview five random athlete-to-affiliation links; the fixed seed keeps the
# preview identical across kernel restarts.
bridge_athlete_affiliation_df.sample(5, random_state=42)

Unnamed: 0,Athlete_Id,Affiliation_Id
62001,59399,0
145466,139091,0
114093,108231,0
5815,5608,1730
121124,114461,1349


In [14]:
# Preview five random cleaned edition rows; the fixed seed keeps the preview
# identical across kernel restarts.
cleaned_editions_df.sample(5, random_state=42)

Unnamed: 0,Year,City,Country,Opened,Closed,Comments,Game_Type,Edition_Name,Competition_Start,Competition_End,Game_Id
31,1960,Roma,ITA,1960-08-25,1960-09-11,,Olympic Games,Summer,1960-08-25,1960-09-11,32
43,1984,Los Angeles,USA,1984-07-28,1984-08-12,,Olympic Games,Summer,1984-07-06,1984-08-12,44
26,1952,Helsinki,FIN,1952-07-19,1952-08-03,,Olympic Games,Summer,1952-07-14,1952-08-03,27
37,1972,München,FRG,1972-08-26,1972-09-11,,Olympic Games,Summer,1972-08-26,1972-09-11,38
46,1988,Calgary,CAN,1988-02-13,1988-02-28,,Olympic Games,Winter,1988-02-13,1988-02-28,47


In [15]:
# Preview five random cleaned result rows; the fixed seed keeps the preview
# identical across kernel restarts.
cleaned_results_df.sample(5, random_state=42)

Unnamed: 0,Event,Team,Medal,As,NOC,Discipline,Athlete_Id,Game Year,Game Type,Position,Tied
297043,Mixed Sports,CHN,,Gao Dali,CHN,Snowboarding (Skiing),139997,2020,Winter Youth Olympics,,False
179841,"10,000 metres, Men (Olympic)",,,Lennart Carlsson,SWE,Speed Skating (Skating),84612,1976,Winter Olympics,13.0,False
256561,"Mixed Doubles, Mixed (Olympic)",Rachel Homan,,John Morris,CAN,Curling,118897,2022,Winter Olympics,5.0,False
194179,"Football, Men (Olympic)",Spain,Silver,Toni,ESP,Football (Football),91054,2000,Summer Olympics,2.0,False
199255,"100 metres Butterfly, Men (Olympic)",,,Paval Lahun,BLR,Swimming (Aquatics),93311,2004,Summer Olympics,26.0,False


# Cleaned Data Stage II

In [16]:
# Read the stage-II tables from the silver bucket's `clean_data_II` prefix.
imputed_bios_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data_II/cleaned_biodata.parquet",
    storage_options=s3fs_opts,
)
imputed_editions_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data_II/cleaned_editions.parquet",
    storage_options=s3fs_opts,
)

In [17]:
# Preview five random stage-II biodata rows; the fixed seed keeps the preview
# identical across kernel restarts.
imputed_bios_df.sample(5, random_state=42)

Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country,Height_Imputed,Weight_Imputed,Born_Country_From_NOC
60306,Competed in Olympic Games,Male,united states,60755,Jim Scherr,183.0,95.0,1961-07-27,NaT,True,Eureka,South Dakota,USA,False,False,False
121083,Competed in Olympic Games,Male,united kingdom,123002,Ed Scott,197.0,85.0,1988-05-28,NaT,True,Leeds,England,GBR,False,False,False
107610,Competed in Olympic Games,Female,brazil,108756,Alexandra Nascimento,177.0,68.0,1981-09-16,NaT,True,Limeira,São Paulo,BRA,False,False,False
94817,Competed in Olympic Games,Male,united states,95587,Ken Morris,183.0,79.0,1942-08-19,NaT,True,New York,New York,USA,False,False,False
7833,Competed in Olympic Games,Male,puerto rico,7875,Juan Venegas,172.0,63.0,1929-06-02,1987-04-16,False,Río Piedras,Puerto Rico,PUR,True,True,False


In [18]:
# Preview five random stage-II edition rows; the fixed seed keeps the preview
# identical across kernel restarts.
imputed_editions_df.sample(5, random_state=42)

Unnamed: 0,Year,City,Country,Opened,Closed,Comments,Game_Type,Edition_Name,Competition_Start,Competition_End,Game_Id,Opened_Imputed,Closed_Imputed
65,2018,PyeongChang,KOR,2018-02-09,2018-02-25,,Olympic Games,Winter,2018-02-08,2018-02-25,66,False,False
74,2028,Los Angeles,USA,,,,Olympic Games,Summer,,,75,False,False
17,1932,Lake Placid,USA,1932-02-04,1932-02-13,,Olympic Games,Winter,1932-02-04,1932-02-15,18,False,False
7,1906,Athina,GRE,1906-04-22,1906-05-02,,Intercalated Games,,1906-04-22,1906-05-02,8,False,False
63,2016,Rio de Janeiro,BRA,2016-08-05,2016-08-21,,Olympic Games,Summer,2016-08-03,2016-08-21,64,False,False


# Missing Values Percentage Comparison

In [19]:
# Per-column missing-value percentages for the biodata, labelled for the
# side-by-side comparison shown afterwards.
bios1 = df_nan_percentage(cleaned_biodata_df).rename("Before Imputing")
bios2 = df_nan_percentage(imputed_bios_df).rename("After Imputing")

# Same pair of summaries for the editions data.
games1 = df_nan_percentage(cleaned_editions_df).rename("Before Imputing")
games2 = df_nan_percentage(imputed_editions_df).rename("After Imputing")

**Bios Data:**

In [20]:
# Place the two biodata summaries next to each other, one column per series.
pd.concat([bios1, bios2], axis="columns")

Unnamed: 0,Before Imputing,After Imputing
Died_Date,76.67%,76.67%
Weight (kg),29.85%,0.0%
Height (cm),26.69%,0.0%
Born_City,22.67%,22.67%
Born_Country,22.67%,4.56%
Born_Region,22.67%,22.67%
Born_Date,1.24%,1.24%
Name,0.0%,0.0%
Sex,0.0%,0.0%
Roles,0.0%,0.0%


**Games Data:**

In [21]:
# Place the two editions summaries next to each other, one column per series.
pd.concat([games1, games2], axis="columns")


Unnamed: 0,Before Imputing,After Imputing
Comments,93.42%,93.42%
Closed,21.05%,15.79%
Opened,19.74%,14.47%
Competition_End,15.79%,15.79%
Competition_Start,14.47%,14.47%
Year,0.0%,0.0%
City,0.0%,0.0%
Country,0.0%,0.0%
Game_Type,0.0%,0.0%
Edition_Name,0.0%,0.0%


# Data Quality Failure Cases

In [22]:
# Read the rows that failed data-quality checks (silver bucket, `failure_cases`).
bios_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/bios_failure_cases.parquet",
    storage_options=s3fs_opts,
)
affiliations_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/affiliations_failure_cases.parquet",
    storage_options=s3fs_opts,
)
editions_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/editions_failure_cases.parquet",
    storage_options=s3fs_opts,
)
results_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/results_failure_cases.parquet",
    storage_options=s3fs_opts,
)

In [23]:
# Preview five random biodata failure cases; the fixed seed keeps the preview
# identical across kernel restarts.
bios_failure_cases_df.sample(5, random_state=42)

Unnamed: 0,Athlete_Id,Born_Country_From_NOC,Height (cm),Height_Imputed,Is_Alive,NOC,Name,Roles,Sex,Weight (kg),Weight_Imputed,failed_check,Born_Date,Died_Date,Born_City,Born_Region,Born_Country
69,120168,False,160.0,False,True,honduras,Claudia Fajardo,Competed in Olympic Games,Female,117.0,False,height_weight_ratio_invalid,1985-09-26,NaT,Puerto Cortés,Cortés,HON
75,126756,False,173.0,False,True,united states,Holley Mangold,Competed in Olympic Games,Female,155.0,False,height_weight_ratio_invalid,1989-12-22,NaT,Kettering,Ohio,USA
77,129927,True,160.0,False,True,egypt,Shaimaa Haridy,Competed in Olympic Games,Female,124.0,False,height_weight_ratio_invalid,1991-01-01,NaT,,,EGY
58,107849,False,170.0,False,True,germany,Lisa Ingildeeva,Competed in Olympic Games,Female,42.0,False,height_weight_ratio_invalid,1988-12-04,NaT,Moskva (Moscow),Moskva,RUS
24,29052,False,168.0,False,True,russian federation,Irina Dzyuba,Competed in Olympic Games,Female,42.0,False,height_weight_ratio_invalid,1980-12-16,NaT,Novosibirsk,Novosibirsk,RUS


In [24]:
# Preview five random affiliation failure cases; the fixed seed keeps the
# preview identical across kernel restarts.
affiliations_failure_cases_df.sample(5, random_state=42)

Unnamed: 0,Affiliation_Club,Affiliation_Id,failed_check,Affiliation_City,Affiliation_Country
1020,Dynamo,21075,duplicate_affiliation_with_different_ids,Kolos Rivne,
1825,Colsanitas,31369,duplicate_affiliation_with_different_ids,Colombia,
2636,SV Baiersbronn,37520,duplicate_affiliation_with_different_ids,,
2184,Ullevi FK,34111,duplicate_affiliation_with_different_ids,Göteborg,
2240,Fujian Province,34726,duplicate_affiliation_with_different_ids,,CHN


In [25]:
# Preview five random edition failure cases; the fixed seed keeps the preview
# identical across kernel restarts.
editions_failure_cases_df.sample(5, random_state=42)

Unnamed: 0,Edition_Name,failed_check,Year,City,Country,Opened,Closed,Comments,Game_Type,Competition_Start,Competition_End,Game_Id,Opened_Imputed,Closed_Imputed,Competition_Start_Imputed,Competition_End_Imputed
13,,competition_start_after_end,1889,Athina,GRE,1889-12-01,1889-04-30,,Forerunners to the Olympic Games,1889-12-01,1889-04-30,4,True,True,False,False
4,,"isin(['Summer', 'Winter', 'Equestrian'])",1906,Athina,GRE,1906-04-22,1906-05-02,,Intercalated Games,1906-04-22,1906-05-02,8,False,False,False,False
12,,competition_start_after_end,1875,Athina,GRE,1875-05-11,1875-05-18,,Forerunners to the Olympic Games,1875-05-11,1875-05-18,3,True,True,False,False
5,,opened_after_closed,1859,Athina,GRE,,,,Forerunners to the Olympic Games,,,1,False,False,False,False
0,,"isin(['Summer', 'Winter', 'Equestrian'])",1859,Athina,GRE,,,,Forerunners to the Olympic Games,,,1,False,False,False,False


In [26]:
# Preview five random result failure cases; the fixed seed keeps the preview
# identical across kernel restarts.
results_failure_cases_df.sample(5, random_state=42)

Unnamed: 0,As,Athlete_Id,Discipline,Event,NOC,Position,Tied,failed_check,Team,Medal,Game Year,Game Type
2311,Juan Ecker,35213,Rowing,"Coxed Fours, Men (Olympic)",ARG,3,False,position_medal_mismatch,Argentina,,1952,Summer Olympics
8165,Pyotr Vasilyev,94030,Swimming (Aquatics),"4 × 100 metres Freestyle Relay, Men (Olympic)",UZB,1,False,position_medal_mismatch,Uzbekistan,,2000,Summer Olympics
1442,Alfonso López,21532,Fencing,"Épée, Team, Men (Olympic)",CUB,3,False,position_medal_mismatch,Cuba,,1924,Summer Olympics
8603,Robert Lathouwers,115322,Athletics,"800 metres, Men (Olympic)",NED,3,False,position_medal_mismatch,,,2008,Summer Olympics
5838,Eamonn Martin,69280,Athletics,"10,000 metres, Men (Olympic)",GBR,2,False,position_medal_mismatch,,,1988,Summer Olympics


# Final Cleaned Warehouse-Modeled Data

In [27]:
# Read the final dimension, bridge and fact tables from the gold bucket.
dim_athletes_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_athletes.parquet",
    storage_options=s3fs_opts,
)
dim_affiliations_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_affiliations.parquet",
    storage_options=s3fs_opts,
)
bridge_athletes_affiliations_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/bridge_athletes_affiliations.parquet",
    storage_options=s3fs_opts,
)
dim_games_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_games.parquet",
    storage_options=s3fs_opts,
)
fct_results_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/fct_results.parquet",
    storage_options=s3fs_opts,
)


In [28]:
# Preview five random athlete dimension rows; the fixed seed keeps the
# preview identical across kernel restarts.
dim_athletes_df.sample(5, random_state=42)

Unnamed: 0,athlete_id,athlete_name,athlete_roles,athlete_sex,athlete_NOC,athlete_height_cm,athlete_weight_kg,athlete_born_date,athlete_died_date,athlete_is_alive,athlete_born_city,athlete_born_region,athlete_born_country,athlete_is_height_imputed,athlete_is_weight_imputed,athlete_is_born_country_from_NOC
43714,44057,Hans Egli,Competed in Olympic Games,Male,switzerland,175.0,76.0,NaT,NaT,True,,,CHE,True,True,True
92194,92934,Miguel Nunes,Competed in Olympic Games,Male,portugal,180.0,72.0,1976-08-11,NaT,True,Lisboa,Distrito de Lisboa,POR,False,False,False
118475,120167,Abeba Aregawi,Competed in Olympic Games,Female,ethiopia,170.0,52.0,1990-07-05,NaT,True,Adigrat,Tigray,ETH,False,False,False
33523,33779,Gabriel Goldschmied,Competed in Olympic Games,Male,mexico,181.0,80.0,1939-04-22,NaT,True,Ciudad de México (Mexico City),Ciudad de México,MEX,False,False,False
1446,1559,Fabio Betto,Competed in Olympic Games,Male,italy,189.0,95.0,1972-09-25,NaT,True,Treviso,Treviso,ITA,False,False,False


In [29]:
# Preview five random affiliation dimension rows; the fixed seed keeps the
# preview identical across kernel restarts.
dim_affiliations_df.sample(5, random_state=42)

Unnamed: 0,affiliation_id,dim_affiliation_club,dim_affiliation_city,dim_affiliation_country
8294,8294,Valur,Reykjavík,ISL
12300,12300,Suntory Sunbirds,Osaka,JPN
4279,4279,ÚDA Praha,,
28199,28199,Volley Corigliano,"Corigliano Calabro, Corigliano-Rossano",ITA
20707,20707,Skiverein Schwoich,,


In [30]:
# Preview five random athlete-to-affiliation bridge rows; the fixed seed keeps
# the preview identical across kernel restarts.
bridge_athletes_affiliations_df.sample(5, random_state=42)

Unnamed: 0,athlete_id,affiliation_id
144166,137756,36600
53557,51403,12003
26680,25567,6560
74718,71429,16375
32164,30912,0


In [31]:
# Preview five random games dimension rows; the fixed seed keeps the preview
# identical across kernel restarts.
dim_games_df.sample(5, random_state=42)

Unnamed: 0,game_id,dim_game_type,dim_edition_name,dim_game_year,dim_city,dim_country,dim_opened,dim_closed,dim_competition_start,dim_competition_end,dim_comments,dim_opened_imputed,dim_closed_imputed,dim_competition_start_imputed,dim_competition_end_imputed
46,47,Olympic Games,Winter,1988,Calgary,CAN,1988-02-13,1988-02-28,1988-02-13,1988-02-28,,False,False,False,False
25,26,Olympic Games,Winter,1948,Sankt Moritz,SUI,1948-01-30,1948-02-08,1948-01-30,1948-02-08,,False,False,False,False
27,28,Olympic Games,Winter,1952,Oslo,NOR,1952-02-15,1952-02-25,1952-02-14,1952-02-25,,False,False,False,False
53,54,Olympic Games,Winter,2002,Salt Lake City,USA,2002-02-08,2002-02-24,2002-02-09,2002-02-24,,False,False,False,False
9,10,Olympic Games,Summer,1912,Stockholm,SWE,1912-07-06,1912-07-15,1912-05-05,1912-07-27,,False,False,False,False


In [32]:
# Preview five random result fact rows; the fixed seed keeps the preview
# identical across kernel restarts.
fct_results_df.sample(5, random_state=42)

Unnamed: 0,athlete_id,dim_noc,dim_discipline,dim_game_type,dim_game_year,dim_event_name,dim_team_name,dim_as,m_tied_flag,m_position,m_medal
234061,108385,CUB,Baseball (Baseball/Softball),Summer Olympics,2004,"Baseball, Men (Olympic)",Cuba,Osmani Urrutia,False,1,Gold
257081,119194,CAN,Ice Hockey (Ice Hockey),Winter Olympics,2014,"Ice Hockey, Men (Olympic)",Canada,Drew Doughty,False,1,Gold
145393,69567,GBR,Athletics,Summer Olympics,1972,"Discus Throw, Men (Olympic)",,John Watts,False,24,
43361,22901,MAR,Fencing,Summer Olympics,1960,"Foil, Individual, Men (Olympic)",,Charles El-Gressy,False,6,
35521,19948,NZL,Hockey,Summer Olympics,1964,"Hockey, Men (Olympic)",New Zealand,Bill Schaefer,True,13,
