In [1]:
import os

import pandas as pd

In [2]:
# Connection settings for the S3-compatible object store. Each value can be
# overridden through an environment variable so that real credentials never
# need to be written into the notebook; the fallbacks are the original
# local-development values, so behaviour is unchanged when nothing is set.
s3_endpoint = os.environ.get("S3_ENDPOINT_URL", "http://localhost:9000")

access_key = os.environ.get("S3_ACCESS_KEY", "accesskey")
secret_key = os.environ.get("S3_SECRET_KEY", "secretkey")

# Bucket names used by the reads below (raw, cleaned and final data).
bronze_bucket = "bronze"
silver_bucket = "silver"
gold_bucket = "gold"

# Handed to pandas as `storage_options` on every parquet read in this notebook.
s3fs_opts = {
    "key": access_key,
    "secret": secret_key,
    "client_kwargs": {"endpoint_url": s3_endpoint},
}




In [3]:
def df_nan_percentage(df):
    """Summarise how much of each column is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name and ordered from most to least missing. Each
        value is a string such as ``"12.5%"`` (percentage rounded to two
        decimals).
    """
    # Fraction of NaN cells per column, scaled to 0-100.
    missing_pct = df.isna().mean() * 100
    # Round and rank while the values are still numeric; format as text last.
    ranked = missing_pct.round(2).sort_values(ascending=False)
    return ranked.map("{}%".format)


# Raw Data Sample

In [4]:
# Read the three parquet files under the raw_data prefix of the bronze bucket.
bios_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/biodata.parquet",
    storage_options=s3fs_opts,
)
editions_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/editions.parquet",
    storage_options=s3fs_opts,
)
results_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/results.parquet",
    storage_options=s3fs_opts,
)

**Bios Raw Data:**

In [5]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
bios_raw_data_df.sample(5)

Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,Athlete_Id,Affiliations,Nick/petnames,Measurements,Title(s),Nationality,Other names,Original name,Name order
142546,Competed in Olympic Games,Female,Yevgeniya Andreyevna•Kosetskaya,Yevgeniya•Kosetskaya,"16 December 1994 in Chelyabinsk, Chelyabinsk (...",,ROC,146099,,,,,,,Евгения Андреевна•Косецкая,
114993,Competed in Olympic Games,Male,Tigran Gevorg•Martirosyan,Tigran Gevorg•Martirosyan,"9 June 1988 in Gyumri, Shirak (ARM)",,Armenia,116329,,,69 kg,,,,Տիգրան Գևորգ•Մարտիրոսյան,
89502,Competed in Olympic Games,Male,Nam•Sung-Ho,Nam•Sung-Ho,10 October 1975,,Republic of Korea,90221,,,181 cm / 79 kg,,,,,Oriental
94379,Competed in Olympic Games,Male,Kim•In-Seop,Kim•In-Seop,"2 March 1973 in Daegu, Daegu (KOR)",,Republic of Korea,95145,"Samsung Life Sports Club, Seoul (KOR)",,160 cm / 58 kg,,,Kim In-Sub,김•인섭,Oriental
37547,Competed in Olympic Games,Male,Arthur•Heyne,Arthur•Heyne,"25 August 1946 in Kaiserslautern, Rheinland-Pf...",,West Germany,37845,"WSV Godesberg, Bonn (GER)",,184 cm / 82 kg,,,,,


**Results Raw Data:**

In [6]:

# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
results_raw_data_df.sample(5)

Unnamed: 0,Games,Event,Team,Pos,Medal,As,NOC,Discipline,Athlete_Id,Nationality,Unnamed: 7
165282,1908 Summer Olympics,"Standing High Jump, Men (Olympic)",,16,,Martin Sheridan,USA,Athletics,79031,,
35992,1988 Summer Olympics,"Hockey, Men (Olympic)",Soviet Union,7,,Igor Yulchiyev,URS,Hockey,20316,,
61021,1956 Summer Olympics,"Horse Vault, Men (Olympic)",,=41,,Raimo Heinonen,FIN,Artistic Gymnastics (Gymnastics),29777,,
152716,1976 Summer Olympics,"4 × 400 metres Relay, Men (Olympic)",Kingdom of Saudi Arabia,DNS,,Mohamed Ali Al-Malky,KSA,Athletics,73000,,
256120,2014 Winter Olympics,"15 kilometres Skiathlon, Women (Olympic)",,13,,Krista Lähteenmäki,FIN,Cross Country Skiing (Skiing),118787,,


**Games Raw Data:**

In [7]:
# Show the leading rows of the raw editions frame (head() with its default row count).
editions_raw_data_df.head()

Unnamed: 0,#,Year,City,Country,Opened,Closed,Competition,Unnamed: 7,Game_Type,Edition_Name
0,I,1896,Athina,GRE,6 April,15 April,6 – 13 April,,Olympic Games,Summer
1,II,1900,Paris,FRA,,,14 May – 28 October,,Olympic Games,Summer
2,III,1904,St. Louis,USA,14 May,,1 July – 26 November,,Olympic Games,Summer
3,IV,1908,London,GBR,13 July,25 July,27 April – 31 October,,Olympic Games,Summer
4,V,1912,Stockholm,SWE,6 July,15 July,5 May – 27 July,,Olympic Games,Summer


**Scrape Failure Logs:**

In [8]:
# Read the failed-athletes parquet from the scrape_failures prefix of the bronze bucket.
scrape_failures_df = pd.read_parquet(
    f"s3://{bronze_bucket}/scrape_failures/failed_athletes.parquet",
    storage_options=s3fs_opts,
)

In [9]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
scrape_failures_df.sample(5)

Unnamed: 0,failed_athlete_id,error_message
4086,149664,Status 404 for athlete 149664
4270,149845,Status 404 for athlete 149845
870,102274,Status 404 for athlete 102274
3767,149342,Status 404 for athlete 149342
4350,149930,Status 404 for athlete 149930


# Cleaned Data Stage I

In [10]:
# Read the five parquet files under the clean_data prefix of the silver bucket.
cleaned_biodata_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_biodata.parquet",
    storage_options=s3fs_opts,
)
cleaned_affiliations_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/dim_affiliation.parquet",
    storage_options=s3fs_opts,
)
bridge_athlete_affiliation_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/bridge_athlete_affiliation.parquet",
    storage_options=s3fs_opts,
)
cleaned_editions_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_editions.parquet",
    storage_options=s3fs_opts,
)
cleaned_results_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_results.parquet",
    storage_options=s3fs_opts,
)

**Cleaned Bios Data:**

In [11]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
cleaned_biodata_df.sample(5)

Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country
120770,Competed in Olympic Games,Male,united kingdom,122671,Peter Wilson,198.0,90.0,1986-09-15,NaT,True,Dorchester,England,GBR
46672,"Competed in Olympic Games,Referee",Female,united kingdom,47025,Lindsey Fraser,165.0,53.0,1958-01-24,NaT,True,Woolwich,England,GBR
20216,Competed in Olympic Games,Female,united states,20360,Anita Miller,172.0,61.0,1951-05-14,NaT,True,Bryn Mawr,Pennsylvania,USA
34586,"Competed in Olympic Games,Other",Male,hungary,34856,János Martinek,175.0,69.0,1965-05-23,NaT,True,Budapest,Budapest,HUN
142762,Competed in Olympic Games,Female,romania,146323,Kriszta Incze,170.0,,1996-05-15,NaT,True,Sfântu Gheorghe,Covasna,ROU


**Cleaned Affiliations Data:**

In [12]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
cleaned_affiliations_df.sample(5)

Unnamed: 0,Affiliation_Id,Affiliation_Club,Affiliation_City,Affiliation_Country
32302,32302,Spordiklubi Biathlon,,
31850,31850,SG Stadtwerke München,München,GER
35172,35172,Debreceni Sportiskola,,
25335,25335,Singapore Badminton Association,,
36766,36766,HC Bílí Tygři Liberec,,ELH


**Athlete - Affiliation Bridge Data:**

In [13]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
bridge_athlete_affiliation_df.sample(5)

Unnamed: 0,Athlete_Id,Affiliation_Id
23790,22785,0
59744,57229,13072
29515,28332,7302
119213,112707,27725
111041,105286,14741


**Cleaned Games Data:**

In [14]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
cleaned_editions_df.sample(5)

Unnamed: 0,Year,City,Country,Opened,Closed,Comments,Game_Type,Edition_Name,Competition_Start,Competition_End,Game_Id
50,1996,Atlanta,USA,1996-07-19,1996-08-04,,Olympic Games,Summer,1996-07-20,1996-08-04,51
40,1976,Innsbruck,AUT,1976-02-04,1976-02-15,,Olympic Games,Winter,1976-02-03,1976-02-15,41
1,1870,Athina,GRE,,,,Forerunners to the Olympic Games,,,,2
20,1940,Helsinki,FIN,,,Not held due to war,Olympic Games,Summer,,,21
62,2014,Nanjing,CHN,2014-08-16,2014-08-28,,Youth Olympic Games,Summer,2014-08-14,2014-08-28,63


**Cleaned Results Data:**

In [15]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
cleaned_results_df.sample(5)

Unnamed: 0,Event,Team,Medal,As,NOC,Discipline,Athlete_Id,Game Year,Game Type,Position,Tied
5296,"Doubles, Men (Olympic)",František Týř,,Otto Wofek,TCH,Tennis,2663,1920,Summer Olympics,9,True
194761,"Rings, Men (Olympic)",,,Víctor Cano,ESP,Artistic Gymnastics (Gymnastics),91287,2000,Summer Olympics,52,True
244468,"Cross-Country, Women (Olympic)",,,Adelheid Morath,GER,Cycling Mountain Bike (Cycling),113137,2008,Summer Olympics,18,False
301744,"Lightweight, Freestyle, Women (Olympic)",,,Anshu Malik,IND,Wrestling,143745,2020,Summer Olympics,9,False
140379,"10,000 metres, Men (Olympic)",,,Mariano Haro,ESP,Athletics,67231,1972,Summer Olympics,4,False


# Cleaned Data Stage II

In [16]:
# Read the two parquet files under the clean_data_II prefix of the silver bucket.
imputed_bios_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data_II/cleaned_biodata.parquet",
    storage_options=s3fs_opts,
)
imputed_editions_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data_II/cleaned_editions.parquet",
    storage_options=s3fs_opts,
)

**Imputed Bios Data:**

In [17]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
imputed_bios_df.sample(5)

Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country,Height_Imputed,Weight_Imputed,Born_Country_From_NOC
45142,Competed in Olympic Games,Male,belgium,45491,Paul De Backer,184.0,77.0,1894-11-28,1963-01-09,False,Schaerbeek,Région de Bruxelles-Capitale,BEL,True,True,False
103034,Competed in Olympic Games,Female,united kingdom,104012,Laura Baldwin,170.0,63.0,1980-01-17,NaT,True,London,England,GBR,False,False,False
39750,Competed in Olympic Games,Male,czechoslovakia,40059,Petr Pulkrábek,178.0,74.0,1939-05-04,NaT,True,,,,False,False,True
90015,Competed in Olympic Games,Male,sweden,90739,Pether Markne,175.0,68.0,1962-07-27,NaT,True,"Turinge, Nykvarn",Stockholm,SWE,False,False,False
73570,"Competed in Olympic Games,Administrator",Male,norway,74157,Charles Hoff,182.0,68.0,1902-05-09,1985-02-19,False,"Glemmen, Fredrikstad",Viken,NOR,False,False,False


**Imputed Games Data:**

In [18]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
imputed_editions_df.sample(5)

Unnamed: 0,Year,City,Country,Opened,Closed,Comments,Game_Type,Edition_Name,Competition_Start,Competition_End,Game_Id,Opened_Imputed,Closed_Imputed
0,1859,Athina,GRE,,,,Forerunners to the Olympic Games,,,,1,False,False
15,1928,Sankt Moritz,SUI,1928-02-11,1928-02-17,,Olympic Games,Winter,1928-02-11,1928-02-19,16,False,False
37,1972,München,FRG,1972-08-26,1972-09-11,,Olympic Games,Summer,1972-08-26,1972-09-11,38,False,False
19,1936,Garmisch-Partenkirchen,GER,1936-02-06,1936-02-16,,Olympic Games,Winter,1936-02-06,1936-02-16,20,False,False
62,2014,Nanjing,CHN,2014-08-16,2014-08-28,,Youth Olympic Games,Summer,2014-08-14,2014-08-28,63,False,False


# Missing Values Percentage Comparison

In [19]:
# Per-column missing-value percentages for the bios frames, labelled so the
# two Series can be shown side by side.
bios1 = df_nan_percentage(cleaned_biodata_df).rename("Before Imputing")
bios2 = df_nan_percentage(imputed_bios_df).rename("After Imputing")

# Same summary for the editions (games) frames.
games1 = df_nan_percentage(cleaned_editions_df).rename("Before Imputing")
games2 = df_nan_percentage(imputed_editions_df).rename("After Imputing")

**Bios Data:**

In [20]:
# Put the two Series side by side as columns (axis=1); rows are aligned on the Series index.
pd.concat([bios1, bios2], axis=1)

Unnamed: 0,Before Imputing,After Imputing
Died_Date,76.67%,76.67%
Weight (kg),29.85%,0.0%
Height (cm),26.69%,0.0%
Born_City,22.67%,22.67%
Born_Country,22.67%,4.56%
Born_Region,22.67%,22.67%
Born_Date,1.24%,1.24%
Name,0.0%,0.0%
Sex,0.0%,0.0%
Roles,0.0%,0.0%


**Games Data:**

In [21]:
# Put the two Series side by side as columns (axis=1); rows are aligned on the Series index.
pd.concat([games1, games2], axis=1)


Unnamed: 0,Before Imputing,After Imputing
Comments,93.42%,93.42%
Closed,21.05%,15.79%
Opened,19.74%,14.47%
Competition_End,15.79%,15.79%
Competition_Start,14.47%,14.47%
Year,0.0%,0.0%
City,0.0%,0.0%
Country,0.0%,0.0%
Game_Type,0.0%,0.0%
Edition_Name,0.0%,0.0%


# Data Quality Failure Cases

In [22]:
# Read the four parquet files under the failure_cases prefix of the silver bucket.
bios_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/bios_failure_cases.parquet",
    storage_options=s3fs_opts,
)
affiliations_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/affiliations_failure_cases.parquet",
    storage_options=s3fs_opts,
)
editions_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/editions_failure_cases.parquet",
    storage_options=s3fs_opts,
)
results_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/results_failure_cases.parquet",
    storage_options=s3fs_opts,
)

**Bios Data Quality Failure Cases:**

In [23]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
bios_failure_cases_df.sample(5)

Unnamed: 0,Athlete_Id,Born_Country_From_NOC,Height (cm),Height_Imputed,Is_Alive,NOC,Name,Roles,Sex,Weight (kg),Weight_Imputed,failed_check,Born_Date,Died_Date,Born_City,Born_Region,Born_Country
62,116419,False,183.0,False,True,australia,Damon Kelly,Competed in Olympic Games,Male,154.0,False,height_weight_ratio_invalid,1983-12-01,NaT,Atherton,Queensland,AUS
30,42066,True,168.0,False,True,france,Françoise Decharne,Competed in Olympic Games,Female,42.0,False,height_weight_ratio_invalid,1963-05-20,NaT,,,FRA
75,126756,False,173.0,False,True,united states,Holley Mangold,Competed in Olympic Games,Female,155.0,False,height_weight_ratio_invalid,1989-12-22,NaT,Kettering,Ohio,USA
33,56889,False,185.0,False,False,soviet union,Vasily Alekseyev,"Competed in Olympic Games,Other",Male,160.0,False,height_weight_ratio_invalid,1942-01-07,2011-11-25,Pokrovo-Shishkino,Ryazan,RUS
12,28292,False,165.0,False,True,spain,Marta Baldó,Competed in Olympic Games,Female,40.0,False,height_weight_ratio_invalid,1979-04-08,NaT,Villajoyosa,Alicante,ESP


**Affiliations Data Quality Failure Cases:**

In [24]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
affiliations_failure_cases_df.sample(5)

Unnamed: 0,Affiliation_Club,Affiliation_Id,failed_check,Affiliation_City,Affiliation_Country
2234,Dynamiques de Brebeuf,34671,duplicate_affiliation_with_different_ids,Montreal,
1507,Kelme,28563,duplicate_affiliation_with_different_ids,ESP,
2140,Metz Handball,33805,duplicate_affiliation_with_different_ids,,
1613,Asker SK,29866,duplicate_affiliation_with_different_ids,,
2326,SC im Theresianum,35542,duplicate_affiliation_with_different_ids,,"Vienna, AUT"


**Games Data Quality Failure Cases:**

In [25]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
editions_failure_cases_df.sample(5)

Unnamed: 0,Edition_Name,failed_check,Year,City,Country,Opened,Closed,Comments,Game_Type,Competition_Start,Competition_End,Game_Id,Opened_Imputed,Closed_Imputed
4,,"isin(['Summer', 'Winter', 'Equestrian'])",1906,Athina,GRE,1906-04-22,1906-05-02,,Intercalated Games,1906-04-22,1906-05-02,8,False,False
6,,opened_after_closed,1870,Athina,GRE,,,,Forerunners to the Olympic Games,,,2,False,False
7,,opened_after_closed,1875,Athina,GRE,1875-05-11,1875-05-18,,Forerunners to the Olympic Games,1875-05-11,1875-05-18,3,True,True
5,,opened_after_closed,1859,Athina,GRE,,,,Forerunners to the Olympic Games,,,1,False,False
14,,competition_start_after_end,1906,Athina,GRE,1906-04-22,1906-05-02,,Intercalated Games,1906-04-22,1906-05-02,8,False,False


**Results Data Quality Failure Cases:**

In [26]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
results_failure_cases_df.sample(5)


Unnamed: 0,As,Athlete_Id,Discipline,Event,Game Type,Game Year,Medal,NOC,Position,Tied,failed_check,Team
18,Lu Xiaojun,121984,Weightlifting,"Middleweight, Men (Olympic)",Summer Olympics,2016,Silver,CHN,1,False,Position–Medal mismatch,
20,Daniel Böhm,127807,Biathlon,"4 × 7.5 kilometres Relay, Men (Olympic)",Winter Olympics,2014,Silver,GER,1,False,Position–Medal mismatch,Germany
0,Albert Pettersson,56746,Weightlifting,"Middleweight, Men (Olympic)",Summer Olympics,1920,Bronze,SWE,2,True,Position–Medal mismatch,
6,Mickey Patterson,77934,Athletics,"200 metres, Women (Olympic)",Summer Olympics,1948,Bronze,USA,4,False,Position–Medal mismatch,
11,Christoph Sumann,101195,Biathlon,"4 × 7.5 kilometres Relay, Men (Olympic)",Winter Olympics,2014,Bronze,AUT,2,False,Position–Medal mismatch,Austria


# Final Cleaned Warehouse Model

In [27]:
# Read the five parquet files under the clean_data_final prefix of the gold bucket.
dim_athletes_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_athletes.parquet",
    storage_options=s3fs_opts,
)
dim_affiliations_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_affiliations.parquet",
    storage_options=s3fs_opts,
)
bridge_athletes_affiliations_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/bridge_athletes_affiliations.parquet",
    storage_options=s3fs_opts,
)
dim_games_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_games.parquet",
    storage_options=s3fs_opts,
)
fct_results_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/fct_results.parquet",
    storage_options=s3fs_opts,
)


**Athletes Dimension Table:**

In [28]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
dim_athletes_df.sample(5)

Unnamed: 0,athlete_id,athlete_name,athlete_roles,athlete_sex,athlete_NOC,athlete_height_cm,athlete_weight_kg,athlete_born_date,athlete_died_date,athlete_is_alive,athlete_born_city,athlete_born_region,athlete_born_country,athlete_is_height_imputed,athlete_is_weight_imputed,athlete_is_born_country_from_NOC
81519,82192,Tina Riegel,Competed in Olympic Games,Female,germany,150.0,38.0,1965-08-25,NaT,True,Stuttgart,Baden-Württemberg,GER,False,False,False
126756,129088,Lea Yanitsas,Competed in Olympic Games,Female,australia,173.0,77.0,1989-03-15,NaT,True,Paddington,New South Wales,AUS,False,False,False
4425,4451,Ray Cillien,Competed in Olympic Games,Male,luxembourg,181.0,81.0,1939-06-26,1991-09-22,False,Esch-sur-Alzette,Luxembourg,LUX,False,False,False
12760,12831,Sandro Chikhladze,Competed in Olympic Games,Male,unified team,168.0,65.0,1965-06-20,NaT,True,,,,False,False,True
20079,20222,Armand Schlée,Competed in Olympic Games,Male,switzerland,177.0,73.0,1911-01-01,NaT,True,"Castagnola, Lugano",Ticino,SUI,True,True,False


**Affiliations Dimension Table:**

In [29]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
dim_affiliations_df.sample(5)

Unnamed: 0,affiliation_id,dim_affiliation_club,dim_affiliation_city,dim_affiliation_country
32686,32686,Western Athletics,,
3472,3472,Reitclub Burgdorf,,
37291,37291,Kloten,,
15551,15551,Grenoble Union Club,,
29245,29245,Notts Gymnastics Academy,Nottingham,GBR


**Athletes-Affiliations Bridge Table:**

In [30]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
bridge_athletes_affiliations_df.sample(5)

Unnamed: 0,athlete_id,affiliation_id
16253,15536,0
77291,73881,16927
93156,88816,20529
66571,63863,14383
82206,78459,11913


**Games Dimension Table:**

In [31]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
dim_games_df.sample(5)

Unnamed: 0,game_id,dim_game_type,dim_edition_name,dim_game_year,dim_city,dim_country,dim_opened,dim_closed,dim_competition_start,dim_competition_end,dim_comments,dim_opened_imputed,dim_closed_imputed
58,59,Youth Olympic Games,Summer,2010,Singapore,SGP,2010-08-14,2010-08-26,2010-08-12,2010-08-26,,False,False
26,27,Olympic Games,Summer,1952,Helsinki,FIN,1952-07-19,1952-08-03,1952-07-14,1952-08-03,,False,False
3,4,Forerunners to the Olympic Games,,1889,Athina,GRE,1889-12-01,1889-04-30,1889-12-01,1889-04-30,,True,True
31,32,Olympic Games,Summer,1960,Roma,ITA,1960-08-25,1960-09-11,1960-08-25,1960-09-11,,False,False
5,6,Olympic Games,Summer,1900,Paris,FRA,1900-05-14,1900-10-28,1900-05-14,1900-10-28,,True,True


**Results Fact Table:**

In [32]:
# Spot-check five random rows; sample() is unseeded here, so the rows change on every run.
fct_results_df.sample(5)

Unnamed: 0,athlete_id,dim_noc,dim_discipline,dim_game_type,dim_game_year,dim_event_name,dim_team_name,dim_as,m_tied_flag,m_position,m_medal
106883,48647,JPN,Swimming (Aquatics),Summer Olympics,1988,"100 metres Butterfly, Women (Olympic)",,Takayo Kitano,False,14,
68014,30886,POL,Artistic Gymnastics (Gymnastics),Summer Olympics,1968,"Floor Exercise, Men (Olympic)",,Aleksander Rokosa,True,51,
164311,78586,USA,Athletics,Summer Olympics,1904,"Discus Throw, Handicap, Men (Olympic (non-medal))",,Will Hunter,False,2,
11565,7288,BEL,Basketball (Basketball),Summer Olympics,1936,"Basketball, Men (Olympic)",Belgium,Gustave Vereecken,True,19,
10647,6605,YUG,Basketball (Basketball),Summer Olympics,1972,"Basketball, Men (Olympic)",Yugoslavia,Milun Marović,False,5,
