In [1]:
import os

import pandas as pd

In [2]:
# Connection settings for the S3-compatible object store.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are
# the original local-development values, so existing setups keep working.
s3_endpoint = os.environ.get("S3_ENDPOINT_URL", "http://localhost:9000")

access_key = os.environ.get("S3_ACCESS_KEY", "accesskey")
secret_key = os.environ.get("S3_SECRET_KEY", "secretkey")

# Bucket names interpolated into the s3:// paths of the load cells.
bronze_bucket = "bronze"
silver_bucket = "silver"
gold_bucket = "gold"

# Passed as `storage_options` to every pd.read_parquet call in this notebook.
s3fs_opts = {
    "key": access_key,
    "secret": secret_key,
    "client_kwargs": {"endpoint_url": s3_endpoint},
}




In [3]:
def df_nan_percentage(df):
    """Summarise how much of each column is missing.

    Args:
        df: DataFrame to inspect.

    Returns:
        A Series indexed by column name, ordered from the most-missing column
        to the least-missing one. Each value is a string such as ``"12.5%"``:
        the share of NaN entries in that column, as a percentage rounded to
        two decimal places.
    """

    def _as_percent(value):
        return f"{value}%"

    # Mean of the boolean NaN mask is the NaN fraction; scale it to percent.
    nan_share = (df.isna().mean() * 100).round(2)
    # Sort while the values are still numeric, then convert to display strings.
    ranked = nan_share.sort_values(ascending=False)
    return ranked.map(_as_percent)


# Raw Data Sample

In [4]:
# Load the three raw parquet files stored under raw_data/ in the bronze bucket.
bios_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/biodata.parquet",
    storage_options=s3fs_opts,
)
editions_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/editions.parquet",
    storage_options=s3fs_opts,
)
results_raw_data_df = pd.read_parquet(
    f"s3://{bronze_bucket}/raw_data/results.parquet",
    storage_options=s3fs_opts,
)

**Bios Raw Data:**

In [33]:
# Five randomly drawn raw biography rows (no seed set, so rows vary per run).
bios_raw_data_df.sample(n=5)

Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,Athlete_Id,Affiliations,Nick/petnames,Measurements,Title(s),Nationality,Other names,Original name,Name order
26119,Competed in Olympic Games,Male,Abdullah Mohammad Hussain•Al-Buloshi,Abdullah•Al-Buloshi,16 February 1960 in Madinat al-Kuwait (Kuwait ...,,Kuwait,26312,"Al-Arabi, Madinat al-Kuwait (KUW)",,176 cm / 72 kg,,,,عبدالله محمد حسين•البلوشي,
88159,Competed in Olympic Games,Female,Choi•Hyeon-Jeong,Choi•Hyeon-Jeong,21 July 1981,,Republic of Korea,88867,"Sangmung University, Chungnam",,172 cm / 67 kg,,,Choi Hyun-Jung,최•현정,Oriental
50953,Competed in Olympic Games,Male,"David William ""Dave""•Fairbank",Dave•Fairbank,"19 December 1954 in Sacramento, California (USA)",,United States,51328,"Arden Hills Swim Club, Sacramento (USA)",,193 cm / 72 kg,,,,,
100754,Competed in Olympic Games,Female,Yuliya•Holovina,Yuliya•Holovina,"30 September 1982 in Kharkiv, Kharkiv (UKR)",,Ukraine,101623,"Kolos Kharkiv, Kharkiv (UKR)",,165 cm / 50 kg,,,,Юлія•Головіна,
38632,Competed in Olympic Games,Male,"Paulus Jan ""Paul""•Lotsij",Paul•Lotsij,"4 February 1880 in Dordrecht, Zuid-Holland (NED)","19 September 1910 in Amsterdam, Noord-Holland ...",Netherlands,38937,"Nereus, Amsterdam (NED)",,,,,,,


**Results Raw Data:**

In [34]:

# Five randomly drawn raw result rows (no seed set, so rows vary per run).
results_raw_data_df.sample(n=5)

Unnamed: 0,Games,Event,Team,Pos,Medal,As,NOC,Discipline,Athlete_Id,Nationality,Unnamed: 7
252336,2012 Summer Olympics,"Singles, Men (Olympic)",,=17,,Bojan Tokič,SLO,Table Tennis,117536,,
72810,1948 Summer Olympics,"Parallel Bars, Men (Olympic)",,=92,,Karel Janež,YUG,Artistic Gymnastics (Gymnastics),31499,,
248746,2008 Summer Olympics,"100 metres Backstroke, Women (Olympic)",,47,,Christie Bodden,PAN,Swimming (Aquatics),115673,,
40887,1952 Summer Olympics,"Foil, Team, Men (Olympic)",Great Britain,3 p2 r2/4,,Allan Jay,GBR,Fencing,22153,,
93398,1924 Summer Olympics,"Discus Throw, Men (Olympic)",,31,,António Martins,POR,Athletics,43764,,


**Games Raw Data:**

In [7]:
# First five rows of the raw editions table.
editions_raw_data_df.head(n=5)

Unnamed: 0,#,Year,City,Country,Opened,Closed,Competition,Unnamed: 7,Game_Type,Edition_Name
0,I,1896,Athina,GRE,6 April,15 April,6 – 13 April,,Olympic Games,Summer
1,II,1900,Paris,FRA,,,14 May – 28 October,,Olympic Games,Summer
2,III,1904,St. Louis,USA,14 May,,1 July – 26 November,,Olympic Games,Summer
3,IV,1908,London,GBR,13 July,25 July,27 April – 31 October,,Olympic Games,Summer
4,V,1912,Stockholm,SWE,6 July,15 July,5 May – 27 July,,Olympic Games,Summer


**Scrape Failure Logs:**

In [8]:
# Load the log of athlete ids whose scrape failed, stored in the bronze bucket.
scrape_failures_df = pd.read_parquet(
    f"s3://{bronze_bucket}/scrape_failures/failed_athletes.parquet",
    storage_options=s3fs_opts,
)

In [38]:
# Show up to five random scrape-failure rows. Capping n at the frame length
# keeps this cell from raising ValueError when fewer than five failures were
# logged (including the case of an empty log).
scrape_failures_df.sample(n=min(5, len(scrape_failures_df)))

Unnamed: 0,failed_athlete_id,error_message
4177,149752,Status 404 for athlete 149752
1573,119871,Status 404 for athlete 119871
3518,145611,Status 404 for athlete 145611
2060,124931,Status 404 for athlete 124931
3439,143732,Status 404 for athlete 143732


# Cleaned Data Stage I

In [10]:
# Load the parquet files stored under clean_data/ in the silver bucket.
cleaned_biodata_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_biodata.parquet",
    storage_options=s3fs_opts,
)
cleaned_affiliations_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/dim_affiliation.parquet",
    storage_options=s3fs_opts,
)
bridge_athlete_affiliation_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/bridge_athlete_affiliation.parquet",
    storage_options=s3fs_opts,
)
cleaned_editions_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_editions.parquet",
    storage_options=s3fs_opts,
)
cleaned_results_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data/cleaned_results.parquet",
    storage_options=s3fs_opts,
)

**Cleaned Bios Data:**

In [46]:
# Five randomly drawn rows of the cleaned biography table (unseeded).
cleaned_biodata_df.sample(n=5)

Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country
46034,Competed in Olympic Games,Male,egypt,46387,Said Daw,173.0,71.0,1960-07-22,NaT,True,,,
139845,Competed in Olympic Games,Female,germany,143286,Liane Lippert,168.0,,1998-01-13,NaT,True,Friedrichshafen,Baden-Württemberg,GER
40381,Competed in Olympic Games,Male,united states,40692,John Houser,,,1909-12-02,1991-12-29,False,Philadelphia,Pennsylvania,USA
125115,Competed in Olympic Games,Female,brazil,127410,Fabiana Santos,168.0,75.0,1983-11-03,NaT,True,Santo André,São Paulo,BRA
14882,Competed in Olympic Games,Male,denmark,14982,Per Lyngemark,178.0,68.0,1941-05-23,2010-04-02,False,Frederiksberg,Hovedstaden,DEN


**Cleaned Affiliations Data:**

In [47]:
# Five randomly drawn rows of the cleaned affiliations table (unseeded).
cleaned_affiliations_df.sample(n=5)

Unnamed: 0,Affiliation_Id,Affiliation_Club,Affiliation_City,Affiliation_Country
15776,15776,Brooks Racing Team,,USA
2806,2806,AD Zamora,Zamora,ESP
6067,6067,Valencia CF,Valencia,ESP
26899,26899,Örebro HK,Örebro,SWE
30329,30329,FDJ-BigMat,France,


**Athlete - Affiliation Bridge Data:**

In [57]:
# Five randomly drawn athlete-to-affiliation bridge rows (unseeded).
bridge_athlete_affiliation_df.sample(n=5)

Unnamed: 0,Athlete_Id,Affiliation_Id
99378,94512,12204
2728,2712,865
144511,138096,0
145942,139563,37336
15320,14652,3816


**Cleaned Games Data:**

In [61]:
# Five randomly drawn rows of the cleaned editions table (unseeded).
cleaned_editions_df.sample(n=5)

Unnamed: 0,Year,City,Country,Opened,Closed,Comments,Game_Type,Edition_Name,Competition_Start,Competition_End,Game_Id
46,1988,Calgary,CAN,1988-02-13,1988-02-28,,Olympic Games,Winter,1988-02-13,1988-02-28,47
61,2014,Sochi,RUS,2014-02-07,2014-02-23,,Olympic Games,Winter,2014-02-06,2014-02-23,62
32,1960,Squaw Valley,USA,1960-02-18,1960-02-28,,Olympic Games,Winter,1960-02-19,1960-02-28,33
43,1984,Los Angeles,USA,1984-07-28,1984-08-12,,Olympic Games,Summer,1984-07-06,1984-08-12,44
8,1908,London,GBR,1908-07-13,1908-07-25,,Olympic Games,Summer,1908-04-27,1908-10-31,9


**Cleaned Results Data:**

In [72]:
# Five randomly drawn rows of the cleaned results table (unseeded).
cleaned_results_df.sample(n=5)

Unnamed: 0,Event,Team,Medal,As,NOC,Discipline,Athlete_Id,Game Year,Game Type,Position,Tied
296358,"3-on-3 Ice Hockey, Boys (YOG)",Team Blue,,Simone Terraneo,SUI,3-on-3 Ice Hockey (Ice Hockey),139626,2020,Winter Youth Olympics,8,False
286095,"5,000 metres, Men (Olympic)",,,Caleb Ndiku,KEN,Athletics,134874,2016,Summer Olympics,6,False
240395,"Slalom, Men (Olympic)",,Silver,Reinfried Herbst,AUT,Alpine Skiing (Skiing),111034,2006,Winter Olympics,2,False
204646,"Ice Hockey, Men (Olympic)",Finland,Bronze,Mika Nieminen,FIN,Ice Hockey (Ice Hockey),95832,1998,Winter Olympics,3,False
898,"Sur La Perche À La Herse, Men (Olympic)",,Bronze,"Druart, Jr.",BEL,Archery,323,1900,Summer Olympics,3,False


# Cleaned Data Stage II

In [16]:
# Load the parquet files stored under clean_data_II/ in the silver bucket.
imputed_bios_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data_II/cleaned_biodata.parquet",
    storage_options=s3fs_opts,
)
imputed_editions_df = pd.read_parquet(
    f"s3://{silver_bucket}/clean_data_II/cleaned_editions.parquet",
    storage_options=s3fs_opts,
)

**Imputed Bios Data:**

In [77]:
# Five randomly drawn rows of the stage II biography table (unseeded).
imputed_bios_df.sample(n=5)

Unnamed: 0,Roles,Sex,NOC,Athlete_Id,Name,Height (cm),Weight (kg),Born_Date,Died_Date,Is_Alive,Born_City,Born_Region,Born_Country,Height_Imputed,Weight_Imputed,Born_Country_From_NOC
69916,Competed in Olympic Games,Male,germany,70458,Jens Reimers,203.0,125.0,1941-08-15,2001-11-19,False,Salzwedel,Sachsen-Anhalt,GER,False,False,False
22030,"Competed in Olympic Games,Referee",Male,united kingdom,22187,Arthur Pilbrow,180.0,75.0,1902-05-18,1987-07-16,False,,,GBR,True,True,True
55127,Competed in Olympic Games,Male,egypt,55541,Kamal Mahgoub,170.0,77.0,1921-12-10,2007-02-09,False,Al-Qahira (Cairo),Al-Qahira,EGY,True,True,False
21996,Competed in Olympic Games,Male,united kingdom,22155,Ralph Johnson,175.0,74.0,1948-06-03,NaT,True,London,England,GBR,False,False,False
142834,Competed in Olympic Games,Female,south africa,146397,Kaylene Corbett,170.0,60.0,1999-06-15,NaT,True,,,ZAF,True,True,True


**Imputed Games Data:**

In [81]:
# Five randomly drawn rows of the stage II editions table (unseeded).
imputed_editions_df.sample(n=5)

Unnamed: 0,Year,City,Country,Opened,Closed,Comments,Game_Type,Edition_Name,Competition_Start,Competition_End,Game_Id,Opened_Imputed,Closed_Imputed
7,1906,Athina,GRE,1906-04-22,1906-05-02,,Intercalated Games,,1906-04-22,1906-05-02,8,False,False
49,1994,Lillehammer,NOR,1994-02-12,1994-02-27,,Olympic Games,Winter,1994-02-12,1994-02-27,50,False,False
2,1875,Athina,GRE,1875-05-11,1875-05-18,,Forerunners to the Olympic Games,,1875-05-11,1875-05-18,3,True,True
35,1968,Ciudad de México,MEX,1968-10-12,1968-10-27,,Olympic Games,Summer,1968-10-12,1968-10-27,36,False,False
48,1992,Albertville,FRA,1992-02-08,1992-02-23,,Olympic Games,Winter,1992-02-08,1992-02-23,49,False,False


# Missing Values Percentage Comparison

In [19]:
# Per-column NaN percentages for the biography table, from the clean_data
# frame ("Before Imputing") and the clean_data_II frame ("After Imputing").
bios1 = pd.Series(
    data=df_nan_percentage(cleaned_biodata_df),
    name="Before Imputing",
)
bios2 = pd.Series(
    data=df_nan_percentage(imputed_bios_df),
    name="After Imputing",
)

# Same pair of summaries for the editions (games) table.
games1 = pd.Series(
    data=df_nan_percentage(cleaned_editions_df),
    name="Before Imputing",
)
games2 = pd.Series(
    data=df_nan_percentage(imputed_editions_df),
    name="After Imputing",
)

**Bios Data:**

In [20]:
# Place the two biography NaN summaries side by side, aligned on column name.
pd.concat([bios1, bios2], axis="columns")

Unnamed: 0,Before Imputing,After Imputing
Died_Date,76.67%,76.67%
Weight (kg),29.85%,0.0%
Height (cm),26.69%,0.0%
Born_City,22.67%,22.67%
Born_Country,22.67%,4.56%
Born_Region,22.67%,22.67%
Born_Date,1.24%,1.24%
Name,0.0%,0.0%
Sex,0.0%,0.0%
Roles,0.0%,0.0%


**Games Data:**

In [21]:
# Place the two editions NaN summaries side by side, aligned on column name.
pd.concat([games1, games2], axis="columns")


Unnamed: 0,Before Imputing,After Imputing
Comments,93.42%,93.42%
Closed,21.05%,15.79%
Opened,19.74%,14.47%
Competition_End,15.79%,15.79%
Competition_Start,14.47%,14.47%
Year,0.0%,0.0%
City,0.0%,0.0%
Country,0.0%,0.0%
Game_Type,0.0%,0.0%
Edition_Name,0.0%,0.0%


# Data Quality Failure Cases

In [145]:
# Load the parquet files stored under failure_cases/ in the silver bucket.
bios_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/bios_failure_cases.parquet",
    storage_options=s3fs_opts,
)
affiliations_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/affiliations_failure_cases.parquet",
    storage_options=s3fs_opts,
)
editions_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/editions_failure_cases.parquet",
    storage_options=s3fs_opts,
)
results_failure_cases_df = pd.read_parquet(
    f"s3://{silver_bucket}/failure_cases/results_failure_cases.parquet",
    storage_options=s3fs_opts,
)

**Bios Data Quality Failure Cases:**

In [23]:
# Show up to five random biography failure rows. Capping n at the frame length
# keeps this cell from raising ValueError when fewer than five failures exist
# (including an empty frame).
bios_failure_cases_df.sample(n=min(5, len(bios_failure_cases_df)))

Unnamed: 0,Athlete_Id,Born_Country_From_NOC,Height (cm),Height_Imputed,Is_Alive,NOC,Name,Roles,Sex,Weight (kg),Weight_Imputed,failed_check,Born_Date,Died_Date,Born_City,Born_Region,Born_Country
20,28381,False,161.0,False,True,france,Audrey Grosclaude,Competed in Olympic Games,Female,38.0,False,height_weight_ratio_invalid,1980-09-29,NaT,Tassin-la-Demi-Lune,Rhône,FRA
2,80315,True,168.0,True,True,france,Georges Dubois,Competed in Olympic Games,Male,63.0,True,duplicate_name_birth_date,NaT,NaT,,,FRA
27,33697,True,165.0,False,True,egypt,Sherif El-Digwy,Competed in Olympic Games,Male,130.0,False,height_weight_ratio_invalid,1965-04-16,NaT,,,EGY
53,104638,False,175.0,False,True,nauru,Itte Detenamo,"Competed in Olympic Games,Other",Male,148.0,False,height_weight_ratio_invalid,1986-09-22,NaT,Buada,Buada,NRU
58,107849,False,170.0,False,True,germany,Lisa Ingildeeva,Competed in Olympic Games,Female,42.0,False,height_weight_ratio_invalid,1988-12-04,NaT,Moskva (Moscow),Moskva,RUS


**Affiliations Data Quality Failure Cases:**

In [83]:
# Show up to five random affiliation failure rows. Capping n at the frame
# length keeps this cell from raising ValueError when fewer than five failures
# exist (including an empty frame).
affiliations_failure_cases_df.sample(n=min(5, len(affiliations_failure_cases_df)))

Unnamed: 0,Affiliation_Club,Affiliation_Id,failed_check,Affiliation_City,Affiliation_Country
2556,Ready,36966,duplicate_affiliation_with_different_ids,,
1900,Kerikeri Cruising Club,31934,duplicate_affiliation_with_different_ids,New Zealand,
1817,NSW Arrows,31298,duplicate_affiliation_with_different_ids,"Sydney, NSW, AUS",
1608,Tokyo Biso,29847,duplicate_affiliation_with_different_ids,"Tokyo, Japan",
2024,HPC-Ontario,32957,duplicate_affiliation_with_different_ids,Toronto,CAN


**Games Data Quality Failure Cases:**

In [88]:
# Show up to five random editions failure rows. Capping n at the frame length
# keeps this cell from raising ValueError when fewer than five failures exist
# (including an empty frame).
editions_failure_cases_df.sample(n=min(5, len(editions_failure_cases_df)))

Unnamed: 0,Edition_Name,failed_check,Year,City,Country,Opened,Closed,Comments,Game_Type,Competition_Start,Competition_End,Game_Id,Opened_Imputed,Closed_Imputed
4,,"isin(['Summer', 'Winter', 'Equestrian'])",1906,Athina,GRE,1906-04-22,1906-05-02,,Intercalated Games,1906-04-22,1906-05-02,8,False,False
9,,opened_after_closed,1906,Athina,GRE,1906-04-22,1906-05-02,,Intercalated Games,1906-04-22,1906-05-02,8,False,False
12,,competition_start_after_end,1875,Athina,GRE,1875-05-11,1875-05-18,,Forerunners to the Olympic Games,1875-05-11,1875-05-18,3,True,True
11,,competition_start_after_end,1870,Athina,GRE,,,,Forerunners to the Olympic Games,,,2,False,False
14,,competition_start_after_end,1906,Athina,GRE,1906-04-22,1906-05-02,,Intercalated Games,1906-04-22,1906-05-02,8,False,False


**Results Data Quality Failure Cases:**

In [147]:
# Show up to five random results failure rows. Capping n at the frame length
# keeps this cell from raising ValueError when fewer than five failures exist
# (including an empty frame).
results_failure_cases_df.sample(n=min(5, len(results_failure_cases_df)))


Unnamed: 0,As,Athlete_Id,Discipline,Event,Game Type,Game Year,Medal,NOC,Position,Tied,failed_check,Team
17,Simon Schempp,118583,Biathlon,"4 × 7.5 kilometres Relay, Men (Olympic)",Winter Olympics,2014,Silver,GER,1,False,Position–Medal mismatch,Germany
0,Albert Pettersson,56746,Weightlifting,"Middleweight, Men (Olympic)",Summer Olympics,1920,Bronze,SWE,2,True,Position–Medal mismatch,
5,Charles Lomberg,76283,Athletics,"Decathlon, Men (Olympic)",Summer Olympics,1912,Silver,SWE,3,False,Position–Medal mismatch,
6,Mickey Patterson,77934,Athletics,"200 metres, Women (Olympic)",Summer Olympics,1948,Bronze,USA,4,False,Position–Medal mismatch,
11,Christoph Sumann,101195,Biathlon,"4 × 7.5 kilometres Relay, Men (Olympic)",Winter Olympics,2014,Bronze,AUT,2,False,Position–Medal mismatch,Austria


# Final Cleaned Warehouse Model

In [27]:
# Load the parquet files stored under clean_data_final/ in the gold bucket.
dim_athletes_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_athletes.parquet",
    storage_options=s3fs_opts,
)
dim_affiliations_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_affiliations.parquet",
    storage_options=s3fs_opts,
)
bridge_athletes_affiliations_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/bridge_athletes_affiliations.parquet",
    storage_options=s3fs_opts,
)
dim_games_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/dim_games.parquet",
    storage_options=s3fs_opts,
)
fct_results_df = pd.read_parquet(
    f"s3://{gold_bucket}/clean_data_final/fct_results.parquet",
    storage_options=s3fs_opts,
)


**Athletes Dimension Table:**

In [28]:
# Five randomly drawn rows of the athletes dimension table (unseeded).
dim_athletes_df.sample(n=5)

Unnamed: 0,athlete_id,athlete_name,athlete_roles,athlete_sex,athlete_NOC,athlete_height_cm,athlete_weight_kg,athlete_born_date,athlete_died_date,athlete_is_alive,athlete_born_city,athlete_born_region,athlete_born_country,athlete_is_height_imputed,athlete_is_weight_imputed,athlete_is_born_country_from_NOC
29990,30216,Bernd Jäger,Competed in Olympic Games,Male,germany,160.0,59.0,1951-11-18,NaT,True,Kahla,Thüringen,GER,False,False,False
97765,98560,Al Van,Competed in Olympic Games,Male,united states,181.0,84.0,1915-03-30,1995-08-27,False,Newport,Minnesota,USA,True,True,False
121630,123581,Dan Martin,Competed in Olympic Games,Male,ireland,176.0,63.0,1986-08-20,NaT,True,Birmingham,England,GBR,False,False,False
46377,46729,Laurence Guillou,Competed in Olympic Games,Female,france,184.0,68.0,1969-10-01,NaT,True,Bordeaux,Gironde,FRA,False,False,False
79488,80121,P. Lomvardos,Competed in Intercalated Games,Male,greece,188.0,85.0,NaT,NaT,True,,,GRC,True,True,True


**Affiliations Dimension Table:**

In [159]:
# Five randomly drawn rows of the affiliations dimension table (unseeded).
dim_affiliations_df.sample(n=5)

Unnamed: 0,affiliation_id,dim_affiliation_club,dim_affiliation_city,dim_affiliation_country
28988,28988,Centre africain de lutte féminine,Tunis,TUN
5381,5381,Plon Skoroszyce,Skoroszyce,POL
10493,10493,Columbus Revolver Club,Columbus,USA
17086,17086,Unia Krywałd,,
1010,1010,LG Twins,Seoul,KOR


**Athletes-Affiliations Bridge Table:**

In [156]:
# Five randomly drawn rows of the athletes-affiliations bridge table (unseeded).
bridge_athletes_affiliations_df.sample(n=5)

Unnamed: 0,athlete_id,affiliation_id
42391,40739,6033
82997,79186,6040
106647,101261,23887
111519,105728,8737
150489,144348,0


**Games Dimension Table:**

In [148]:
# Five randomly drawn rows of the games dimension table (unseeded).
dim_games_df.sample(n=5)

Unnamed: 0,game_id,dim_game_type,dim_edition_name,dim_game_year,dim_city,dim_country,dim_opened,dim_closed,dim_competition_start,dim_competition_end,dim_comments,dim_opened_imputed,dim_closed_imputed
28,29,Olympic Games,Equestrian,1956,Stockholm,SWE,1956-06-10,1956-06-17,1956-06-11,1956-06-17,,False,False
62,63,Youth Olympic Games,Summer,2014,Nanjing,CHN,2014-08-16,2014-08-28,2014-08-14,2014-08-28,,False,False
16,17,Olympic Games,Summer,1932,Los Angeles,USA,1932-07-30,1932-08-14,1932-07-30,1932-08-14,,False,False
14,15,Olympic Games,Summer,1928,Amsterdam,NED,1928-07-28,1928-08-12,1928-05-17,1928-08-12,,False,False
17,18,Olympic Games,Winter,1932,Lake Placid,USA,1932-02-04,1932-02-13,1932-02-04,1932-02-15,,False,False


**Results Fact Table:**

In [32]:
# Five randomly drawn rows of the results fact table (unseeded).
fct_results_df.sample(n=5)

Unnamed: 0,athlete_id,dim_noc,dim_discipline,dim_game_type,dim_game_year,dim_event_name,dim_team_name,dim_as,m_tied_flag,m_position,m_medal
28751,15811,NED,Cycling Road (Cycling),Summer Olympics,1924,"Team Pursuit, 4,000 metres, Men (Olympic)",Netherlands,Jan Maas,True,7,
272150,127442,CAN,Cross Country Skiing (Skiing),Winter Olympics,2018,"4 × 5 kilometres Relay, Women (Olympic)",Canada,Emily Nishikawa,False,13,
290640,137259,GER,Bobsleigh (Bobsleigh),Winter Olympics,2018,"Four, Open (Olympic)",Germany 3,Eric Franke,True,2,Silver
15271,9839,GDR,Canoe Sprint (Canoeing),Summer Olympics,1980,"Canadian Doubles, 1,000 metres, Men (Olympic)",Olaf Heukrodt,Uwe Madeja,False,2,Silver
281321,131806,SWE,Swimming (Aquatics),Summer Youth Olympics,2010,"100 metres Backstroke, Girls (YOG)",,Ida Lindborg,False,6,
