In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## ---------------------
## Extract Stage
## ---------------------

### Validate data
- Data should be consistent.

Check if the EntriesGender table is consistent.

In [2]:
# Compare the number of athletes in EntriesGender and Athletes tables. 
# Choose several disciplines to check on. 

# Discpilines - Cycling BMX Racing, Rugby Sevens
discp1 = "Cycling BMX Racing"
discp2 = "Rugby Sevens"
# Compare numbers in athletes and genders
athl_path = "../Resources/Athletes.csv"
athl = pd.read_csv(athl_path)
gen_path = "../Resources/EntriesGender.csv"
gen = pd.read_csv(gen_path)

a_num = len(athl.loc[athl['Discipline'] == discp1])
g_num = gen.loc[gen['Discipline'] == discp1]['Total'].values[0]

a1_num = len(athl.loc[athl['Discipline'] == discp2])
g1_num = gen.loc[gen['Discipline'] == discp2]['Total'].values[0]

print(f"Number of athletes for {discp1} from Athletes: {a_num}\n"
        f"Number of athletes for {discp1} from EntriesGender: {g_num}")

print(f"Number of athletes for {discp2} from Athletes: {a1_num}\n"
        f"Number of athletes for {discp2} from EntriesGender: {g1_num}")

Number of athletes for Cycling BMX Racing from Athletes: 43
Number of athletes for Cycling BMX Racing from EntriesGender: 48
Number of athletes for Rugby Sevens from Athletes: 283
Number of athletes for Rugby Sevens from EntriesGender: 297


* Result of the Analysis:
* Table EntriesGender should be removed

----------------------------------------------------------------------------------------
- Keep data that have values in the expected ranges.
We want to analyze countries that won at least one Gold Medal, and that are ranked
within 20 first countries in Total Rank of Medals.

Data was analyzed via AWS services.
Initial Data Sets were uploaded into AWS S3,
Athena tables were created via AWS Glue Crawler.
Results of SQL queries in Athena loaded into:
* athena_query_medals.csv
* athena_query_athletes.csv
* athena_query_coaches.csv
* athena_query_teams.csv

## ---------------------
## Transform Stage
## ---------------------

### Business Rules
	
	- Each field should contain a single value.

	- Each Table should have unique id column.

	- All columns in table should be dependent on unique id column.

	- Values should be capitalized.

In [3]:
# Load extracted via sql athena data about medals
medals_path = "Resources/athena_query_medals.csv"
medals = pd.read_csv(medals_path)
medals

Unnamed: 0,noc,gold,total,rank,rank_by_total
0,United States of America,39,113,1,1
1,People's Republic of China,38,88,2,2
2,Japan,27,58,3,5
3,Great Britain,22,65,4,4
4,ROC,20,71,5,3
5,Australia,17,46,6,6
6,Netherlands,10,36,7,9
7,France,10,33,8,10
8,Germany,10,37,9,8
9,Italy,10,40,10,7


### Add new table Countries

In [4]:
# Get names of countries 
noc = medals["noc"].values
noc.sort()
print(noc)

['Australia' 'Brazil' 'Canada' 'Cuba' 'France' 'Germany' 'Great Britain'
 'Hungary' 'Italy' 'Japan' 'Netherlands' 'New Zealand'
 "People's Republic of China" 'Poland' 'ROC' 'Republic of Korea' 'Spain'
 'Switzerland' 'Turkey' 'Ukraine' 'United States of America']


In [5]:
# Create Data Frame for new table Countries with two columns - country_id, country_name
countries = pd.DataFrame({
    "country_id":medals.index,
    "country_name":noc
})
countries.set_index("country_id", inplace = True)
countries

Unnamed: 0_level_0,country_name
country_id,Unnamed: 1_level_1
0,Australia
1,Brazil
2,Canada
3,Cuba
4,France
5,Germany
6,Great Britain
7,Hungary
8,Italy
9,Japan


In [6]:
# Export data for table Countries to csv
countries.to_csv("../Output/countries.csv")

### Transform table Medals

In [7]:
# Change column noc values to country_id values
for index, row in countries.iterrows():
    medals.loc[medals['noc']==row["country_name"], 'noc'] = index
medals.head()

Unnamed: 0,noc,gold,total,rank,rank_by_total
0,0,39,113,1,1
1,1,38,88,2,2
2,2,27,58,3,5
3,3,22,65,4,4
4,4,20,71,5,3


In [8]:
# Rename columns and set index
medals.rename(columns={'noc':'country_id'}, inplace = True)
medals.set_index("country_id", inplace=True)
medals.head()

Unnamed: 0_level_0,gold,total,rank,rank_by_total
country_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,39,113,1,1
1,38,88,2,2
2,27,58,3,5
3,22,65,4,4
4,20,71,5,3


In [9]:
# Export transformed Medals to csv
medals.to_csv("../Output/medals.csv")

### Add new table Disciplines

In [10]:
# Load extracted via sql athena data about athletes - that correspond to countries of interest
athl_path = "Resources/athena_query_athletes.csv"
athletes = pd.read_csv(athl_path)
athletes.head()

Unnamed: 0,name,noc,discipline
0,ABAD Nestor,Spain,Artistic Gymnastics
1,ABAGNALE Giovanni,Italy,Rowing
2,ABALDE Alberto,Spain,Basketball
3,ABALDE Tamara,Spain,Basketball
4,ABALO Luc,France,Handball


In [11]:
sports = athletes['discipline'].unique()
sports.sort()
print(sports)

['3x3 Basketball' 'Archery' 'Artistic Gymnastics' 'Artistic Swimming'
 'Athletics' 'Badminton' 'Baseball/Softball' 'Basketball'
 'Beach Volleyball' 'Boxing' 'Canoe Slalom' 'Canoe Sprint'
 'Cycling BMX Freestyle' 'Cycling BMX Racing' 'Cycling Mountain Bike'
 'Cycling Road' 'Cycling Track' 'Diving' 'Equestrian' 'Fencing' 'Football'
 'Golf' 'Handball' 'Hockey' 'Judo' 'Karate' 'Marathon Swimming'
 'Modern Pentathlon' 'Rhythmic Gymnastics' 'Rowing' 'Rugby Sevens'
 'Sailing' 'Shooting' 'Skateboarding' 'Sport Climbing' 'Surfing'
 'Swimming' 'Table Tennis' 'Taekwondo' 'Tennis' 'Trampoline Gymnastics'
 'Triathlon' 'Volleyball' 'Water Polo' 'Weightlifting' 'Wrestling']


In [12]:
# Create Data Frame for Disciplines with two columns - discipline_id, discipline_name
disciplines = pd.DataFrame({
    "discipline_id":np.arange(len(sports)),
    "discipline_name":sports
})
disciplines.set_index("discipline_id", inplace = True)
disciplines

Unnamed: 0_level_0,discipline_name
discipline_id,Unnamed: 1_level_1
0,3x3 Basketball
1,Archery
2,Artistic Gymnastics
3,Artistic Swimming
4,Athletics
5,Badminton
6,Baseball/Softball
7,Basketball
8,Beach Volleyball
9,Boxing


In [13]:
# Export Disciplines to csv
disciplines.to_csv("../Output/disciplines.csv")

### Transform table Athletes

In [14]:
# Apply BR: Each cell should contain a single value

# Using Pandas split Name column values into two separated columns
columnsplit = athletes['name'].str.split(" ", expand=True)
athletes = athletes.assign(last_name=columnsplit[0],first_name=columnsplit[1])
athletes

Unnamed: 0,name,noc,discipline,last_name,first_name
0,ABAD Nestor,Spain,Artistic Gymnastics,ABAD,Nestor
1,ABAGNALE Giovanni,Italy,Rowing,ABAGNALE,Giovanni
2,ABALDE Alberto,Spain,Basketball,ABALDE,Alberto
3,ABALDE Tamara,Spain,Basketball,ABALDE,Tamara
4,ABALO Luc,France,Handball,ABALO,Luc
...,...,...,...,...,...
6354,ZWICKER Martin Detlef,Germany,Hockey,ZWICKER,Martin
6355,ZWOLINSKA Klaudia,Poland,Canoe Slalom,ZWOLINSKA,Klaudia
6356,ZYKOVA Yulia,ROC,Shooting,ZYKOVA,Yulia
6357,ZYUZINA Ekaterina,ROC,Sailing,ZYUZINA,Ekaterina


In [15]:
# Drop name column and Reorder columns
athletes = athletes.drop(['name'], axis=1)
athletes = athletes[["last_name","first_name", "noc", "discipline"]]
athletes.head()

Unnamed: 0,last_name,first_name,noc,discipline
0,ABAD,Nestor,Spain,Artistic Gymnastics
1,ABAGNALE,Giovanni,Italy,Rowing
2,ABALDE,Alberto,Spain,Basketball
3,ABALDE,Tamara,Spain,Basketball
4,ABALO,Luc,France,Handball


In [17]:
# Apply BR: Values in cells should be Capitalized
athletes['last_name'] = athletes['last_name'].str.capitalize()
athletes['first_name'] = athletes['first_name'].str.capitalize()
athletes.head()

Unnamed: 0,last_name,first_name,noc,discipline
0,Abad,Nestor,Spain,Artistic Gymnastics
1,Abagnale,Giovanni,Italy,Rowing
2,Abalde,Alberto,Spain,Basketball
3,Abalde,Tamara,Spain,Basketball
4,Abalo,Luc,France,Handball


In [18]:
# Change column noc values to country_id values
for index, row in countries.iterrows():
    athletes.loc[athletes['noc']==row["country_name"], 'noc'] = index
athletes.head()

Unnamed: 0,last_name,first_name,noc,discipline
0,Abad,Nestor,16,Artistic Gymnastics
1,Abagnale,Giovanni,8,Rowing
2,Abalde,Alberto,16,Basketball
3,Abalde,Tamara,16,Basketball
4,Abalo,Luc,4,Handball


In [19]:
# Change values in column discipline to discipline_id values
for index, row in disciplines.iterrows():
    athletes.loc[athletes['discipline']==row["discipline_name"], 'discipline'] = index
athletes.head()

Unnamed: 0,last_name,first_name,noc,discipline
0,Abad,Nestor,16,2
1,Abagnale,Giovanni,8,29
2,Abalde,Alberto,16,7
3,Abalde,Tamara,16,7
4,Abalo,Luc,4,22


In [20]:
# Rename columns and name index
athletes.rename(columns={'noc':'country_id', 'discipline':'discipline_id'}, inplace = True)
athletes.index.name='athlete_id'
athletes.head()

Unnamed: 0_level_0,last_name,first_name,country_id,discipline_id
athlete_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Abad,Nestor,16,2
1,Abagnale,Giovanni,8,29
2,Abalde,Alberto,16,7
3,Abalde,Tamara,16,7
4,Abalo,Luc,4,22


In [21]:
# Export transformed Athletes to csv
athletes.to_csv("../Output/athletes.csv")

### Create table Events

In [69]:
# Load extracted via sql athena data about teams
team_path = "Resources/athena_query_teams.csv"
teams = pd.read_csv(team_path)
teams.head()

Unnamed: 0,name,discipline,noc,event
0,China,3x3 Basketball,People's Republic of China,Men
1,China,3x3 Basketball,People's Republic of China,Women
2,France,3x3 Basketball,France,Women
3,Italy,3x3 Basketball,Italy,Women
4,Japan,3x3 Basketball,Japan,Men


In [23]:
# From Teams get the list of events
sport_type = teams['event'].unique()
sport_type.sort()
print(sport_type)

['4 x 400m Relay Mixed' 'Baseball' 'Duet' 'Group All-Around' 'Men'
 "Men's 4 x 100m Freestyle Relay" "Men's 4 x 100m Medley Relay"
 "Men's 4 x 100m Relay" "Men's 4 x 200m Freestyle Relay"
 "Men's 4 x 400m Relay" "Men's Foil Team" "Men's Madison"
 "Men's Sabre Team" "Men's Team" "Men's Team Pursuit" "Men's Team Sprint"
 "Men's Épée Team" 'Mixed 4 x 100m Medley Relay' 'Mixed Doubles'
 'Mixed Relay' 'Mixed Team' 'Softball' 'Team' 'Women'
 "Women's 4 x 100m Freestyle Relay" "Women's 4 x 100m Medley Relay"
 "Women's 4 x 100m Relay" "Women's 4 x 200m Freestyle Relay"
 "Women's 4 x 400m Relay" "Women's Foil Team" "Women's Madison"
 "Women's Sabre Team" "Women's Team" "Women's Team Pursuit"
 "Women's Team Sprint" "Women's Épée Team"]


In [25]:
# Create Data Frame for new table Events with two columns - event_id, event_name
events = pd.DataFrame({
    "event_id":np.arange(len(sport_type)),
    "event_name":sport_type
})
events.set_index("event_id", inplace = True)
events.head()

Unnamed: 0_level_0,event_name
event_id,Unnamed: 1_level_1
0,4 x 400m Relay Mixed
1,Baseball
2,Duet
3,Group All-Around
4,Men


In [26]:
# Export cleaned data to csv
events.to_csv("../Output/events.csv")

### Transform table Teams

In [27]:
# Check if we have duplicate events but different name for it,
# e.g. Men and Men's Team are the same? if they are, we will remove duplicates
teams.loc[teams["noc"]=='Australia']

Unnamed: 0,name,discipline,noc,event
11,Australia,Archery,Australia,Men's Team
12,Australia,Archery,Australia,Mixed Team
67,Australia,Artistic Swimming,Australia,Duet
68,Australia,Artistic Swimming,Australia,Team
87,Australia,Athletics,Australia,Women's 4 x 400m Relay
138,Australia,Baseball/Softball,Australia,Softball
146,Australia,Basketball,Australia,Men
147,Australia,Basketball,Australia,Women
161,McHugh/Schumann,Beach Volleyball,Australia,Men
162,Artacho Del Solar/Clancy,Beach Volleyball,Australia,Women


* These are just different names of events, they are not duplicates. So we don't have duplicates events to remove.

In [70]:
# Apply BR: Values in cells should be Capitalized

for i, row in teams.loc[teams['name'].str.contains(' / ')].iterrows():
    ready = ""
    for val in row['name'].split():       
        ready = ready + " " + val.capitalize()
    teams.iloc[i, 0] = ready
    print(row['name'])
teams.loc[teams['name'].str.contains(' / ')]

HU Heming / TAPPER Melissa
WANG Zhen / ZHANG Mo
XU Xin / LIU Shiwen
CAMPOS Jorge / FONSECA Daniela
LEBESSON Emmanuel / YUAN Jia Nan
FRANZISKA Patrick / SOLJA Petrissa
SZUDI Adam / PERGEL Szandra
MIZUTANI Jun / ITO Mima
LEE Sangsu / JEON Jihee


Unnamed: 0,name,discipline,noc,event
457,Hu Heming / Tapper Melissa,Table Tennis,Australia,Mixed Doubles
461,Wang Zhen / Zhang Mo,Table Tennis,Canada,Mixed Doubles
463,Xu Xin / Liu Shiwen,Table Tennis,People's Republic of China,Mixed Doubles
465,Campos Jorge / Fonseca Daniela,Table Tennis,Cuba,Mixed Doubles
467,Lebesson Emmanuel / Yuan Jia Nan,Table Tennis,France,Mixed Doubles
470,Franziska Patrick / Solja Petrissa,Table Tennis,Germany,Mixed Doubles
472,Szudi Adam / Pergel Szandra,Table Tennis,Hungary,Mixed Doubles
475,Mizutani Jun / Ito Mima,Table Tennis,Japan,Mixed Doubles
479,Lee Sangsu / Jeon Jihee,Table Tennis,Republic of Korea,Mixed Doubles


In [71]:
# Change column noc values to country_id values
for index, row in countries.iterrows():
    teams.loc[teams['noc']==row["country_name"], 'noc'] = index
teams.head()

Unnamed: 0,name,discipline,noc,event
0,China,3x3 Basketball,12,Men
1,China,3x3 Basketball,12,Women
2,France,3x3 Basketball,4,Women
3,Italy,3x3 Basketball,8,Women
4,Japan,3x3 Basketball,9,Men


In [72]:
# Change column discipline values to discipline_id values
for index, row in disciplines.iterrows():
    teams.loc[teams['discipline']==row["discipline_name"], 'discipline'] = index
teams.head()

Unnamed: 0,name,discipline,noc,event
0,China,0,12,Men
1,China,0,12,Women
2,France,0,4,Women
3,Italy,0,8,Women
4,Japan,0,9,Men


In [73]:
# Change column event values to event_id values
for index, row in events.iterrows():
    teams.loc[teams['event']==row["event_name"], 'event'] = index
teams.head()

Unnamed: 0,name,discipline,noc,event
0,China,0,12,4
1,China,0,12,23
2,France,0,4,23
3,Italy,0,8,23
4,Japan,0,9,4


In [74]:
# Rename columns and set index
teams.rename(columns={'discipline':'discipline_id', 'noc':'country_id', 'event':'event_id'}, inplace = True)
teams.index.name = "team_id"
teams.head()

Unnamed: 0_level_0,name,discipline_id,country_id,event_id
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,China,0,12,4
1,China,0,12,23
2,France,0,4,23
3,Italy,0,8,23
4,Japan,0,9,4


In [75]:
# Export transformed Teams to csv
teams.to_csv("../Output/teams.csv")

### Transform table Coaches

In [76]:
# Load extracted via sql athena data about coaches with business rules applied:
# allowed countries and allowed disciplines
coach_path = "Resources/athena_query_coaches.csv"
coaches = pd.read_csv(coach_path)
coaches

Unnamed: 0,name,noc,discipline,event
0,ABE Junya,Japan,Volleyball,
1,ABE Katsuhiko,Japan,Basketball,
2,AGEBA Yuya,Japan,Volleyball,
3,AIKMAN Siegfried Gottlieb,Japan,Hockey,Men
4,AL SAADI Kais,Germany,Hockey,Men
...,...,...,...,...
232,YURKIN Sergey,ROC,Volleyball,
233,ZAITSEVA Olesia,Ukraine,Artistic Swimming,Duet
234,ZAMORA PEDREIRA Javier,Spain,Basketball,
235,ZHANG Xiaohuan,People's Republic of China,Artistic Swimming,


In [77]:
# Apply BR: Each cell should contain a single value

# Using Pandas split Name column values into two separated columns
columnsplit = coaches['name'].str.split(" ", expand=True)
coaches = coaches.assign(last_name=columnsplit[0],first_name=columnsplit[1])

# Drop Name column and Reorder columns
coaches = coaches.drop(['name'], axis=1)
coaches = coaches[["last_name","first_name", "noc", "discipline", "event"]]
coaches.head()

Unnamed: 0,last_name,first_name,noc,discipline,event
0,ABE,Junya,Japan,Volleyball,
1,ABE,Katsuhiko,Japan,Basketball,
2,AGEBA,Yuya,Japan,Volleyball,
3,AIKMAN,Siegfried,Japan,Hockey,Men
4,AL,SAADI,Germany,Hockey,Men


In [78]:
# Apply BR: Values in cells should be Capitalized
coaches['last_name'] = coaches['last_name'].str.capitalize()
coaches['first_name'] = coaches['first_name'].str.capitalize()
coaches

Unnamed: 0,last_name,first_name,noc,discipline,event
0,Abe,Junya,Japan,Volleyball,
1,Abe,Katsuhiko,Japan,Basketball,
2,Ageba,Yuya,Japan,Volleyball,
3,Aikman,Siegfried,Japan,Hockey,Men
4,Al,Saadi,Germany,Hockey,Men
...,...,...,...,...,...
232,Yurkin,Sergey,ROC,Volleyball,
233,Zaitseva,Olesia,Ukraine,Artistic Swimming,Duet
234,Zamora,Pedreira,Spain,Basketball,
235,Zhang,Xiaohuan,People's Republic of China,Artistic Swimming,


In [79]:
# Change column noc values to country_id values
for index, row in countries.iterrows():
    coaches.loc[coaches['noc']==row["country_name"], 'noc'] = index
coaches.head()

Unnamed: 0,last_name,first_name,noc,discipline,event
0,Abe,Junya,9,Volleyball,
1,Abe,Katsuhiko,9,Basketball,
2,Ageba,Yuya,9,Volleyball,
3,Aikman,Siegfried,9,Hockey,Men
4,Al,Saadi,5,Hockey,Men
...,...,...,...,...,...
232,Yurkin,Sergey,14,Volleyball,
233,Zaitseva,Olesia,19,Artistic Swimming,Duet
234,Zamora,Pedreira,16,Basketball,
235,Zhang,Xiaohuan,12,Artistic Swimming,


In [80]:
# Change column discipline values to discipline_id values
for index, row in disciplines.iterrows():
    coaches.loc[coaches['discipline']==row["discipline_name"], 'discipline'] = index
coaches.head()

Unnamed: 0,last_name,first_name,noc,discipline,event
0,Abe,Junya,9,42,
1,Abe,Katsuhiko,9,7,
2,Ageba,Yuya,9,42,
3,Aikman,Siegfried,9,23,Men
4,Al,Saadi,5,23,Men


In [81]:
# Change column event values to event_id values
for index, row in events.iterrows():
    coaches.loc[coaches['event']==row["event_name"], 'event'] = index
coaches.head()

Unnamed: 0,last_name,first_name,noc,discipline,event
0,Abe,Junya,9,42,
1,Abe,Katsuhiko,9,7,
2,Ageba,Yuya,9,42,
3,Aikman,Siegfried,9,23,4.0
4,Al,Saadi,5,23,4.0


In [82]:
# Rename columns and name index
coaches.rename(columns={'discipline':'discipline_id', 'noc':'country_id', 'event':'event_id'}, inplace = True)
coaches.index.name='coach_id'
coaches

Unnamed: 0_level_0,last_name,first_name,country_id,discipline_id,event_id
coach_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Abe,Junya,9,42,
1,Abe,Katsuhiko,9,7,
2,Ageba,Yuya,9,42,
3,Aikman,Siegfried,9,23,4
4,Al,Saadi,5,23,4
...,...,...,...,...,...
232,Yurkin,Sergey,14,42,
233,Zaitseva,Olesia,19,3,2
234,Zamora,Pedreira,16,7,
235,Zhang,Xiaohuan,12,3,


In [83]:
# Export transformed Coaches to csv
coaches.to_csv("../Output/coaches.csv")