In [1]:
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

import config

## Extract CSVs into DataFrames

In [2]:
# Path to the raw country socioeconomic CSV, relative to the notebook
country_socioeconomic_file = "Resources/country_profile_variables.csv"

# Load the CSV, treating the "-99" placeholder as a missing value (NaN)
country_socioeconomic_df = pd.read_csv(
    country_socioeconomic_file,
    na_values=["-99"],
)

# Preview the first rows
country_socioeconomic_df.head()

Unnamed: 0,country,Region,Population in thousands (2017),"Population density (per km2, 2017)",GDP: Gross domestic product (million current US$),GDP per capita (current US$)
0,Afghanistan,SouthernAsia,35530,54.4,20270.0,623.2
1,Albania,SouthernEurope,2930,106.9,11541.0,3984.2
2,Algeria,NorthernAfrica,41318,17.3,164779.0,4154.1
3,American Samoa,Polynesia,56,278.2,,
4,Andorra,SouthernEurope,77,163.8,2812.0,39896.4


In [3]:
# Path to the raw per-country Olympic medals CSV, relative to the notebook
country_olympics_file = "Resources/olympics_medals_country_wise.csv"

# Load the CSV with pandas' default parsing options
country_olympics_df = pd.read_csv(
    country_olympics_file,
)

# Preview the first rows
country_olympics_df.head()

Unnamed: 0,country,summer_total,winter_total,total_participation,total_won
0,Afghanistan,2,0,15,2
1,Algeria,17,0,17,17
2,Argentina,77,0,45,77
3,Armenia,18,0,15,18
4,Australasia,12,0,2,12


## Transform country_socioeconomic DataFrame

In [4]:
# Columns carried forward from the raw socioeconomic data
country_socioeconomic_cols = [
    "country",
    "Region",
    "Population in thousands (2017)",
    "GDP: Gross domestic product (million current US$)",
]

# Select those columns, shorten the headers, and keep the first row for each
# country. Rows with missing values are kept (no dropna is applied here).
country_socioeconomic_transformed = (
    country_socioeconomic_df[country_socioeconomic_cols]
    .copy()
    .rename(
        columns={
            "Region": "region",
            "Population in thousands (2017)": "population",
            "GDP: Gross domestic product (million current US$)": "gdp",
        }
    )
    .drop_duplicates("country")
)

# Preview the transformed frame
country_socioeconomic_transformed.head()

Unnamed: 0,country,region,population,gdp
0,Afghanistan,SouthernAsia,35530,20270.0
1,Albania,SouthernEurope,2930,11541.0
2,Algeria,NorthernAfrica,41318,164779.0
3,American Samoa,Polynesia,56,
4,Andorra,SouthernEurope,77,2812.0


## Transform country_olympics DataFrame

In [5]:
# Columns carried forward from the raw Olympic medals data
country_olympics_cols = ["country", "summer_total", "winter_total", "total_participation", "total_won"]

# Take an explicit copy so later steps never operate on a slice of the raw frame
country_olympics_transformed = country_olympics_df[country_olympics_cols].copy()

# Drop rows with any missing value, then keep the first row for each country.
# The calls are chained (no inplace=True) and end with an explicit copy so the
# result is an independent frame: modifying it later does not raise
# SettingWithCopyWarning, and re-running this cell always gives the same output.
new_country_olympics_transformed = (
    country_olympics_transformed
    .dropna(how="any")
    .drop_duplicates("country")
    .copy()
)

# Preview the cleaned frame
new_country_olympics_transformed.head()

Unnamed: 0,country,summer_total,winter_total,total_participation,total_won
0,Afghanistan,2,0,15,2
1,Algeria,17,0,17,17
2,Argentina,77,0,45,77
3,Armenia,18,0,15,18
4,Australasia,12,0,2,12


In [6]:
# Rename countries in the Olympics data so their names match the ones used in
# country_socioeconomic_transformed; the later SQL join matches on this column.
# A single mapping is applied in one pass; names not listed are left unchanged.
# NOTE(review): the Hong Kong target previously contained literal double quotes
# ('"China, Hong Kong SAR"'). pd.read_csv strips CSV quoting, so the
# socioeconomic name is presumably stored without them -- confirm against
# country_socioeconomic_transformed['country'].
new_country_olympics_transformed = new_country_olympics_transformed.assign(
    country=new_country_olympics_transformed['country'].replace({
        'United States': 'United States of America',
        'Russia': 'Russian Federation',
        'Great Britain': 'United Kingdom',
        'North Korea': "Democratic People's Republic of Korea",
        'South Korea': 'Republic of Korea',
        'Czech Republic': 'Czechia',
        'Hong Kong': 'China, Hong Kong SAR',
        'North Macedonia': 'The former Yugoslav Republic of Macedonia',
    })
)


## Connect to local database

In [7]:
# Connection settings for the local PostgreSQL database. The credentials are
# read from the local `config` module rather than written into the notebook.
protocol = 'postgresql'
username = config.Username
password = config.Password
host = 'localhost'
port = 5432
database_name = 'olympics'

# Percent-encode the credentials before placing them in the URL: characters
# such as '@', ':', '/' or '%' in a username or password would otherwise be
# read as URL delimiters and corrupt the connection string.
# The string contains the password, so do not display or log it.
rds_connection_string = (
    f'{protocol}://{quote(str(username), safe="")}:{quote(str(password), safe="")}'
    f'@{host}:{port}/{database_name}'
)

# Create the SQLAlchemy engine and an inspector used to list the tables
engine = create_engine(rds_connection_string)
insp = inspect(engine)

## Inspect Tables

In [8]:
# List the table names the inspector reports for the connected database,
# to check that the target tables exist before loading
insp.get_table_names()

['country_socioeconomic', 'country_olympics']

## Load

In [9]:
# Insert the transformed socioeconomic rows into the existing table.
# if_exists='append' adds to whatever the table already holds, so re-running
# this cell inserts the same rows again; index=False omits the DataFrame index.
country_socioeconomic_transformed.to_sql(
    name='country_socioeconomic',
    con=engine,
    if_exists='append',
    index=False,
)

229

In [10]:
# Insert the cleaned Olympics rows into the existing table.
# if_exists='append' adds to whatever the table already holds, so re-running
# this cell inserts the same rows again; index=False omits the DataFrame index.
new_country_olympics_transformed.to_sql(
    name='country_olympics',
    con=engine,
    if_exists='append',
    index=False,
)

157

## Confirm data has been added by querying the country socioeconomic table

In [11]:
# Read the socioeconomic table back from the database and preview the first rows
pd.read_sql_query(
    'select * from country_socioeconomic',
    con=engine,
).head()

Unnamed: 0,country,region,population,gdp
0,Afghanistan ...,SouthernAsia ...,35530,20270.0
1,Albania ...,SouthernEurope ...,2930,11541.0
2,Algeria ...,NorthernAfrica ...,41318,164779.0
3,American Samoa ...,Polynesia ...,56,
4,Andorra ...,SouthernEurope ...,77,2812.0


## Confirm data has been added by querying the country olympics table

In [12]:
# Read the Olympics table back from the database and preview the first rows
pd.read_sql_query(
    'select * from country_olympics',
    con=engine,
).head()

Unnamed: 0,country,summer_total,winter_total,total_participation,total_won
0,Afghanistan ...,2,0,15,2
1,Algeria ...,17,0,17,17
2,Argentina ...,77,0,45,77
3,Armenia ...,18,0,15,18
4,Australasia ...,12,0,2,12


## Join data from the country socioeconomic and country olympics tables

In [13]:
# Inner join of the two tables on the country name: only countries present in
# both country_socioeconomic and country_olympics appear in the result, with
# the socioeconomic columns followed by the medal/participation columns.
sql_join = r"""SELECT cs.country, cs.region, cs.population, cs.gdp, co.summer_total, co.winter_total, co.total_participation, co.total_won
FROM country_socioeconomic as cs
INNER JOIN country_olympics as co
ON cs.country=co.country;"""
# Run the join in the database and display the result as a DataFrame
pd.read_sql_query(sql_join, con=engine)

Unnamed: 0,country,region,population,gdp,summer_total,winter_total,total_participation,total_won
0,Afghanistan ...,SouthernAsia ...,35530,20270,2,0,15,2
1,Algeria ...,NorthernAfrica ...,41318,164779,17,0,17,17
2,Argentina ...,SouthAmerica ...,44271,632343,77,0,45,77
3,Armenia ...,WesternAsia ...,2930,10529,18,0,15,18
4,Australia ...,Oceania ...,24451,1230859,547,19,47,566
...,...,...,...,...,...,...,...,...
122,United States of America ...,NorthernAmerica ...,324460,18036648,2629,330,52,2959
123,Uruguay ...,SouthAmerica ...,3457,53442,10,0,23,10
124,Uzbekistan ...,CentralAsia ...,31911,69004,36,1,15,37
125,Zambia ...,EasternAfrica ...,17094,21255,2,0,14,2
