In [None]:
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy import inspect

## Extract CSVs into DataFrames

In [None]:
# Load the 2019 World Happiness rankings CSV (read as UTF-8) and preview it
happy_file = "resources/2019_world_happiness_rankings.csv"
happy_df = pd.read_csv(filepath_or_buffer=happy_file, encoding='UTF-8')
happy_df.head(n=5)

In [None]:
# Load the 2019 QS World University rankings CSV (read as ISO-8859-1) and preview it
university_file = "resources/2019_QS_world_university_rankings.csv"
university_df = pd.read_csv(filepath_or_buffer=university_file, encoding='ISO-8859-1')
university_df.head(n=5)

## Clean Up of University Rankings

In [None]:
# Discard the columns that are not needed, then keep only the universities
# that have a value in the '2019' column (rows with a missing 2019 rank are dropped)
unneeded_columns = ['Focus', 'Research Intensity', 'Age', 'Status']
university_df = (
    university_df
    .drop(columns=unneeded_columns)
    .dropna(subset=['2019'])
)

In [None]:
# Tied rankings are marked with a trailing '='; strip it from both year columns
for year_column in ('2019', '2018'):
    university_df[year_column] = university_df[year_column].str.rstrip('=')

In [None]:
# Attribute ranks may end with '+' (e.g. an International Faculty Rank of 601+);
# strip that trailing marker from every attribute rank column
attribute_rank_columns = [
    'Academic Reputation Rank',
    'Employer Reputation Rank',
    'Faculty Student Rank',
    'Citations per Faculty Rank',
    'International Faculty Rank',
    'International Students Rank',
]
for rank_column in attribute_rank_columns:
    university_df[rank_column] = university_df[rank_column].str.rstrip('+')

In [None]:
# Some rankings begin with a double space; trim surrounding whitespace
# from both year columns
for year_column in ('2019', '2018'):
    university_df[year_column] = university_df[year_column].str.strip()

In [None]:
# The dataset expresses some rankings as ranges (e.g. '801-1000').
# Reduce each range to its lowest rank by keeping only the leading digits.
#
# A fixed three-character slice would also truncate plain four-digit ranks
# ('1000' -> '100'), so the leading run of digits is extracted instead, and only
# when it is followed by a non-digit character. Values without leading digits
# (such as the '-' placeholder) and missing values are left untouched.
leading_rank_pattern = r'^(\d+)\D.*$'
for year_column in ('2019', '2018'):
    university_df[year_column] = university_df[year_column].str.replace(
        leading_rank_pattern, r'\1', regex=True
    )

In [None]:
# A score or rank that was not taken appears in the CSV as either '-' or an empty cell.
# Normalise the '-' cells to NaN so both cases are represented the same way.
placeholder_to_missing = {'-': np.nan}
university_df.replace(placeholder_to_missing, inplace=True)

In [None]:
# Rename the columns of both DataFrames to snake_case names.
# Each mapping is: original CSV header -> new column name.

# Happiness rankings headers
happy_column_names = {
    'Overall rank': 'overall_rank',
    'Country or region': 'country',
    'Score': 'score',
    'GDP per capita': 'gdp_per_capita_score',
    'Social support': 'social_support_score',
    'Healthy life expectancy': 'healthy_life_expectancy_score',
    'Freedom to make life choices': 'freedom_life_choices_score',
    'Generosity': 'generosity',
    'Perceptions of corruption': 'perceived_corruption_score',
}

# University rankings headers
university_column_names = {
    '2019': 'year_2019',
    '2018': 'year_2018',
    'Institution Name': 'institution_name',
    'Country': 'country',
    'Classification Size': 'class_size',
    'Academic Reputation Score': 'academic_rep_score',
    'Academic Reputation Rank': 'academic_rep_rank',
    'Employer Reputation Score': 'employer_rep_score',
    'Employer Reputation Rank': 'employer_rep_rank',
    'Faculty Student Score': 'faculty_student_score',
    'Faculty Student Rank': 'faculty_student_rank',
    'Citations per Faculty Score': 'faculty_citations_score',
    'Citations per Faculty Rank': 'faculty_citations_rank',
    'International Faculty Score': 'intl_faculty_score',
    'International Faculty Rank': 'intl_faculty_rank',
    'International Students Score': 'intl_students_score',
    'International Students Rank': 'intl_students_rank',
    'Overall Score': 'overall_score',
}

happy_df = happy_df.rename(columns=happy_column_names)
university_df = university_df.rename(columns=university_column_names)

In [None]:
# Index each DataFrame (in place) by its key column:
# universities by institution name, happiness rankings by country
for frame, index_column in (
    (university_df, 'institution_name'),
    (happy_df, 'country'),
):
    frame.set_index(index_column, inplace=True)

In [None]:
# Preview the first five rows of the cleaned happiness rankings DataFrame
happy_df.head(n=5)

In [None]:
# Preview the first five rows of the cleaned university rankings DataFrame
university_df.head(n=5)

## Create database connection

In [None]:
# Prompt the user for the postgres password.
# getpass is used instead of input() so the password is not echoed into the
# notebook output, where it would be saved and shared with the file.
password = getpass("Please input your postgres password: ")

# Create connection.
# The password is URL-escaped before being placed in the connection URL, so that
# special characters such as '@', ':' or '/' cannot be misread as URL delimiters.
connection_string = f"postgres:{quote_plus(password)}@localhost:5432/etl_project_db"
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Confirm tables: list the table names visible through the engine.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list and works on old and new versions.
inspect(engine).get_table_names()

## Load DataFrames into database

In [None]:
# Append the happiness rows to the 'world_happiness' table;
# index=True writes the DataFrame index as a column as well
happy_df.to_sql(
    name='world_happiness',
    con=engine,
    if_exists='append',
    index=True,
)

In [None]:
# Append the university rows to the 'university_rankings' table;
# index=True writes the DataFrame index as a column as well
university_df.to_sql(
    name='university_rankings',
    con=engine,
    if_exists='append',
    index=True,
)

## Export DataFrames to CSV Files

In [None]:
# Save the cleaned DataFrames to CSV files.
# Both frames were re-indexed on their key column earlier in the notebook
# ('country' for happy_df, 'institution_name' for university_df), so the index
# has to be written out: with index=False those key columns would be silently
# dropped from the exported files.
happy_df.to_csv(r'Results/final_world_happy_data.csv', header=True, index=True)
university_df.to_csv(r'Results/final_univ_rank_data.csv', header=True, index=True)