# Goal: Load, Clean and Upload Socioeconomic Data for all Districts of Chicago
    1. Download census data for the City of Chicago from 2016 to 2020
    2. Clean the data
    3. Upload to the SQL database and save locally as CSV

### Socioeconomic data from 2016 to 2020
    source: https://storymaps.arcgis.com/stories/da5601c3e0924e5ab3ee07ade9954f7a

In [1]:
# pandas
import pandas as pd

# numpy
import numpy as np

# matplotlib
import matplotlib.pyplot as plt

# importing self made functions from sql_functions script
import sql_functions as sf

In [2]:
# Load the 2016-2020 socioeconomic indicators for Chicago's community areas
# from the local CSV file into a DataFrame.
pfad = "data/socioeconomics_2016_2020.csv"
df_socio = pd.read_csv(filepath_or_buffer=pfad)

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
df_socio.head()

Unnamed: 0,Community Area,Community Area Number,Percent of Crowded Housing,Percent of Households With Income Below Poverty Level,Unemployment Rate for Population Age 16 and Over,Percent Aged 25 and over with no High School Diploma,Percent of Population Under Age 18 and Over Age 64,Per Capita Income,Hardship Index Score
0,Rogers Park,1,6.4%,19.8%,4.6%,11.7%,26.7%,"$29,682",40.9
1,West Ridge,2,8.1%,14.6%,6.3%,17.4%,39.4%,"$27,671",52.1
2,Uptown,3,4.4%,19.0%,3.3%,9.8%,24.4%,"$42,112",32.4
3,Lincoln Square,4,2.3%,8.2%,5.4%,6.4%,26.3%,"$49,797",24.7
4,North Center,5,0.9%,5.5%,3.2%,3.4%,34.3%,"$77,951",18.1


## 2. Cleaning Data

In [4]:
# Normalise the column names: lower-case everything and replace spaces with
# underscores so the headers are easier to use in code and SQL.
df_socio.columns = [label.lower().replace(" ", "_") for label in df_socio.columns]
# Show the last ten rows to confirm the renamed headers.
df_socio.tail(10)

Unnamed: 0,community_area,community_area_number,percent_of_crowded_housing,percent_of_households_with_income_below_poverty_level,unemployment_rate_for_population_age_16_and_over,percent_aged_25_and_over_with_no_high_school_diploma,percent_of_population_under_age_18_and_over_age_64,per_capita_income,hardship_index_score
67,Englewood,68,1.4%,39.7%,24.9%,23.7%,42.7%,"$15,034",68.2
68,Greater Grand Crossing,69,1.8%,33.1%,19.8%,13.5%,40.4%,"$19,735",57.1
69,Ashburn,70,4.0%,13.0%,10.5%,17.8%,38.0%,"$25,905",47.4
70,Auburn Gresham,71,1.6%,24.2%,16.5%,15.0%,42.3%,"$20,665",53.2
71,Beverly,72,0.6%,5.7%,5.9%,2.5%,40.4%,"$55,811",26.3
72,Washington Heights,73,0.8%,20.0%,14.2%,11.4%,40.0%,"$24,294",45.9
73,Mount Greenwood,74,2.1%,5.8%,4.4%,4.7%,38.3%,"$41,850",30.1
74,Morgan Park,75,1.0%,11.6%,10.6%,6.3%,40.3%,"$34,946",37.0
75,O'Hare,76,3.1%,10.4%,3.8%,8.3%,32.8%,"$35,715",32.7
76,Edgewater,77,4.2%,15.1%,5.5%,9.2%,26.1%,"$42,486",32.6


In [5]:
# Inspect dtypes and non-null counts to see which columns still need conversion.
df_socio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 9 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   community_area                                         77 non-null     object 
 1   community_area_number                                  77 non-null     int64  
 2   percent_of_crowded_housing                             77 non-null     object 
 3   percent_of_households_with_income_below_poverty_level  77 non-null     object 
 4   unemployment_rate_for_population_age_16_and_over       77 non-null     object 
 5   percent_aged_25_and_over_with_no_high_school_diploma   77 non-null     object 
 6   percent_of_population_under_age_18_and_over_age_64     77 non-null     object 
 7   per_capita_income                                      77 non-null     object 
 8   hardship_index_score                                 

In [6]:
# changing Datatypes:
# The percentage columns are read as strings such as "6.4%" and the income
# column as strings such as "$29,682"; convert them to numeric dtypes.
percent_columns = [
    "percent_of_crowded_housing",
    "percent_of_households_with_income_below_poverty_level",
    "unemployment_rate_for_population_age_16_and_over",
    "percent_aged_25_and_over_with_no_high_school_diploma",
    "percent_of_population_under_age_18_and_over_age_64",
]

for column in percent_columns:
    # Casting to str first keeps this cell re-runnable: on a second run the
    # column is already float and would otherwise have no `.str` accessor.
    df_socio[column] = df_socio[column].astype(str).str.rstrip("%").astype(float)

# Strip the currency symbol and thousands separator before casting to int.
# `regex=False` makes the literal replacement explicit across pandas versions.
df_socio["per_capita_income"] = (
    df_socio["per_capita_income"]
    .astype(str)
    .str.lstrip("$")
    .str.replace(",", "", regex=False)
    .astype(int)
)

In [7]:
# Display the frame to verify the converted numeric values.
# NOTE(review): this renders the whole frame (pandas truncates the middle rows);
# `df_socio.head()` would keep the output shorter.
df_socio

Unnamed: 0,community_area,community_area_number,percent_of_crowded_housing,percent_of_households_with_income_below_poverty_level,unemployment_rate_for_population_age_16_and_over,percent_aged_25_and_over_with_no_high_school_diploma,percent_of_population_under_age_18_and_over_age_64,per_capita_income,hardship_index_score
0,Rogers Park,1,6.4,19.8,4.6,11.7,26.7,29682,40.9
1,West Ridge,2,8.1,14.6,6.3,17.4,39.4,27671,52.1
2,Uptown,3,4.4,19.0,3.3,9.8,24.4,42112,32.4
3,Lincoln Square,4,2.3,8.2,5.4,6.4,26.3,49797,24.7
4,North Center,5,0.9,5.5,3.2,3.4,34.3,77951,18.1
...,...,...,...,...,...,...,...,...,...
72,Washington Heights,73,0.8,20.0,14.2,11.4,40.0,24294,45.9
73,Mount Greenwood,74,2.1,5.8,4.4,4.7,38.3,41850,30.1
74,Morgan Park,75,1.0,11.6,10.6,6.3,40.3,34946,37.0
75,O'Hare,76,3.1,10.4,3.8,8.3,32.8,35715,32.7


## 3. Upload to SQL database and save locally as CSV

In [8]:
# constants:
# Relative directory for local data files.
path = "data/"
# Schema name passed to `to_sql` in the (currently commented-out) upload cell.
schema = "capstone_divvy_bikeshare"
# Database engine obtained from the project's own sql_functions helper
# (presumably a SQLAlchemy engine — TODO confirm against sql_functions.py).
engine = sf.get_engine()

In [9]:
# # Push DataFrame with socioeconomics data to SQL Database:
# table_name = 'socioeconomics2016_2020'

# df_socio.to_sql(name=table_name, # Name of SQL table
#                     con=engine, # Engine or connection
#                     if_exists='replace', # Drop the table before inserting new values 
#                     schema=schema, # Use schema that was defined earlier
#                     index=False, # Do not write the DataFrame index as a column
#                     chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                     method='multi') # Pass multiple values in a single INSERT clause
# print(f"The {table_name} table was imported successfully.")

In [10]:
# Save as CSV locally:
#df_socio.to_csv("data/clean_socio_2016_2020.csv")