In [None]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

### Source CSV files (read into DataFrames in the cleaning sections below)

In [None]:
# Relative paths of the three raw CSV inputs that the cleaning cells read.
automate_file, salary_file, soc_file = (
    "Resources/automation_data_by_state.csv",
    "Resources/occupation_salary.csv",
    "Resources/soc_codes.csv",
)

### Cleaning Automation Data 

In [None]:
# Load the automation table, relabel its columns (multi-word state names get
# underscores instead of spaces), then discard the occupation title column.
automation_df = (
    pd.read_csv(automate_file)
    .rename(
        columns={
            'District of Columbia': 'District_of_Columbia',
            'New Hampshire': 'New_Hampshire',
            'New Jersey': 'New_Jersey',
            'New Mexico': 'New_Mexico',
            'New York': 'New_York',
            'North Carolina': 'North_Carolina',
            'North Dakota': 'North_Dakota',
            'Rhode Island': 'Rhode_Island',
            'South Carolina': 'South_Carolina',
            'South Dakota': 'South_Dakota',
            'West Virginia': 'West_Virginia',
            'Occupation': 'OCCUPATION',
            'Probability': 'Automation_Probability',
        }
    )
    .drop(columns=['OCCUPATION'])
)

# Preview the first cleaned row.
automation_df.head(1)

### Cleaning Salary Data 

In [None]:
# Load the salary table and relabel the two flag columns and the occupation code.
salary_df = pd.read_csv(salary_file).rename(columns={
    'ANNUAL': 'ANNUAL_only',
    'HOURLY': 'HOURLY_only',
    'OCC_CODE': 'SOC',
})

bool_list = ['ANNUAL_only', 'HOURLY_only']

# Missing flags count as False: fill with 0, normalise to 0/1 ints, then cast
# to bool. The former intermediate replace(True, 1).astype(int) pass operated
# on columns that were already 0/1 ints, so it changed nothing and is dropped.
# NOTE(review): a later cell casts these two columns to float, which undoes
# this bool conversion - confirm which dtype the target table expects.
salary_df[bool_list] = salary_df[bool_list].fillna(0).astype(int).astype(bool)

In [None]:
# Strip commas from every string cell so that thousands-separated numbers
# can be cast to a numeric dtype afterwards.
salary_df = salary_df.replace(",", "", regex=True)

# '*', '**' and '#' are non-numeric placeholder markers in the source data.
# Map them to NaN explicitly: replace(marker, None) is version-dependent, and
# older pandas releases treat an explicit None as "no value given" and fill the
# cell from the previous row instead of blanking it.
salary_df = salary_df.replace(['*', '**', '#'], float('nan'))

# These descriptive columns are not loaded into the salary table.
salary_df = salary_df.drop(columns=['OCC_TITLE', 'OCC_GROUP'])

In [None]:
# Columns converted to floating point below (despite the list's name).
# NOTE(review): the last two entries are the flag columns that an earlier cell
# converted to bool; casting them here turns them back into numbers -
# confirm which dtype the target table expects.
integer_list = [
    'TOT_EMP', 'EMP_PRSE', 'H_MEAN', 'A_MEAN', 'MEAN_PRSE',
    'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90',
    'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90',
    'ANNUAL_only', 'HOURLY_only',
]

salary_df[integer_list] = salary_df.loc[:, integer_list].astype(float)

# Drop row for Total employment (SOC: 00-0000)
salary_df = salary_df.loc[salary_df['SOC'].ne('00-0000')]
salary_df.head(1)

In [None]:
# Preview the first row of the salary table.
salary_df.head(n=1)

### Cleaning up SOC code data

In [None]:
# Load the SOC lookup table, keep only the code, title and group columns,
# and give them underscore-style names.
soc_df = (
    pd.read_csv(soc_file)
    .loc[:, ['SOC Code', 'SOC Title', 'SOC Group']]
    .rename(columns={
        'SOC Code': 'SOC_code',
        'SOC Title': 'SOC_title',
        'SOC Group': 'SOC_group',
    })
)

# Display the row(s) for SOC code 11-2031.
# ERROR - missing key for this role in Primary Key list
soc_df[soc_df['SOC_code'].eq('11-2031')]

## Create database connection and load tables

In [None]:
# Database credentials come from the environment so they do not have to be
# committed with the notebook; the defaults reproduce the previous hard-coded
# values, so an unconfigured environment connects exactly as before.
db_user = os.environ.get('LABOR_STATS_DB_USER', 'postgres')
db_password = os.environ.get('LABOR_STATS_DB_PASSWORD', 'postgres')

# quote_plus keeps characters such as '@', ':' or '/' inside the credentials
# from being parsed as URL delimiters.
conn = f'{quote_plus(db_user)}:{quote_plus(db_password)}@localhost:5432/labor_stats_db'
engine = create_engine(f'postgresql://{conn}')

In [None]:
# Append the SOC lookup rows to the "SOC_keys" table; the DataFrame index is
# not written.
soc_df.to_sql(
    name='SOC_keys',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the salary rows to the "Salary" table; the DataFrame index is not
# written.
salary_df.to_sql(
    name='Salary',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the automation rows to the "Automation" table; the DataFrame index is
# not written.
automation_df.to_sql(
    name='Automation',
    con=engine,
    if_exists='append',
    index=False,
)