# 2 - DSDJ TWS - Connecting to an AWS RDS Postgres instance with Python and revisiting DSDJ's Salary Prediction Project
1. We will be connecting to two Postgres databases in AWS RDS
2. We will check our access and read SQL tables
3. We will then use pandas to manipulate the resulting dataframes
4. We will then write a dataframe to a SQL table

## Pre-requisite

We will use the following libraries
* Install https://pypi.org/project/ipython-sql/
* Install https://pypi.org/project/SQLAlchemy/
* Install https://www.psycopg.org/docs/
* check out DSDJ's module 4 Salary Prediction portfolio project: [here](https://www.datasciencedreamjob.com/products/data-science-dream-job-full-course/categories/969481/posts/3215992)

### Let's Revisit  the Salary Prediction Project and Read/Write Data To/From the Database 

__author__ = 'DSDJ Team'  <br />
__email__ = 'info@datasciencedreamjob.com' <br />
__website__ = 'www.datasciencedreamjob.com' <br />
__copyright__ = 'Copyright 2021, Data Science Dream Job LLC' <br />

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import getpass

from sqlalchemy import create_engine
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
def connect_to_db(connection_string):
    '''Create a SQLAlchemy engine for a Postgres database accessed via psycopg2.

    connection_string is everything that follows the URL scheme; it is appended
    unchanged to "postgresql+psycopg2://". SQL statement echoing is turned off.
    '''
    db_url = "postgresql+psycopg2://" + connection_string
    return create_engine(db_url, echo=False)

def load_file(file, to_db, db_engine):
    '''Read a CSV file into a DataFrame and optionally copy it into a SQL table.

    Parameters
    ----------
    file : path of the CSV file to read.
    to_db : when truthy, the DataFrame is also written to the database.
    db_engine : connection handed to DataFrame.to_sql as `con`; only used when
        to_db is truthy.

    Returns
    -------
    The DataFrame read from `file`.

    The SQL table is named after the file's base name without its extension
    (e.g. 'data/train_features.csv' -> 'train_features'), whatever the depth of
    the path. An existing table with that name is replaced and the DataFrame
    index is not written.
    '''
    import os

    df = pd.read_csv(file)
    if to_db:
        # derive the name from the file itself rather than from a fixed position
        # in the path, so bare filenames and nested directories both work
        table_name = os.path.splitext(os.path.basename(file))[0]
        df.to_sql(table_name, con=db_engine, if_exists='replace', index=False)
    return df

def consolidate_data(df1, df2, key=None, left_index=False, right_index=False):
    '''Inner-join df1 and df2 so that only records present in both are returned.

    The join is made on `key`, or on the indexes when left_index / right_index
    are set.
    '''
    join_options = dict(how='inner', on=key, left_index=left_index, right_index=right_index)
    return pd.merge(df1, df2, **join_options)

def clean_data(raw_df):
    '''Drop repeated job IDs (keeping the first occurrence), then drop rows whose salary is not positive.

    Returns a new dataframe; raw_df is not modified.
    '''
    deduped = raw_df.drop_duplicates(subset='jobId')
    has_positive_salary = deduped.salary > 0
    return deduped[has_positive_salary]

def one_hot_encode_feature_df(df, cat_vars=None, num_vars=None):
    '''One-hot encode the categorical columns and append the continuous columns.

    cat_vars columns go through pd.get_dummies; num_vars columns are converted
    with pd.to_numeric. The two parts are placed side by side, encoded columns first.
    '''
    encoded_part = pd.get_dummies(df[cat_vars])
    numeric_part = df[num_vars].apply(pd.to_numeric)
    pieces = [encoded_part, numeric_part]
    return pd.concat(pieces, axis=1)

def get_target_df(df, target):
    '''Select `target` from df and return the result.'''
    target_values = df[target]
    return target_values

def train_model(model, feature_df, target_df, num_procs, mean_mse, cv_std):
    '''Cross-validate `model` and record its scores in the mean_mse and cv_std dicts.

    Runs 2-fold cross-validation scored with negative mean squared error, using
    num_procs as n_jobs. Stores, keyed by the model object, the mean MSE (sign
    flipped back to positive) and the standard deviation of the fold scores.
    Nothing is returned; the two dicts are updated in place.
    '''
    fold_scores = cross_val_score(
        model,
        feature_df,
        target_df,
        cv=2,
        n_jobs=num_procs,
        scoring='neg_mean_squared_error',
    )
    mean_mse[model] = -1.0 * np.mean(fold_scores)
    cv_std[model] = np.std(fold_scores)

def print_summary(model, mean_mse, cv_std):
    '''Print the model followed by its recorded average MSE and CV standard deviation.'''
    print('\nModel:\n', model)
    # each score is looked up only when its line is about to be printed
    score_tables = (
        ('Average MSE:\n', mean_mse),
        ('Standard deviation during CV:\n', cv_std),
    )
    for label, table in score_tables:
        print(label, table[model])

def save_results(model, meban_mse, predictions, feature_importances, con=None):
    '''Write the predictions and the feature importances to SQL tables.

    Parameters
    ----------
    model, meban_mse : accepted so existing calls keep working; neither value is
        written anywhere by this function.
    predictions : DataFrame stored in the "predictions" table.
    feature_importances : DataFrame stored in the "feature_importances" table.
    con : optional connection/engine handed to DataFrame.to_sql. When omitted,
        the notebook-level `db_engine` is used.

    Both tables are replaced if they already exist. If feature_importances
    carries a labelled index (for example the feature names), that index is
    turned back into a regular column before writing so the labels are stored
    too; the caller's DataFrame is left untouched.
    '''
    engine = db_engine if con is None else con
    table_name_feat = "feature_importances"
    table_name_pred = "predictions"

    # to_sql(index=False) would silently drop a labelled index, so move it
    # into the columns first
    if any(level is not None for level in feature_importances.index.names):
        feature_importances = feature_importances.reset_index()

    feature_importances.to_sql(table_name_feat, con=engine, if_exists='replace', index=False)
    predictions.to_sql(table_name_pred, con=engine, if_exists='replace', index=False)

In [3]:
# Define db connection
from urllib.parse import quote_plus

database_host = "dsdj-postgres-db.clpvihbunw2c.ap-southeast-2.rds.amazonaws.com"
database_name = "postgres"
database_user = "postgres"
userpass = getpass.getpass("Password :")
# URL-escape the password so characters such as '@', ':' or '/' cannot corrupt
# the connection URL, and name the target database explicitly
connection_str = database_user+":"+quote_plus(userpass)+"@"+database_host+"/"+database_name

#define inputs
train_feature_file = 'data/train_features.csv'
train_target_file = 'data/train_salaries.csv'
test_feature_file = 'data/test_features.csv'

#define variables
categorical_vars = ['companyId', 'jobType', 'degree', 'major', 'industry']
numeric_vars = ['yearsExperience', 'milesFromMetropolis']
target_var = 'salary'
#fixed seed so the shuffle (and therefore the CV folds) is the same on every run
random_seed = 42

# connect to SQL base
print("Connect to database")
db_engine = connect_to_db(connection_str)

#load data
print("Loading data")
write_input_to_db = False
feature_df = load_file(train_feature_file, write_input_to_db, db_engine)
target_df = load_file(train_target_file, write_input_to_db, db_engine)
test_df = load_file(test_feature_file, write_input_to_db, db_engine)

#consolidate training data
raw_train_df = consolidate_data(feature_df, target_df, key='jobId')

#clean, shuffle, and reindex training data -- shuffling improves cross-validation accuracy
clean_train_df = shuffle(clean_data(raw_train_df), random_state=random_seed).reset_index()

#encode categorical data and get final feature dfs
print("Encoding data")
feature_encoded_df = one_hot_encode_feature_df(clean_train_df, cat_vars=categorical_vars, num_vars=numeric_vars)
test_encoded_df = one_hot_encode_feature_df(test_df, cat_vars=categorical_vars, num_vars=numeric_vars)
#train and test are encoded independently, so force the test columns to match the
#training columns (same set, same order); categories absent from the test set become 0
test_encoded_df = test_encoded_df.reindex(columns=feature_encoded_df.columns, fill_value=0)

#get target df (from here on target_df holds the cleaned, shuffled target column)
target_df = get_target_df(clean_train_df, target_var)

Password :········
Connect to database
Loading data
Encoding data


In [5]:
#number of processes to run in parallel
num_procs = -1

#shared model parameters
verbose_lvl = 0

#containers for the candidate models and their results
models = []
mean_mse, cv_std, res = {}, {}, {}

In [6]:
#create models -- hyperparameter tuning already done by hand for each model
lr = LinearRegression()
models.append(lr)

#cross-validate each candidate in parallel with MSE as the evaluation metric, printing a summary per model
print("Beginning cross validation")
for model in models:
    train_model(
        model,
        feature_encoded_df,
        target_df,
        num_procs,
        mean_mse,
        cv_std,
    )
    print_summary(model, mean_mse, cv_std)

Beginning cross validation

Model:
 LinearRegression()
Average MSE:
 384.47369216569604
Standard deviation during CV:
 0.11556336117729415


In [7]:
#pick the candidate whose recorded mean MSE is the smallest
model = min(mean_mse, key=lambda candidate: mean_mse[candidate])
print('\nPredictions calculated using model with lowest MSE:', model, sep='\n')

#refit the chosen model on the entire training set
model.fit(feature_encoded_df, target_df)


Predictions calculated using model with lowest MSE:
LinearRegression()


LinearRegression()

In [8]:
#create predictions based on test data
predictions = model.predict(test_encoded_df)
predictions_df = test_df[['jobId']].copy()
predictions_df['pred_salary'] = predictions

#store feature importances -- one entry per column the model was actually fitted on
#(the encoded columns, not the raw columns of feature_df)
trained_columns = feature_encoded_df.columns
if hasattr(model, 'feature_importances_'):
    importances = model.feature_importances_
else:
    #linear models don't have feature_importances_
    importances = [0]*len(trained_columns)

#sort by importance, then use the feature name as the index
feature_importances_df = (
    pd.DataFrame({'feature': trained_columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
    .set_index('feature', drop=True)
)

In [9]:
#write the predictions and feature importances to the database
save_results(
    model,
    mean_mse[model],
    predictions=predictions_df,
    feature_importances=feature_importances_df,
)

### Check the tables created during the Salary Prediction process

In [10]:
# load the "sql" IPython extension so the %sql / %%sql magics used below are available
%load_ext sql

In [11]:
# connect the %sql magic to the database, reusing the credentials held in connection_str
%sql postgresql://$connection_str

'Connected: postgres@None'

In [12]:
%%sql
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname NOT IN ('pg_catalog', 'information_schema');

 * postgresql://postgres:***@dsdj-postgres-db.clpvihbunw2c.ap-southeast-2.rds.amazonaws.com
3 rows affected.


schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
public,car,postgres,,False,False,False,False
public,feature_importances,postgres,,False,False,False,False
public,predictions,postgres,,False,False,False,False


In [13]:
%%sql
SELECT *
FROM predictions
LIMIT 100;

 * postgresql://postgres:***@dsdj-postgres-db.clpvihbunw2c.ap-southeast-2.rds.amazonaws.com
100 rows affected.


jobId,pred_salary
JOB1362685407687,115.778350830078
JOB1362685407688,92.3247528076172
JOB1362685407689,166.959533691406
JOB1362685407690,105.611358642578
JOB1362685407691,118.964614868164
JOB1362685407692,158.383544921875
JOB1362685407693,98.366455078125
JOB1362685407694,118.834533691406
JOB1362685407695,104.343124389648
JOB1362685407696,108.144226074219
