## Data and package imports

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import psycopg2 as pg
%matplotlib inline
import seaborn as sns
import dbcreds
from datetime import datetime, timedelta

import sklearn
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, 
AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#from .utils import db_utils

In [15]:
#TO's iefp_data_utils

#Cleans application data
    #Removes applications of people whose application date is after max_start_date
    #Removes applications of people who are already employed/part-time employed
#Sample Usage: clean_apps=clean_applications(apps_df,'2016-04-30')
def clean_applications(applications,max_start_date):
    """Return a cleaned, sorted copy of the applications table.

    A new 'app_start_date' column is parsed from 'anomes' (format YYYYMM).
    Rows are dropped when their 'app_start_date' is later than
    ``max_start_date`` or when 'dcategoria' marks the person as employed or
    part-time employed. The input frame is left untouched.

    Parameters
    ----------
    applications : DataFrame with 'anomes', 'dcategoria' and 'ute_id' columns.
    max_start_date : latest application start date to keep (inclusive).

    Returns
    -------
    DataFrame sorted by 'ute_id' and then 'app_start_date'.
    """
    employed_categories = ['EMPREGADO', 'EMPREGADO A TEMPO PARCIAL']

    result = applications.copy()
    result['app_start_date'] = pd.to_datetime(result['anomes'], format="%Y%m")

    # Build both row filters up front and apply them together
    started_in_time = result['app_start_date'] <= max_start_date
    already_employed = result['dcategoria'].isin(employed_categories)
    result = result[started_in_time & ~already_employed]

    return result.sort_values(['ute_id', 'app_start_date'])


#Cleans and Filters movements data
    #Removes movements that don't start with an application
    #If apps_series parameter is present, filters the movements which do not belong to the applications in the series
#Sample Usage: clean_movs=clean_movements(movements)
#              clean_movs=clean_movements(movements,apps['table_index']) 
def clean_movements(movements, apps_series=None):
    """Return a filtered, sorted copy of the movements table.

    Movements whose 'application_id' is -1 (i.e. that do not start with an
    application) are always dropped. When ``apps_series`` is given, only
    movements whose 'application_id' appears in it are kept.

    Parameters
    ----------
    movements : DataFrame with 'application_id', 'ute_id', 'year_month' and
        'movement_date' columns.
    apps_series : optional collection of application ids to keep.

    Returns
    -------
    DataFrame sorted by 'ute_id', 'year_month' and 'movement_date'.
    """
    kept = movements.copy()
    kept = kept.loc[kept['application_id'] != -1]

    if apps_series is not None:
        belongs_to_apps = kept['application_id'].isin(apps_series)
        kept = kept.loc[belongs_to_apps]

    return kept.sort_values(['ute_id', 'year_month', 'movement_date'])


In [2]:
class color:
    """Named ANSI escape sequences for styling printed text."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets all colours/attributes set above
    END = '\033[0m'

In [3]:
# Open a PostgreSQL connection using the settings stored in the local `dbcreds` module
conn = pg.connect(
    dbname=dbcreds.database,
    host=dbcreds.host,
    user=dbcreds.user,
    password=dbcreds.password,
)

# Load the complete application and movement tables into DataFrames
apps_df = pd.read_sql('select * from cascais.application', con=conn)
move_df = pd.read_sql('select * from cascais.movement', con=conn)

## Filter data to use

In [4]:
# Create datetime objects for applications from 'candidatura_data'
def _parse_candidatura_date(raw):
    """Build a datetime from characters 0-3 (year), 5-6 (month) and 8-9 (day)
    of ``raw``; an empty string yields None."""
    if raw == '':
        return None
    return datetime(year=int(raw[0:4]),
                    month=int(raw[5:7]),
                    day=int(raw[8:10]))

apps_df.loc[:, 'date'] = apps_df.loc[:, 'candidatura_data'].apply(_parse_candidatura_date)

#create datetime objects for cancellations,
# and placeholder dates for movements that aren't cancellations
def create_date(series):
    """Return the datetime to use for one movement row.

    When 'movement_date' is a non-empty string, its characters 0-3, 5-6 and
    8-9 are read as year, month and day. When it is the empty string, a
    placeholder on the 15th of the month encoded in 'year_month' (YYYYMM) is
    returned instead.

    Parameters
    ----------
    series : mapping-like row exposing 'movement_date' and 'year_month'.

    Returns
    -------
    datetime.datetime
    """
    raw_date = series['movement_date']

    if raw_date != '':
        year, month, day = raw_date[0:4], raw_date[5:7], raw_date[8:10]
    else:
        # No exact date recorded: fall back to mid-month of the reporting period
        period = str(series['year_month'])
        year, month, day = period[0:4], period[4:6], 15

    return datetime(year=int(year), month=int(month), day=int(day))

# Build the 'date' column row by row: create_date parses 'movement_date', or
# falls back to the 15th of the 'year_month' month when 'movement_date' is ''
move_df.loc[:, 'date'] = move_df.apply(create_date, axis=1)

In [5]:
# Fill NAs for descendants with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a `.loc[:, col]` selection: that selection can be a
# temporary object, in which case the in-place fill never reaches apps_df.
apps_df['ute_nr_descendentes_cargo'] = apps_df['ute_nr_descendentes_cargo'].fillna(0.0)

In [6]:
# Keep only applications submitted on or before 28 April 2016, so that each
# remaining application allows for 1 year of follow-up data; then keep only
# the movements that belong to those applications.
latest_app_date = datetime(2016, 4, 28)
submitted_in_time = apps_df['date'] <= latest_app_date
apps_not_late = apps_df.loc[submitted_in_time, 'table_index'].tolist()

apps_df = apps_df[apps_df['table_index'].isin(apps_not_late)]
move_df = move_df[move_df['application_id'].isin(apps_df['table_index'])]

In [7]:
# Keep only applications whose category is one of the two unemployed
# categories, and drop the movements of every other application.
# NOTE(review): the second category literal looks mis-encoded - confirm it
# matches the values actually stored in 'dcategoria'.
unemployed_categories = ['DESEMPREGADO-NOVO EMPREGO',
                         'DESEMPREGADO-1Âº EMPREGO']

# Not the last expression of the cell, so this result is computed but not displayed
apps_df.dcategoria.value_counts()

is_unemployed = apps_df['dcategoria'].isin(unemployed_categories)
apps_df = apps_df[is_unemployed]
move_df = move_df[move_df['application_id'].isin(apps_df['table_index'])]

## Generating labels

In [8]:
def generate_labels(apps_df, move_df):
    """Label each application as long-term unemployed (LTU) or not.

    An application gets ``label == True`` when, looking at movements dated at
    most 365 days after its application date, it has neither a job placement
    (an 'ADMITIDO / COLOCADO' movement result or one of the "placed"
    cancellation subtypes) nor any movement of type 'cancellation'.

    Parameters
    ----------
    apps_df : DataFrame with 'table_index' (the application id) and 'date'
        columns.
    move_df : DataFrame with 'application_id', 'date', 'movement_result',
        'movement_subtype' and 'movement_type' columns. It is not modified.

    Returns
    -------
    DataFrame indexed by ('application_id', 'date') with a single boolean
    column 'label'.
    """
    follow_up = timedelta(days=365)

    # Create dictionary of application dates
    app_date_dict = dict(zip(apps_df['table_index'], apps_df['date']))

    # Time elapsed between each movement and its application. Kept as a local
    # Series so the caller's move_df does not silently gain extra columns.
    time_since_app = move_df['date'] - move_df['application_id'].map(app_date_dict)
    within_follow_up = time_since_app <= follow_up

    def _apps_matching(condition):
        # Unique application ids having a matching movement inside the window
        selected = move_df.loc[condition & within_follow_up, 'application_id']
        return selected.unique().tolist()

    # Applications that had an interview placement within 12 months
    apps_placed_interview = _apps_matching(
        move_df['movement_result'] == 'ADMITIDO / COLOCADO')

    # Applications cancelled within 12 months because the person became employed.
    # Each subtype must be its own list element: without the comma after the
    # EXTERNA entry, Python joins the last two literals into one string that
    # matches nothing.
    # NOTE(review): these literals look mis-encoded - confirm they match the
    # values actually stored in 'movement_subtype'.
    employed_cancellations = ['COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE OUTREM',
                              'COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA PRÃPRIA',
                              'COLOCAÃÃO - CANDIDATURA EXTERNA',
                              'COLOCAÃÃO - CANDIDATURA INTERNA']
    apps_placed_cancellation = _apps_matching(
        move_df['movement_subtype'].isin(employed_cancellations))

    # Applications that found a job within 12 months (either route)
    apps_found_job_12 = apps_placed_interview + apps_placed_cancellation

    # Applications that had any cancellation within 12 months
    apps_cancelled_12 = _apps_matching(move_df['movement_type'] == 'cancellation')

    # One row per unique (application_id, date) pair
    label_index = (
        apps_df.rename(columns={'table_index': 'application_id'})
        .groupby(['application_id', 'date'])
        .size()
        .index
    )

    app_ids = label_index.get_level_values('application_id')
    found_job_12 = app_ids.isin(apps_found_job_12)
    canceled_12 = app_ids.isin(apps_cancelled_12)

    # label is True if application is LTU
    return pd.DataFrame({'label': ~found_job_12 & ~canceled_12}, index=label_index)

## Generating features

In [9]:
def demographic_features(apps_df, column_names):
    """Select demographic columns, one row per application.

    'table_index' is renamed to 'application_id', rows are grouped by
    ('application_id', 'date') and the first entry of each group is taken
    per column.

    Parameters
    ----------
    apps_df : DataFrame with 'table_index', 'date' and the requested columns.
    column_names : list of column names to return.

    Returns
    -------
    DataFrame indexed by ('application_id', 'date') holding ``column_names``.
    """
    per_application = (
        apps_df
        .rename(columns={'table_index': 'application_id'})
        .groupby(['application_id', 'date'])
        .first()
    )
    return per_application[column_names]

def clean_matrix(m):
    """Return ``m`` with its categorical columns one-hot encoded.

    The first level of every encoded column is dropped (``drop_first=True``);
    the input frame is not modified.
    """
    return pd.get_dummies(m, drop_first=True)

## Binding Features to Labels

In [10]:
def split_data(apps_df, move_df, f_demo, start_date, run_date, follow_up=365):
    """Build the label/feature matrix and split it into train and test sets.

    The split date is ``run_date - follow_up`` days. Applications dated in
    ``[start_date, split_date)`` form the training set; applications dated on
    or after ``split_date`` form the test set.

    Parameters
    ----------
    apps_df, move_df : application and movement DataFrames, passed on to
        ``generate_labels`` and ``demographic_features``.
    f_demo : list of demographic column names to use as features.
    start_date, run_date : anything ``pd.to_datetime`` accepts.
    follow_up : number of days between the split date and ``run_date``.

    Returns
    -------
    (m_train, m_test) : one-hot encoded DataFrames sharing the same columns,
        each holding a 'label' column plus the feature columns.
    """
    # get timestamps for split
    start_date = pd.to_datetime(start_date)
    run_date = pd.to_datetime(run_date)
    split_date = run_date - timedelta(days=follow_up)

    # create labels and features
    labels = generate_labels(apps_df, move_df)
    features_demographic = demographic_features(apps_df, f_demo)

    # bind labels and features
    m = labels
    for f in [features_demographic]:
        m = m.merge(f, how='left', left_index=True, right_index=True)

    # row masks for the training set and test set based on split date
    app_dates = m.index.get_level_values('date')
    in_train = (app_dates >= start_date) & (app_dates < split_date)
    in_test = app_dates >= split_date
    in_either = in_train | in_test

    # Encode train and test rows together. Encoding them separately can give
    # the two sets different dummy columns (a category present in only one
    # set, or a different dropped baseline level), which breaks or silently
    # misaligns prediction on the test set.
    encoded = clean_matrix(m.loc[in_either])

    m_train = encoded.loc[in_train[in_either]]
    m_test = encoded.loc[in_test[in_either]]

    return m_train, m_test

In [11]:
# Build train/test matrices from three demographic features. With the default
# 365-day follow-up, a run_date of 2020-01-01 gives a split date of 2019-01-01.
# NOTE(review): applications were filtered above to dates <= 2016-04-28, so
# the test set is presumably empty here - confirm this is intended.
m_train, m_test = split_data(
    apps_df,
    move_df,
    ['sexo', 'ute_idade', 'sub_rsi'],
    start_date='1994-01-01',
    run_date='2020-01-01',
)

In [12]:
# Separate features from the target. `axis` is passed by keyword because
# DataFrame.drop no longer accepts it positionally in pandas 2.x.
X_train = m_train.drop('label', axis=1)
y_train = m_train.label.astype(float)

# Random forest configuration; the fit itself is left disabled below
estimator = RandomForestClassifier(n_estimators = 100,
                   criterion = 'gini',
                   max_depth = None,
                   random_state = 4321)
#estimator.fit(X=X_train, y=y_train)


## Evaluate model

In [13]:
def run_RF(apps_df, move_df, f_demo, start_date, run_date, follow_up=365,
          n_estimators=100, max_depth=None, random_state=4321, p_threshold=0.5):
    """Fit a random forest on the training split and score it on the test split.

    Parameters
    ----------
    apps_df, move_df, f_demo, start_date, run_date, follow_up :
        forwarded unchanged to ``split_data``.
    n_estimators, max_depth, random_state :
        forwarded to ``RandomForestClassifier`` (criterion is 'gini').
    p_threshold : a test row is predicted as class 1 when its predicted
        probability (second column of ``predict_proba``) is >= this value.

    Returns
    -------
    (accuracy, accuracy_base) : accuracy of the thresholded predictions on the
        test set, and the accuracy obtained by always predicting the more
        frequent test label.
    """
    # get the test and train data splits
    m_train, m_test = split_data(apps_df, move_df, f_demo, start_date, run_date, follow_up)

    # `axis` is passed by keyword: DataFrame.drop no longer accepts it
    # positionally in pandas 2.x
    X_train = m_train.drop('label', axis=1)
    y_train = m_train['label'].astype(float)

    X_test = m_test.drop('label', axis=1)
    y_test = m_test['label'].astype(float)

    # create and fit the estimator on training set
    estimator = RandomForestClassifier(n_estimators=n_estimators,
                                       criterion='gini',
                                       max_depth=max_depth,
                                       random_state=random_state)
    estimator.fit(X=X_train, y=y_train)

    # generate predicted probabilities and thresholded classes on test set
    y_predict = estimator.predict_proba(X=X_test)
    y_predict_prob = pd.Series(index=X_test.index, data=y_predict[:, 1])
    y_predict_class = (y_predict_prob >= p_threshold).astype(int)

    # calculate metrics
    accuracy = accuracy_score(y_test, y_predict_class)
    positive_rate = y_test.mean()
    accuracy_base = max(positive_rate, 1 - positive_rate)
    return (accuracy, accuracy_base)

In [14]:
# Train on applications from 2011-01-01 up to one year before the run date
# and evaluate on later ones; returns (accuracy, baseline accuracy)
run_RF(
    apps_df,
    move_df,
    f_demo=['sexo', 'ute_idade', 'sub_rsi'],
    start_date='2011-01-01',
    run_date='2015-01-01',
)

(0.68156691462499142, 0.69272503924111106)