## Data and package imports

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import psycopg2 as pg
%matplotlib inline
import seaborn as sns
import dbcreds
from datetime import datetime, timedelta

import sklearn
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, 
AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#from .utils import db_utils

In [2]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [3]:
conn = pg.connect(dbname = dbcreds.database, host=dbcreds.host, user=dbcreds.user, password = dbcreds.password)
apps_df = pd.read_sql('select * from cascais.application', con=conn)
move_df = pd.read_sql('select * from cascais.movement', con=conn)

In [4]:
#print lengths of original tables
print color.BOLD+'Movements, Applications, and Users: \nOriginal Dataset'+color.END
print len(move_df)
print len(apps_df)
print apps_df.ute_id.nunique()

#show dates of last movements
#move_df.groupby('date').application_id.count().sort_index(ascending=False)

[1mMovements, Applications, and Users: 
Original Dataset[0m
870061
125029
65523


In [5]:
move_df.head()
#move_df.loc[10, 'movement_date']==''
type(move_df.loc[10, 'year_month'])

numpy.int64

## Filter data to use

In [6]:
# create datetime objects for applications from candidatura_data
# apps_df.loc[:,'date']= apps_df.loc[:,'candidatura_data'].apply(
# #     lambda x: None if x=='' else datetime(year=int(x[0:4]),
# #                        month=int(x[5:7]),
# #                        day=int(x[8:10])))

apps_df.loc[:, 'date']=pd.to_datetime(apps_df.loc[:,'anomes'],format="%Y%m")

In [7]:
#create datetime objects for cancellations,
# and placeholder dates for movements that aren't cancellations
def create_date(series):
    if series['movement_date'] == '':
        return datetime(year=int(str(series['year_month'])[0:4]),
                        month = int(str(series['year_month'])[4:6]),
                        day = 15)
    else:
        return datetime(year=int(series['movement_date'][0:4]),
                       month=int(series['movement_date'][5:7]),
                       day=int(series['movement_date'][8:10]))

move_df.loc[:, 'date'] = pd.to_datetime(move_df.loc[:,'year_month'],format="%Y%m")
#move_df.apply(create_date, axis=1)

# OR create datetime objects for just cancellations from movement_date
# move_df.loc[:, 'date'] = move_df.loc[:, 'movement_date'].apply(
#     lambda x: None if x=='' else datetime(year=int(x[0:4]),
#                        month=int(x[5:7]),
#                        day=int(x[8:10])))

In [8]:
apps_df.head()

Unnamed: 0,table_index,anomes,ctipo_movimento,dtipo_movimento,ute_id,sexo,chabilitacao_escolar,dhabilitacao_escolar,cdeficiencia,ddeficiencia,...,conjuge_estado_civil,conjuge_categoria,conjuge_estado,conjuge_motivo_indisponibilidade,candidatura_categoria_anterior,candidatura_estado_anterior,ute_nr_pessoas_cargo,ute_nr_descendentes_cargo,candidatura_data_ppe,date
0,1,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1058797,M,09,9 ANOS,0,NÃO DEFICIENTE,...,,,,,,,0.0,,,2007-01-01
1,2,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1062047,F,06,6 ANOS,0,NÃO DEFICIENTE,...,,,,,,,1.0,,,2007-01-01
2,3,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1080395,F,06,6 ANOS,21,DEFICIÃNCIAS DA MEMÃRIA,...,,,,,,,1.0,,,2007-01-01
3,4,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1194683,F,LC,LICENCIATURA,0,NÃO DEFICIENTE,...,,,,,,,0.0,,,2007-01-01
4,5,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1199775,M,09,9 ANOS,0,NÃO DEFICIENTE,...,,,,,,,0.0,,,2007-01-01


In [9]:
move_df.head()

Unnamed: 0,ute_id,movement_date,year_month,application_id,movement_type,movement_subtype,movement_result,movement_index,date
0,677,,200701,-1,convocation,GERAL UTENTE,COMPARECEU,3,2007-01-01
1,677,,200702,-1,convocation,INTERVENÃÃO TÃCNICA,COMPARECEU,2752,2007-02-01
2,677,2007-09-26 00:00:00,200709,-1,cancellation,FALTA AO CONTROLO,,7180,2007-09-01
3,710,2016-12-06 00:00:00,201612,120674,application,DESEMPREGADO-NOVO EMPREGO,,120674,2016-12-01
4,710,,201701,120674,convocation,INTERVENÃÃO TÃCNICA,COMPARECEU,248206,2017-01-01


In [10]:
# Identify application IDs that are associated with applications submitted after
# April 2016 - to allow for 1 year of follow-up data
apps_not_late = apps_df[apps_df['date']<=datetime(2016, 4, 30)].loc[:,'table_index'].tolist()

apps_df = apps_df[apps_df['table_index'].isin(apps_not_late)]
move_df = move_df[move_df['application_id'].isin(apps_df['table_index'])]

print color.BOLD+'Movements, Applications, and Users: \nRemoved apps after April 28, 2016 & associated movements'+color.END
print len(move_df)
print len(apps_df)
print apps_df.ute_id.nunique()

[1mMovements, Applications, and Users: 
Removed apps after April 28, 2016 & associated movements[0m
756387
113249
61403


In [11]:
# Identify applications IDs that were not unemployed
apps_df.dcategoria.value_counts()
apps_df = apps_df[apps_df['dcategoria'].isin(['DESEMPREGADO-NOVO EMPREGO',
                                            'DESEMPREGADO-1Âº EMPREGO'])]
move_df = move_df[move_df['application_id'].isin(apps_df['table_index'])]
print color.BOLD+'Movements, Applications, and Users: \nRemoved apps after April 28, 2016 & associated movements\nRemoved employed & part-time employed'+color.END
print len(move_df)
print len(apps_df)
print apps_df.ute_id.nunique()

[1mMovements, Applications, and Users: 
Removed apps after April 28, 2016 & associated movements
Removed employed & part-time employed[0m
729811
108663
59941


## Generating labels

In [12]:
# Create dictionary of appliction dates
app_date_dict = dict(zip(apps_df['table_index'], apps_df['date']))

# Create time since app variable
move_df.loc[:, 'app_date'] = move_df.loc[:, 'application_id'].map(app_date_dict)
move_df.loc[:, 'time_since_app'] = move_df.loc[:, 'date'] - move_df.loc[:, 'app_date']

In [13]:
# Identify applications that had an interview placement within 12 months
apps_placed_interview = move_df[(move_df['movement_result']=='ADMITIDO / COLOCADO') \
                                & (move_df['time_since_app']<=timedelta(days=365))].\
application_id.unique().tolist()

In [14]:
# Identify applications that had a cancellation for becoming employed within 12 months
employed_cancellations = ['COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE OUTREM',
                         'COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA PRÃPRIA',
                         'COLOCAÃÃO - CANDIDATURA EXTERNA'
                         'COLOCAÃÃO - CANDIDATURA INTERNA']

apps_placed_cancellation = move_df[(move_df['movement_subtype'].isin(employed_cancellations)) \
                                & (move_df['time_since_app']<=timedelta(days=365))].\
application_id.unique().tolist()


# Identify applications that had any canacellation within 12 months
apps_cancelled_12 = move_df[(move_df['movement_type']=='cancellation') \
                                & (move_df['time_since_app']<=timedelta(days=365))].\
application_id.unique().tolist()

In [15]:
print 'Applications placed within 12 mo thru interview: '+\
str(len(apps_placed_interview))

print 'Applications placed within 12 mo thru cancellation: '+\
str(len(apps_placed_cancellation))

print 'Applications that appear in both lists: '+\
str(len(set(apps_placed_interview).intersection(apps_placed_cancellation)))

print '\nApplications cancelled within 12 months: '+\
str(len(apps_cancelled_12))
print 'Applications that were both placed and cancelled within 12 months: '+\
str(len(set(apps_placed_interview).intersection(apps_cancelled_12)))

Applications placed within 12 mo thru interview: 4245
Applications placed within 12 mo thru cancellation: 17395
Applications that appear in both lists: 223

Applications cancelled within 12 months: 71595
Applications that were both placed and cancelled within 12 months: 504


In [16]:
apps_df.groupby(['table_index', 'date'])['ute_id', 'anomes'].count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ute_id,anomes
table_index,date,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2007-01-01,1,1
2,2007-01-01,1,1
3,2007-01-01,1,1
4,2007-01-01,1,1
5,2007-01-01,1,1


In [17]:
apps_df.head()

Unnamed: 0,table_index,anomes,ctipo_movimento,dtipo_movimento,ute_id,sexo,chabilitacao_escolar,dhabilitacao_escolar,cdeficiencia,ddeficiencia,...,conjuge_estado_civil,conjuge_categoria,conjuge_estado,conjuge_motivo_indisponibilidade,candidatura_categoria_anterior,candidatura_estado_anterior,ute_nr_pessoas_cargo,ute_nr_descendentes_cargo,candidatura_data_ppe,date
0,1,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1058797,M,09,9 ANOS,0,NÃO DEFICIENTE,...,,,,,,,0.0,,,2007-01-01
1,2,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1062047,F,06,6 ANOS,0,NÃO DEFICIENTE,...,,,,,,,1.0,,,2007-01-01
2,3,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1080395,F,06,6 ANOS,21,DEFICIÃNCIAS DA MEMÃRIA,...,,,,,,,1.0,,,2007-01-01
3,4,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1194683,F,LC,LICENCIATURA,0,NÃO DEFICIENTE,...,,,,,,,0.0,,,2007-01-01
4,5,200701,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,1199775,M,09,9 ANOS,0,NÃO DEFICIENTE,...,,,,,,,0.0,,,2007-01-01


In [18]:
    lab_df = apps_df.rename(columns = {'table_index':'application_id'})
    lab_df = lab_df.groupby(['application_id', 'date'])['ute_id', 'anomes'].count()

In [19]:
lab_df.index.get_level_values(level='application_id')

Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            113240, 113241, 113242, 113243, 113244, 113245, 113246, 113247,
            113248, 113249],
           dtype='int64', name=u'application_id', length=108663)

In [20]:
def generate_labels(apps_df, move_df):
    # Create dictionary of appliction dates
    app_date_dict = dict(zip(apps_df['table_index'], apps_df['date']))
    
    # Create time since app variable
    move_df.loc[:, 'app_date'] = move_df.loc[:, 'application_id'].map(app_date_dict)
    move_df.loc[:, 'time_since_app'] = move_df.loc[:, 'date'] - move_df.loc[:, 'app_date']

    # Identify applications that had an interview placement within 12 months
    apps_placed_interview = move_df[(move_df['movement_result']=='ADMITIDO / COLOCADO') \
                                & (move_df['time_since_app']<=timedelta(days=365))].\
    application_id.unique().tolist()
    
    # Identify applications that had a cancellation for becoming employed within 12 months
    employed_cancellations = ['COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE OUTREM',
                         'COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA PRÃPRIA',
                         'COLOCAÃÃO - CANDIDATURA EXTERNA'
                         'COLOCAÃÃO - CANDIDATURA INTERNA']

    apps_placed_cancellation = move_df[(move_df['movement_subtype'].isin(employed_cancellations)) \
                                & (move_df['time_since_app']<=timedelta(days=365))].\
    application_id.unique().tolist()

    # Create list for applications that found a job within 12 months
    apps_found_job_12 = apps_placed_interview + apps_placed_cancellation
    
    # Identify applications that had any cancellation within 12 months
    apps_cancelled_12 = move_df[(move_df['movement_type']=='cancellation') \
                                & (move_df['time_since_app']<=timedelta(days=365))].\
    application_id.unique().tolist()
    
    # grab unique app IDs and put into index
    labels_df = apps_df.rename(columns = {'table_index':'application_id'})
    labels_df = labels_df.groupby(['application_id', 'date'])['ute_id', 'anomes'].count()
  #  labels_df.columns = ['application_id']
    

    #create intermediate variables found_job_12 and cancelled _12
    labels_df.loc[:,'found_job_12'] = labels_df.index.get_level_values(level='application_id').isin(apps_found_job_12)
    labels_df.loc[:,'canceled_12'] = labels_df.index.get_level_values(level='application_id').isin(apps_cancelled_12)
    
    #create labels, set application ID as the index, and drop intermediate variables
    #label is True if application is LTU
    labels_df.loc[:, 'label'] = ~labels_df.loc[:, 'found_job_12'] & ~labels_df.loc[:, 'canceled_12']
    labels_df.drop (['found_job_12', 'canceled_12', 'ute_id', 'anomes'], axis=1, inplace=True)

    return labels_df

labels = generate_labels(apps_df, move_df)

In [46]:
print labels.label.value_counts(normalize=True)
print labels.label.value_counts(normalize=False)
print labels.shape

False    0.693299
True     0.306701
Name: label, dtype: float64
False    75336
True     33327
Name: label, dtype: int64
(108663, 1)


In [49]:
#matrix.describe(include='all')
print '\nNumber of applications that are LTU'
print labels.label.value_counts(dropna=False)
print '\nProportion of applications that are LTU'
print labels.label.value_counts(dropna=False, normalize=True)


Number of applications that are LTU
False    75336
True     33327
Name: label, dtype: int64

Proportion of applications that are LTU
False    0.693299
True     0.306701
Name: label, dtype: float64


In [55]:
m1 = labels.copy()
m1.reset_index(inplace=True)
m1.drop('date', axis=1, inplace=True)
m1.head()

Unnamed: 0,application_id,label
0,1,False
1,2,False
2,3,False
3,4,False
4,5,False


## Compare labels to other matrix generation file

In [67]:
m2 = pd.read_csv('/mnt/data/shared/workingData/unemployment_data_matrix.csv')

AttributeError: 'NoneType' object has no attribute 'head'

In [80]:
m2 = m2[['application_id', 'ltu']]
m2 = m2.sort_values('application_id')
m2.reset_index(inplace=True)

comparison = sum(m1['label'] != m2['ltu'])
print comparison

0


In [76]:
print type(m1.label[0])
print type(m2.ltu[0])

print m1.shape
print m2.shape

<type 'numpy.bool_'>
<type 'numpy.bool_'>
(108663, 2)
(108663, 2)


## Generating features

In [23]:
# generate demographic fetaures from applications table
def demographic_features(apps_df, column_names):
    apps_df_small = apps_df.rename(columns={'table_index': 'application_id'})
    apps_df_small = apps_df_small.groupby(['application_id', 'date']).first()
    apps_df_small = apps_df_small[column_names]
#     apps_df_small = apps_df.loc[:,column_names+['table_index', 'date']]
#     apps_df_small.rename(columns = {'table_index':'application_id'}, inplace=True)
#     apps_df_small.index = apps_df_small[['application_id', 'date']]
    return apps_df_small

features_demographic = demographic_features(apps_df, ['sexo', 'ute_idade', 'sub_rsi'])
features_demographic.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sexo,ute_idade,sub_rsi
application_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2007-01-01,M,29,N
2,2007-01-01,F,46,N
3,2007-01-01,F,31,S
4,2007-01-01,F,34,N
5,2007-01-01,M,26,N


In [24]:
# generate reapplication features from applications table
# generate interaction features from movements table

## Binding Features to Labels

In [25]:
m = labels
for f in [features_demographic]:
    m = m.merge(f, how = 'left', left_index=True, right_index=True)

In [26]:
# m.head()
# m.describe(include='all')
m.isnull().sum()

label        0
sexo         0
ute_idade    0
sub_rsi      0
dtype: int64

In [27]:
# create function for cleaning matrix
def clean_matrix(m):
    # generate dummies for categorical values
    m = pd.get_dummies(m, drop_first=False)
    return m

m = clean_matrix(m)

# Fitting the Model

In [28]:
y = m.label.astype(float)
X = m.drop('label', 1)

In [29]:
estimator = RandomForestClassifier(n_estimators=100,
                                  criterion = 'gini',
                                  max_depth = None,
                                  random_state=4321)
estimator.fit(X=X, y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=4321,
            verbose=0, warm_start=False)

In [30]:
#linear regression exploration
# import statsmodels.formula.api as sm
# result = sm.ols(formula = 'LTU ~ ute_idade + sexo_F + sexo_M', data=m).fit()
# print result.params
# print result.summary()

## Evaluating the Model

In [31]:
m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,ute_idade,sexo_F,sexo_M,sub_rsi_N,sub_rsi_S
application_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2007-01-01,False,29,0,1,1,0
2,2007-01-01,False,46,1,0,1,0
3,2007-01-01,False,31,1,0,0,1
4,2007-01-01,False,34,1,0,1,0
5,2007-01-01,False,26,0,1,1,0


In [32]:
#m.iloc[m.index.get_level_values('date')<pd.to_datetime('2014-01-01'), :]#.sort_index(level='date', ascending = False)

In [33]:
def split_data(apps_df, move_df, start_date, run_date, follow_up):
    
    # get a timestamp
    start_date = pd.to_datetime(start_date)
    run_date = pd.to_datetime(run_date)
    split_date = run_date - timedelta(days=follow_up)
    
    # create lables and features
    labels = generate_labels(apps_df, move_df)
    features_demographic = demographic_features(apps_df, ['sexo', 'ute_idade', 'sub_rsi'])
    
    # bind labels and features
    m = labels
    for f in [features_demographic]:
        m = m.merge(f, how = 'left', left_index=True, right_index=True)
    
    # create a training set and test set based on split date
    m_train = m.iloc[(m.index.get_level_values('date')<split_date) & (m.index.get_level_values('date')>=start_date), :] 
    m_test = m.iloc[m.index.get_level_values('date')>=split_date, :]
    
    return clean_matrix(m_train), clean_matrix(m_test)

m_train, m_test = split_data(apps_df, move_df, start_date = '2012-01-01',
                             run_date = '2014-01-01', follow_up = 365)

In [34]:
m_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,ute_idade,sexo_F,sexo_M,sub_rsi_N,sub_rsi_S
application_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
56177,2012-01-01,True,38,0,1,1,0
56178,2012-01-01,False,33,0,1,1,0
56179,2012-01-01,True,40,1,0,1,0
56180,2012-01-01,False,35,1,0,1,0
56181,2012-01-01,True,51,1,0,1,0


In [35]:
print 'Lenth of train set: '+str(len(m_train))
print 'Lenth of test set: '+str(len(m_test))
print 'Total apps: '+str((len(m_train)+len(m_test)))

Lenth of train set: 12720
Lenth of test set: 42366
Total apps: 55086


In [36]:
# Create and train the estimator 
estimator = RandomForestClassifier(n_estimators=100,
                       criterion='gini',
                       max_depth=None,
                       random_state=4321)

X_train = m_train.drop('label', 1)
y_train = m_train.label.astype(float)

X_test = m_test.drop('label', 1)
y_test = m_test.label.astype(float)

estimator.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=4321,
            verbose=0, warm_start=False)

In [37]:
# Use estimator to predict on test set
pred = estimator.predict_proba(X=X_test)
y_predict_prob = pd.Series(index=m_test.index, data=pred[:,1])

#translate proba to 1 or 0
y_predict_class = y_predict.apply(lambda x: 1 if x>=0.50 else 0)

NameError: name 'y_predict' is not defined

In [None]:
#print sklearn.metrics.roc_auc_score(y_test, y_predict_prob)
print 'Accuracy score:'
print sklearn.metrics.accuracy_score(y_test, y_predict_class)

print '\nBaseline accuracy:'
print max (m_test.label.mean(), 1-m.label.mean())

print '\nPrecision score:'
print sklearn.metrics.precision_score(y_test, y_predict_class)

In [None]:
# precision vs. recall curves
from sklearn.metrics import precision_recall_curve
precision_curve, recall_curve, pr_thresholds = \
precision_recall_curve(y_true = y_test, probas_pred = y_predict_prob)

In [None]:
# plotting curve
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)
ax.plot(recall_curve, precision_curve)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision vs. Recall for our Example model')
plt.show()

In [None]:
apps_df.columns