In [1]:
import os
import sys
# Put the directory two levels above the notebook on the import path
# (presumably the project root that holds the `utils` package - confirm).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [26]:
import pandas as pd
import numpy as np
import datetime as dt

In [3]:
from utils import model_utils
from utils import db_utils
from utils import iefp_data_utils
from utils import pandas_utils

  from pandas.core import datetools


In [4]:
# Open a database connection through the project helper and load the two
# frames used throughout the notebook: `apps` (applications) and `movs`
# (movements recorded against those applications).
conn = db_utils.connect_to_db()
apps, movs = iefp_data_utils.get_clean_data(conn)

In [5]:
def apps_cancelled_within_n_months(x,n=12):
    """Return the distinct application ids that have an early cancellation.

    x: movements frame with `movement_type`, `months_after_app` and
       `application_id` columns.
    n: exclusive upper bound on `months_after_app` (default 12).

    Returns an array of unique `application_id` values.
    """
    is_cancellation = x['movement_type'].isin(['cancellation'])
    is_early = x['months_after_app'] < n
    early_cancellations = x[is_cancellation & is_early]
    return early_cancellations['application_id'].unique()

def apps_placed_within_n_months(x,n=12):
    """Return the distinct application ids that have an early placement.

    x: movements frame with `movement_result`, `months_after_app` and
       `application_id` columns.
    n: exclusive upper bound on `months_after_app` (default 12).

    Returns an array of unique `application_id` values.
    """
    is_placement = x['movement_result'].isin(['ADMITIDO / COLOCADO'])
    is_early = x['months_after_app'] < n
    early_placements = x[is_placement & is_early]
    return early_placements['application_id'].unique()

def difftime_in_months(timeA,timeB):
    """Return `timeA - timeB` expressed in (fractional) average months.

    Works for scalar timestamps and for datetime Series (element-wise).
    A month is taken as 365.2425 / 12 = 30.436875 days, the length that
    the former `np.timedelta64(1, 'M')` divisor resolved to, so results
    are numerically unchanged.
    """
    # 'M' is an ambiguous timedelta unit that recent pandas releases refuse
    # to convert; dividing by one day and then by an explicit constant keeps
    # the same values while staying portable.
    days_per_month = 30.436875
    elapsed_days = (timeA-timeB)/np.timedelta64(1, 'D')
    return elapsed_days/days_per_month

def get_ltu_label(apps,movs):
    """Add a boolean `ltu` (long-term unemployed) column to `apps`.

    An application is labelled LTU unless it has a cancellation or a
    placement movement less than 12 months after its start date (the
    default window of the two helper filters). Applications that started
    less than 12 months before the latest `app_start_date` in `apps` are
    forced to False (presumably because a full 12-month window has not
    been observed for them yet).

    Side effects: both frames are modified in place. `movs` gains
    `app_start_date` and `months_after_app` columns, and `apps` gains
    `ltu`. The same `apps` object is returned.
    """
    start_by_app = dict(zip(apps["table_index"],apps["app_start_date"]))
    movs["app_start_date"] = movs["application_id"].map(start_by_app)
    movs["months_after_app"] = difftime_in_months(movs["movement_event_date"],movs["app_start_date"])

    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("Generating LTU/Non-LTU labels")
    cancelled_12mo = apps_cancelled_within_n_months(movs)
    placed_12mo = apps_placed_within_n_months(movs)

    non_ltu_apps = np.unique(np.append(cancelled_12mo,placed_12mo))
    apps['ltu'] = np.logical_not(apps['table_index'].isin(non_ltu_apps))
    # Series.max() skips NaT; the builtin max() gives an order-dependent
    # answer when a NaT is present because every comparison with NaT is False.
    last_data_date = apps['app_start_date'].max()
    apps.loc[difftime_in_months(last_data_date,apps['app_start_date']) < 12,'ltu'] = False

    return apps

def get_ltu_label_on_date(apps,movs,date,ltu_length):
    """Label applications as LTU using only what was known on `date`.

    apps: applications frame with `table_index` and `app_start_date`.
    movs: movements frame with `application_id`, `movement_event_date`,
          `movement_type` and `movement_result`.
    date: reference date; the movements are restricted through
          `pandas_utils.filter_by_time_range(..., None, date)`.
    ltu_length: number of months without cancellation/placement after
          which an application counts as LTU.

    Returns a copy of `apps` with a boolean `ltu` column. Applications
    that started less than `ltu_length` months before `date` are always
    False. Neither input frame is modified.
    """
    # Work on copies: callers pass filtered slices (e.g. `selected_apps`),
    # and adding columns to them would silently change the caller's frame
    # and can raise SettingWithCopyWarning.
    apps = apps.copy()
    movs_of_apps = movs[movs['application_id'].isin(apps['table_index'])]
    apps_movs = pandas_utils.filter_by_time_range(movs_of_apps,'movement_event_date',None,date).copy()

    start_by_app = dict(zip(apps["table_index"],apps["app_start_date"]))
    apps_movs["app_start_date"] = apps_movs["application_id"].map(start_by_app)
    apps_movs["months_after_app"] = difftime_in_months(apps_movs["movement_event_date"],apps_movs["app_start_date"])

    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("Generating LTU/Non-LTU labels")
    cancelled_early = apps_cancelled_within_n_months(apps_movs,ltu_length)
    placed_early = apps_placed_within_n_months(apps_movs,ltu_length)

    non_ltu_apps = np.unique(np.append(cancelled_early,placed_early))
    apps['ltu'] = np.logical_not(apps['table_index'].isin(non_ltu_apps))
    apps.loc[difftime_in_months(date,apps['app_start_date']) < ltu_length,'ltu'] = False

    return apps


# Testing LTU generation on a given date

In [6]:
# Date boundaries for the experiment.
train_st_date = pd.to_datetime('2010-05-01')
test_st_date = pd.to_datetime('2015-05-01')
# One calendar year after the test start. DateOffset lands on the
# anniversary for any start date, whereas a fixed 366 days only does so
# when the window happens to contain a 29 February (as 2015-05-01 does).
test_end_date = test_st_date + pd.DateOffset(years=1)

# Restrict to applications whose start date lies between the train start and
# the test start (boundary inclusivity is decided by the project helper and
# is not visible here).
selected_apps = pandas_utils.filter_by_time_range(apps,'app_start_date',train_st_date,test_st_date)

In [7]:
# Sanity check: the first/last start dates should fall inside the window above.
selected_apps.app_start_date.describe()

count                   60751
unique                   1712
top       2013-09-02 00:00:00
freq                      163
first     2010-05-03 00:00:00
last      2015-04-30 10:59:22
Name: app_start_date, dtype: object

In [8]:
# Label the same applications twice with a 12-month LTU window: once as seen
# on the test start date and once as seen on the test end date.
label_columns = ['table_index','app_start_date','ltu']
label_on_test_date = get_ltu_label_on_date(selected_apps,movs,test_st_date,12)[label_columns]
label_on_test_date_12 = get_ltu_label_on_date(selected_apps,movs,test_end_date,12)[label_columns]

Generating LTU/Non-LTU labels
Generating LTU/Non-LTU labels


In [9]:
# Show the most recent applications under both snapshots. The parenthesised
# single-argument print works on Python 2 and Python 3 alike.
print(label_on_test_date.sort_values('app_start_date',ascending=False).head())
print(label_on_test_date_12.sort_values('app_start_date',ascending=False).head())

        table_index      app_start_date    ltu
100606       100607 2015-04-30 10:59:22  False
100228       100229 2015-04-30 00:00:00  False
99473         99474 2015-04-30 00:00:00  False
99251         99252 2015-04-30 00:00:00  False
99875         99876 2015-04-30 00:00:00  False
        table_index      app_start_date    ltu
100606       100607 2015-04-30 10:59:22   True
100228       100229 2015-04-30 00:00:00  False
99473         99474 2015-04-30 00:00:00   True
99251         99252 2015-04-30 00:00:00   True
99875         99876 2015-04-30 00:00:00  False


In [10]:
# Put the two snapshots side by side. With pandas' default merge suffixes,
# `ltu_x` is the label on the test start date and `ltu_y` the label on the
# test end date.
labels = pd.merge(label_on_test_date, label_on_test_date_12, on=['table_index','app_start_date'])
labels.head()

Unnamed: 0,table_index,app_start_date,ltu_x,ltu_y
0,70870,2013-02-18,True,True
1,61038,2012-05-17,False,False
2,45682,2011-02-15,False,False
3,50137,2011-07-27,True,True
4,57836,2012-02-16,False,False


In [11]:
def get_apps_cancellations(x):
    """Return one cancellation date per application.

    x: movements frame with `movement_type`, `application_id` and
       `movement_event_date` columns.

    Returns a frame with columns `application_id` and `cancellation_date`.
    The date is the first non-null one in the frame's existing row order,
    so it is the earliest cancellation only if `x` is sorted by date.
    """
    cancellations = x[x['movement_type'].isin(['cancellation'])]
    first_dates = cancellations.groupby(['application_id'])['movement_event_date'].first()
    return first_dates.reset_index().rename(columns={'movement_event_date': 'cancellation_date'})

def get_apps_placements(x):
    """Return one placement date per application.

    x: movements frame with `movement_result`, `application_id` and
       `movement_event_date` columns.

    Returns a frame with columns `application_id` and `placement_date`.
    The date is the first non-null one in the frame's existing row order,
    so it is the earliest placement only if `x` is sorted by date.
    """
    placements = x[x['movement_result'].isin(['ADMITIDO / COLOCADO'])]
    first_dates = placements.groupby(['application_id'])['movement_event_date'].first()
    return first_dates.reset_index().rename(columns={'movement_event_date': 'placement_date'})

def get_last_activity_date(app,today):
    """Return the date on which an application ended.

    app: row-like object with `cancellation_date` and `placement_date`
         entries (either may be null).
    today: value returned when the application has neither date.

    If both dates are present the earlier one wins; if only one is
    present it is returned as is.
    """
    cancelled_on = app['cancellation_date']
    placed_on = app['placement_date']
    was_cancelled = pd.notnull(cancelled_on)
    was_placed = pd.notnull(placed_on)

    if was_cancelled and was_placed:
        return min(cancelled_on,placed_on)
    if was_cancelled:
        return cancelled_on
    if was_placed:
        return placed_on
    return today

In [12]:
# One cancellation date and one placement date per application, taken over
# all movements.
apps_cancelled = get_apps_cancellations(movs)
apps_placed = get_apps_placements(movs)

# Left-join both exit events onto the selected applications so that
# applications without a cancellation/placement keep NaT in that column.
with_cancellations = selected_apps.merge(apps_cancelled,how='left', left_on='table_index',right_on='application_id')
with_exits = with_cancellations.merge(apps_placed,how='left', left_on='table_index',right_on='application_id')
apps_length = with_exits[['table_index','app_start_date','cancellation_date','placement_date']]

# Applications with no exit event are given the test end date as exit date;
# the length is then measured in months from the application start.
apps_length['app_exit_date'] = apps_length.apply((lambda x: get_last_activity_date(x,test_end_date)),axis=1)
apps_length['app_length'] = difftime_in_months(apps_length['app_exit_date'],apps_length['app_start_date'])

In [14]:
# Preview the exit date and length (in months) computed for each application.
apps_length.head()

Unnamed: 0,table_index,app_start_date,cancellation_date,placement_date,app_exit_date,app_length
0,70870,2013-02-18,2015-03-26,NaT,2015-03-26,25.166841
1,61038,2012-05-17,2013-01-29,NaT,2013-01-29,8.443705
2,45682,2011-02-15,2011-06-16,NaT,2011-06-16,3.975441
3,50137,2011-07-27,2015-07-28,NaT,2015-07-28,48.033841
4,57836,2012-02-16,2012-04-01,NaT,2012-04-01,1.47847


In [15]:
# Attach the exit dates and application lengths to the two label snapshots.
labels_apps_lengths = pd.merge(labels, apps_length, on=['table_index','app_start_date'])
labels_apps_lengths.head()

Unnamed: 0,table_index,app_start_date,ltu_x,ltu_y,cancellation_date,placement_date,app_exit_date,app_length
0,70870,2013-02-18,True,True,2015-03-26,NaT,2015-03-26,25.166841
1,61038,2012-05-17,False,False,2013-01-29,NaT,2013-01-29,8.443705
2,45682,2011-02-15,False,False,2011-06-16,NaT,2011-06-16,3.975441
3,50137,2011-07-27,True,True,2015-07-28,NaT,2015-07-28,48.033841
4,57836,2012-02-16,False,False,2012-04-01,NaT,2012-04-01,1.47847


In [16]:
# Applications whose label differs between the two snapshots.
label_changed = labels_apps_lengths['ltu_x'] != labels_apps_lengths['ltu_y']
changed_labels = labels_apps_lengths[label_changed]
changed_labels.head()

Unnamed: 0,table_index,app_start_date,ltu_x,ltu_y,cancellation_date,placement_date,app_exit_date,app_length
23,111846,2014-06-02,False,True,NaT,NaT,2016-05-01,22.965564
29,100231,2015-04-07,False,True,2016-08-01,NaT,2016-08-01,15.836054
40,96894,2015-02-09,False,True,NaT,NaT,2016-05-01,14.686133
59,90335,2014-08-18,False,True,NaT,2016-04-27,2016-04-27,20.304318
86,95154,2014-12-15,False,True,2016-05-30,NaT,2016-05-30,17.478798


In [17]:
# Every application whose LTU label changed must have an exit date on or
# after the test start date, and the change must always be from
# non-LTU (`ltu_x` False) to LTU (`ltu_y` True).
assert (changed_labels['app_exit_date'] >= test_st_date).all()
assert not changed_labels['ltu_x'].any()
assert changed_labels['ltu_y'].all()

In [18]:
# Applications whose label is the same in both snapshots.
label_unchanged = labels_apps_lengths['ltu_x'] == labels_apps_lengths['ltu_y']
lasting_labels = labels_apps_lengths[label_unchanged]
lasting_labels.head()

Unnamed: 0,table_index,app_start_date,ltu_x,ltu_y,cancellation_date,placement_date,app_exit_date,app_length
0,70870,2013-02-18,True,True,2015-03-26,NaT,2015-03-26,25.166841
1,61038,2012-05-17,False,False,2013-01-29,NaT,2013-01-29,8.443705
2,45682,2011-02-15,False,False,2011-06-16,NaT,2011-06-16,3.975441
3,50137,2011-07-27,True,True,2015-07-28,NaT,2015-07-28,48.033841
4,57836,2012-02-16,False,False,2012-04-01,NaT,2012-04-01,1.47847


# Testing the default function (without dates)

In [19]:
# Label every application with the default 12-month rule. get_ltu_label works
# in place: it adds `ltu` to `apps` and adds `app_start_date` /
# `months_after_app` to `movs`, which the assertion cells below rely on.
labelled_apps = get_ltu_label(apps,movs)

Generating LTU/Non-LTU labels


In [20]:
# Preview the labelled applications; `ltu` is the last column.
labelled_apps.head()

Unnamed: 0,anomes,ctipo_movimento,dtipo_movimento,ute_id,sexo,chabilitacao_escolar,dhabilitacao_escolar,cdeficiencia,ddeficiencia,cnacionalidade,...,conjuge_estado,conjuge_motivo_indisponibilidade,candidatura_categoria_anterior,candidatura_estado_anterior,ute_nr_pessoas_cargo,ute_nr_descendentes_cargo,candidatura_data_ppe,table_index,app_start_date,ltu
120686,201612,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,710,M,09,9 ANOS,0,NÃO DEFICIENTE,PT,...,,,2.0,PAS,2,2.0,2011-07-19 00:00:00,120687,2016-12-06,False
70869,201302,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,818,F,09,9 ANOS,0,NÃO DEFICIENTE,PT,...,,,,,2,2.0,,70870,2013-02-18,True
22641,200903,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,820,F,LC,LICENCIATURA,0,NÃO DEFICIENTE,PT,...,,,,,2,,,22642,2009-03-20,True
61037,201205,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,820,F,LC,LICENCIATURA,0,NÃO DEFICIENTE,PT,...,,,3.0,PAS,2,2.0,,61038,2012-05-17,False
1940,200703,11,PEDIDOS DE EMPREGO AO LONGO DO MÃS,832,M,06,6 ANOS,0,NÃO DEFICIENTE,PT,...,,,,,1,,,1941,2007-03-27,False


In [27]:
# Latest application start date in the data (reused by a later assertion cell).
last_data_date = max(apps['app_start_date'])
# Midnight of the current day; note that this check depends on the wall clock.
today = pd.Timestamp(dt.date.today())
# No application that started less than 12 months before today may be LTU.
started_under_a_year_ago = difftime_in_months(today,apps['app_start_date']) < 12
assert not labelled_apps[started_under_a_year_ago]['ltu'].any()

In [28]:
# Applications with a cancellation in their first 12 months must all be
# non-LTU: none may appear among the LTU rows, and every one of them must
# appear among the non-LTU rows.
cancelled_12mo = apps_cancelled_within_n_months(movs)
ltu_ids = labelled_apps.loc[labelled_apps['ltu'], 'table_index']
non_ltu_ids = labelled_apps.loc[~labelled_apps['ltu'], 'table_index']
assert not ltu_ids.isin(cancelled_12mo).any()
assert non_ltu_ids.isin(cancelled_12mo).sum() == len(cancelled_12mo)

In [29]:
# Same check for placements: applications placed in their first 12 months
# must all be non-LTU.
placed_12mo = apps_placed_within_n_months(movs)
ltu_table_ids = labelled_apps.loc[labelled_apps['ltu'], 'table_index']
non_ltu_table_ids = labelled_apps.loc[~labelled_apps['ltu'], 'table_index']
assert not ltu_table_ids.isin(placed_12mo).any()
assert non_ltu_table_ids.isin(placed_12mo).sum() == len(placed_12mo)

In [30]:
# The non-LTU rows must be exactly: applications that exited (cancelled or
# placed) within 12 months, plus applications that did not exit early but
# started less than 12 months before the last date in the data.
exited_12mo = np.unique(np.append(cancelled_12mo,placed_12mo))
is_non_ltu = np.logical_not(labelled_apps['ltu'])
started_too_recently = difftime_in_months(last_data_date,apps['app_start_date']) < 12
exited_early = labelled_apps.table_index.isin(cancelled_12mo) | labelled_apps.table_index.isin(placed_12mo)
recent_without_exit = labelled_apps[started_too_recently & np.logical_not(exited_early)]
assert labelled_apps[is_non_ltu].shape[0] == len(exited_12mo) + recent_without_exit.shape[0]