In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
import math

In [3]:
from utils import model_utils
from utils import db_utils
from utils import iefp_data_utils
from utils import feature_utils

  from pandas.core import datetools


In [4]:
conn = db_utils.connect_to_db()
apps, movs = iefp_data_utils.get_clean_data(conn)

In [5]:
def split_train_test_apps(apps_df,movs_df,test_date,train_start_date,ltu_length):
    selected_apps = model_utils.filter_by_time_range(apps_df,'app_start_date',train_start_date,test_date).sort_values('app_start_date')
    selected_movs = model_utils.filter_by_time_range(movs_df,'movement_event_date',train_start_date,test_date).sort_values('movement_event_date')
    exitted = model_utils.apps_exited_before_date(selected_movs,test_date)
    test_apps = selected_apps[(np.logical_not(selected_apps['table_index'].isin(exitted))) &
                              (feature_utils.difftime_in_months(test_date,selected_apps['app_start_date']) < ltu_length)]

    train_apps = selected_apps[(np.logical_not(selected_apps['table_index'].isin(test_apps['table_index'])))]
    return train_apps,test_apps

In [6]:
train_apps,test_apps = split_train_test_apps(apps,movs,pd.to_datetime('2015-05-01'),pd.to_datetime('2010-05-01'),12)

In [9]:
print train_apps.shape
print test_apps.shape


(53822, 62)
(6929, 62)


In [10]:
#Assert that no application id appears in both sets
assert sum(train_apps['table_index'].isin(test_apps['table_index'])) == 0
#train_apps['table_index'].isin(test_apps['table_index'])

In [11]:
#Assert that applications which exitted the system before the action date do not figure in the train/test sets
action_date = pd.to_datetime('2015-05-01')
train_st_date = pd.to_datetime('2010-05-01')
train_end_date = action_date
selected_train_apps = model_utils.filter_by_time_range(apps,'app_start_date',train_st_date,train_end_date)
selected_train_movs = model_utils.filter_by_time_range(movs,'movement_event_date',train_st_date,train_end_date)
exitted_apps = model_utils.apps_exited_before_date(selected_train_movs,action_date)

assert sum(test_apps['table_index'].isin(exitted_apps)) == 0

#test_st_date = train_st_date
#test_end_date = action_date
#selected_test_apps = filter_by_time_range(apps,'app_start_date',test_st_date,test_end_date)
#selected_test_movs = filter_by_time_range(movs[movs['application_id'].isin(selected_test_apps['table_index'])],'movement_event_date',test_st_date,test_end_date)
#exitted_test_apps = apps_exited_before_date(selected_test_movs,action_date)

#assert sum(test_movs['application_id'].isin(exitted_test_apps)) == 0

In [7]:
def get_cancellation_date(x):
    return x[x['movement_type'].isin(['cancellation'])].sort_values('movement_event_date').groupby(['application_id']).first().reset_index()[['application_id','movement_event_date']].rename(columns={'movement_event_date': 'cancellation_date'})

def get_placement_date(x):
    return x[x['movement_result'].isin(['ADMITIDO / COLOCADO'])].sort_values('movement_event_date').groupby(['application_id']).first().reset_index()[['application_id','movement_event_date']].rename(columns={'movement_event_date': 'placement_date'})



def get_exit_date(app,test_date,):
    exit_date = None
    if (pd.notnull(app['cancellation_date'])):
        if (pd.notnull(app['placement_date'])):
            return min(app['cancellation_date'],app['placement_date'])
        else:
            return app['cancellation_date']
    elif (pd.notnull(app['placement_date'])):
        return app['placement_date']
    else:
        return test_date

In [9]:
apps_cancellations = get_cancellation_date(movs)
apps_placements = get_placement_date(movs)

In [38]:
print apps_cancellations.head()
print apps_placements.head()


   application_id cancellation_date
0               1        2007-02-22
1               3        2007-07-24
2               4        2007-11-16
3               5        2007-04-10
4               6        2007-11-21
   application_id placement_date
0               2     2007-02-07
1              11     2007-02-27
2              36     2008-04-22
3              86     2008-02-29
4             100     2007-02-22


In [10]:
apps_length = apps.merge(apps_cancellations,how='left', left_on='table_index',right_on='application_id').merge(apps_placements,how='left', left_on='table_index',right_on='application_id')[['table_index','app_start_date','cancellation_date','placement_date']]

In [40]:
apps_length.head()

Unnamed: 0,table_index,app_start_date,cancellation_date,placement_date
0,120687,2016-12-06,NaT,NaT
1,70870,2013-02-18,2015-03-26,NaT
2,22642,2009-03-20,2011-11-29,NaT
3,61038,2012-05-17,2013-01-29,NaT
4,1941,2007-03-27,NaT,2007-10-15


In [54]:
action_date = pd.to_datetime('2015-05-01')
apps_length['app_ref_date'] = action_date
#apps_length['app_exit_date'] = min(apps_length['cancellation_date'],apps_length['placement_date'],action_date)
apps_length['exit_date'] = apps_length[['cancellation_date', 'placement_date','app_ref_date']].min(axis=1)

In [55]:
apps_length.head()

Unnamed: 0,table_index,app_start_date,cancellation_date,placement_date,app_ref_date,exit_date
0,120687,2016-12-06,NaT,NaT,2015-05-01,2015-05-01
1,70870,2013-02-18,2015-03-26,NaT,2015-05-01,2015-03-26
2,22642,2009-03-20,2011-11-29,NaT,2015-05-01,2011-11-29
3,61038,2012-05-17,2013-01-29,NaT,2015-05-01,2013-01-29
4,1941,2007-03-27,NaT,2007-10-15,2015-05-01,2007-10-15


In [15]:
def get_last_active_date(apps,movs,ref_date):
    apps_cancellations = get_cancellation_date(movs)
    apps_placements = get_placement_date(movs)
    apps_length = apps.merge(apps_cancellations,how='left', left_on='table_index',right_on='application_id').merge(apps_placements,how='left', left_on='table_index',right_on='application_id')[['table_index','app_start_date','cancellation_date','placement_date']]
    apps_length['ref_date'] = ref_date
    apps_length['last_active_date'] = apps_length[['cancellation_date', 'placement_date','ref_date']].min(axis=1)
    
    return apps_length[['table_index','app_start_date','last_active_date']]    

In [16]:
apps_last_active_date = get_last_active_date(train_apps,movs,pd.to_datetime('2015-05-01'))

In [62]:
apps_last_active_date.head()

Unnamed: 0,table_index,app_start_date,last_active_date
0,37844,2010-05-03,2013-09-25
1,37850,2010-05-03,2010-08-30
2,37851,2010-05-03,2010-12-21
3,37740,2010-05-03,2010-06-16
4,37506,2010-05-03,2010-05-03


In [70]:
movs[movs['application_id'] == 37506]

Unnamed: 0,ute_id,movement_event_date,application_id,movement_type,movement_subtype,movement_result,movement_index
396789,2730524,2010-05-03,37506,application,DESEMPREGADO-NOVO EMPREGO,,37506
396790,2730524,2010-05-03,37506,intervention,RSI - DECLARAÃÃO EMITIDA,CONCLUIU,103165
396791,2730524,2010-05-03,37506,interview,587696347,ADMITIDO / COLOCADO,62257
396792,2730524,2010-05-06,37506,interview,587698732,RECUSA DE ENTIDADE EMPREGADORA - DESAJUSTAMENT...,62256
396793,2730524,2010-05-10,37506,intervention,COLOCAÃÃO,CONCLUIU,104284


In [18]:
def get_app_length(apps,movs,ref_date=None):
    apps_last_active_date = get_last_active_date(train_apps,movs,ref_date)
    apps_last_active_date['app_length'] = apps_last_active_date['last_active_date'] - apps_last_active_date['app_start_date']
    
    return apps_last_active_date

In [56]:
apps_length = get_app_length(apps,movs,pd.to_datetime('2015-05-01'))

In [58]:
apps_length.head()

Unnamed: 0,table_index,app_start_date,last_active_date,app_length,nmonths
0,37844,2010-05-03,2013-09-25,1241 days,40.772911
1,37850,2010-05-03,2010-08-30,119 days,3.909731
2,37851,2010-05-03,2010-12-21,232 days,7.622333
3,37740,2010-05-03,2010-06-16,44 days,1.445615
4,37506,2010-05-03,2010-05-03,0 days,0.0


In [54]:
def get_app_num_months(start_year,start_month,last_active_year,last_active_month):
    if (start_year == last_active_year):
        return (last_active_month - start_month) + 2
    else:
        return ((last_active_year - start_year) - 1)*12 + \
               ((12 - start_month) + 1) + \
               (last_active_month + 1)

In [113]:
def extend_data(apps,movs,ref_date,time_delta):
    apps_length = get_app_length(apps,movs,ref_date)
    toDataFrame=[]
    for i in xrange(0,apps_length.shape[0]):
        app_id = apps_length['table_index'][i]
        app_st_date = apps_length['app_start_date'][i]
        last_active_date = apps_length['last_active_date'][i]
        count = 0
        curr_ref_date = last_active_date - (int(count)*relativedelta(months=1))
        while (curr_ref_date >= app_st_date):
            toDataFrame.append([app_id,curr_ref_date])
            count += 1
            curr_ref_date = last_active_date - (int(count)*relativedelta(months=1))
    extended_data = pd.DataFrame(toDataFrame,columns=['application_id','ref_date'])
    return extended_data

In [114]:
extended_df=extend_data(train_apps,movs,pd.to_datetime('2015-05-01'),pd.Timedelta('30D'))

In [115]:
extended_df.head(60)

Unnamed: 0,application_id,ref_date
0,37844,2013-09-25
1,37844,2013-08-25
2,37844,2013-07-25
3,37844,2013-06-25
4,37844,2013-05-25
5,37844,2013-04-25
6,37844,2013-03-25
7,37844,2013-02-25
8,37844,2013-01-25
9,37844,2012-12-25


In [161]:
extended_df.describe()

Unnamed: 0,application_id
count,675763.0
mean,62826.775775
std,15094.959593
min,1097.0
25%,50555.0
50%,62109.0
75%,74303.0
max,124755.0


In [80]:
apps_length.head()

Unnamed: 0,table_index,app_start_date,last_active_date,app_length,nmonths
0,37844,2010-05-03,2013-09-25,1241 days,40.772911
1,37850,2010-05-03,2010-08-30,119 days,3.909731
2,37851,2010-05-03,2010-12-21,232 days,7.622333
3,37740,2010-05-03,2010-06-16,44 days,1.445615
4,37506,2010-05-03,2010-05-03,0 days,0.0


In [111]:
toDataFrame=[]
for i in xrange(0,5):
    app_id = apps_length['table_index'][i]
    app_st_date = apps_length['app_start_date'][i]
    last_active_date = apps_length['last_active_date'][i]
    count = 0
    curr_ref_date = last_active_date - (int(count)*relativedelta(months=1))
    while (curr_ref_date >= app_st_date):
        toDataFrame.append([app_id,curr_ref_date])
        count += 1
        curr_ref_date = last_active_date - (int(count)*relativedelta(months=1))
extended_data = pd.DataFrame(toDataFrame,columns=['application_id','ref_date'])
        

In [112]:
extended_data

Unnamed: 0,application_id,ref_date
0,37844,2013-09-25
1,37844,2013-08-25
2,37844,2013-07-25
3,37844,2013-06-25
4,37844,2013-05-25
5,37844,2013-04-25
6,37844,2013-03-25
7,37844,2013-02-25
8,37844,2013-01-25
9,37844,2012-12-25


In [None]:
extended_data = pd.DataFrame([(app_id, ) for app_id in apps_length['app_length'][0]/pd.Timedelta('30D')]) 

In [74]:
extended_data = pd.DataFrame([(app_id, ) for app_id in test_apps['table_index']])

In [75]:
extended_data.head()

Unnamed: 0,0
0,87466
1,88025
2,88023
3,88024
4,87312


In [43]:
def extend_data(train_apps,time_delta):
    from dateutil.relativedelta import relativedelta
    apps_cancelled = get_apps_cancellations(movs)
    apps_placed = get_apps_placements(movs)
    apps_length['app_exit_date'] = apps_length.apply(lambda x: get_exit_date(x,pd.to_datetime('2015-05-01')),axis=1)
    toDataFrame=[]
    for index, row in test_apps.iterrows():
        table_index=row["table_index"]
        current_date=row["app_start_date"]
        app_exit=row["app_exit_date"]
        while current_date<app_exit:
            toDataFrame.append([table_index,current_date])
            current_date+=relativedelta(months=1)
    extended_df=pd.DataFrame(toDataFrame,columns=["application_id","date"])
    return extended_df
    
    

In [44]:

extended_df=extend_data(train_apps,pd.Timedelta('30D'))
print extended_df.shape
#def extend_data(train_apps,movs_df,time_delta):
#    extend_df=pd.DataFrame()
#test=pd.DataFrame(data=[[2,3,4]])
#print extend_data
extended_df.head()
    
    

(1212552, 2)


Unnamed: 0,application_id,date
0,70870,2013-02-18
1,70870,2013-03-18
2,70870,2013-04-18
3,70870,2013-05-18
4,70870,2013-06-18


In [42]:
from dateutil.relativedelta import relativedelta
extended_data=pd.DataFrame(columns=[1,2])
extended_data.columns=["application_id","date"]
print extended_data.head()
test=extended_data.append([2])
print test.head()
theList=[]
theList.append([2,4])
theList.append([6,7])
print theList
final=pd.DataFrame(theList,columns=["test1","test2"])
toDataFrame=[]
for index, row in test_apps.iterrows():
    table_index=row["table_index"]
    current_date=row["app_start_date"]
    app_exit=row["app_exit_date"]
    #print current_date
    #current_date+=relativedelta(months=1)
    #print current_date
    #print "enter while"
    while current_date<app_exit:
        toDataFrame.append([table_index,current_date])
        current_date+=relativedelta(months=1)
     #   print "in while"
final=pd.DataFrame(toDataFrame,columns=["application_id","date"])
final.head(100)
    

Empty DataFrame
Columns: [application_id, date]
Index: []
     0 application_id date
0  2.0            NaN  NaN
[[2, 4], [6, 7]]


Unnamed: 0,application_id,date
0,70870,2013-02-18
1,70870,2013-03-18
2,70870,2013-04-18
3,70870,2013-05-18
4,70870,2013-06-18
5,70870,2013-07-18
6,70870,2013-08-18
7,70870,2013-09-18
8,70870,2013-10-18
9,70870,2013-11-18


In [71]:
test_apps.head()

Unnamed: 0,table_index,app_start_date,cancellation_date,placement_date,app_exit_date
0,120687,2016-12-06,NaT,NaT,2015-05-01
1,70870,2013-02-18,2015-03-26,NaT,2015-03-26
2,22642,2009-03-20,2011-11-29,NaT,2011-11-29
3,61038,2012-05-17,2013-01-29,NaT,2013-01-29
4,1941,2007-03-27,NaT,2007-10-15,2007-10-15


In [51]:
#Assert that applications dates match expected dates in train set
train_app_start_date = pd.to_datetime('2015-05-01') - 2*pd.Timedelta('182.5D')
train_app_end_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
assert sum(train_movs[train_movs['movement_type'].isin(['application'])]['movement_event_date'] < train_app_start_date) == 0
assert sum(train_movs[train_movs['movement_type'].isin(['application'])]['movement_event_date'] >= train_app_end_date) == 0

In [52]:
#Assert that movements dates match expected dates in train set
train_mov_start_date = pd.to_datetime('2015-05-01') - 2*pd.Timedelta('182.5D')
train_mov_end_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
assert sum(train_movs['movement_event_date'] < train_mov_start_date) == 0
assert sum(train_movs['movement_event_date'] >= train_mov_end_date) == 0

In [53]:
#Assert that applications dates match expected dates in test set
test_app_start_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
test_app_end_date = pd.to_datetime('2015-05-01')
assert sum(test_movs[test_movs['movement_type'].isin(['application'])]['movement_event_date'] < test_app_start_date) == 0
assert sum(test_movs[test_movs['movement_type'].isin(['application'])]['movement_event_date'] >= test_app_end_date) == 0

In [54]:
#Assert that movements dates match expected dates in test set
test_mov_start_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
test_mov_end_date = pd.to_datetime('2015-05-01')
assert sum(test_movs['movement_event_date'] < test_mov_start_date) == 0
assert sum(test_movs['movement_event_date'] >= test_mov_end_date) == 0

In [55]:
print(train_apps['app_start_date'].head())
print(train_apps['app_start_date'].tail())
print "Max diff:",(train_apps['app_start_date'].iloc[-1]-train_apps['app_start_date'].iloc[1])

87555   2014-05-02
87464   2014-05-02
88025   2014-05-02
87309   2014-05-02
87465   2014-05-02
Name: app_start_date, dtype: datetime64[ns]
92518   2014-10-30 00:00:00
92517   2014-10-30 00:00:00
93233   2014-10-30 00:00:00
92778   2014-10-30 10:02:35
92779   2014-10-30 10:02:35
Name: app_start_date, dtype: datetime64[ns]
Max diff: 181 days 10:02:35


In [56]:
print(train_movs['movement_event_date'].head())
print(train_movs['movement_event_date'].tail())
print "Max diff:",(train_movs['movement_event_date'].iloc[-1]-train_movs['movement_event_date'].iloc[1])

1161098   2014-05-02
179786    2014-05-02
179787    2014-05-02
179788    2014-05-02
296838    2014-05-02
Name: movement_event_date, dtype: datetime64[ns]
1134859   2014-10-30
1134858   2014-10-30
1134857   2014-10-30
1134856   2014-10-30
1182144   2014-10-30
Name: movement_event_date, dtype: datetime64[ns]
Max diff: 181 days 00:00:00


In [57]:
print(test_apps['app_start_date'].head())
print(test_apps['app_start_date'].tail())
print "Max diff:",(test_apps['app_start_date'].iloc[-1]-test_apps['app_start_date'].iloc[1])

93627   2014-10-30 12:23:57
93667   2014-10-31 00:00:00
93668   2014-10-31 00:00:00
93670   2014-10-31 00:00:00
92596   2014-10-31 00:00:00
Name: app_start_date, dtype: datetime64[ns]
99318    2015-04-30 00:00:00
99262    2015-04-30 00:00:00
99241    2015-04-30 00:00:00
100228   2015-04-30 00:00:00
100606   2015-04-30 10:59:22
Name: app_start_date, dtype: datetime64[ns]
Max diff: 181 days 10:59:22


In [58]:
print(test_movs['movement_event_date'].head())
print(test_movs['movement_event_date'].tail())
print "Max diff:",(test_movs['movement_event_date'].iloc[-1]-test_movs['movement_event_date'].iloc[1])

1138929   2014-10-31
598007    2014-10-31
598006    2014-10-31
783576    2014-10-31
1122546   2014-10-31
Name: movement_event_date, dtype: datetime64[ns]
1037211   2015-04-30
175934    2015-04-30
175935    2015-04-30
639656    2015-04-30
1199309   2015-04-30
Name: movement_event_date, dtype: datetime64[ns]
Max diff: 181 days 00:00:00
