In [17]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [18]:
import pandas as pd
import numpy as np

In [19]:
from utils import model_utils
from utils import db_utils
from utils import iefp_data_utils
from utils import feature_utils

In [20]:
conn = db_utils.connect_to_db()
apps, movs = iefp_data_utils.get_clean_data(conn)

In [47]:
def apps_cancelled_before_date(movs,date):
    return movs[(movs['movement_type'].isin(['cancellation'])) & (movs['movement_event_date'] < date)]['application_id'].unique()

def apps_placed_before_date(movs,date):
    return movs[(movs['movement_result'].isin(['ADMITIDO / COLOCADO'])) & (movs['movement_event_date'] < date)]['application_id'].unique()

def apps_exited_before_date(movs,date):
    return np.unique(np.append(apps_cancelled_before_date(movs,date),apps_placed_before_date(movs,date)))

def filter_by_time_range(df,column,start_date,end_date):
    return df[(df[column] >= start_date) & (df[column] < end_date)].copy()

def split_train_test_on_action_date(apps_df, movs_df,action_date,timedelta):
    #Define start and end dates for app_start_date filtering based on action date
    apps_train_st_date = action_date - 2*timedelta
    apps_train_end_date = action_date - timedelta
    apps_test_st_date = action_date - timedelta
    apps_test_end_date = action_date
    
    #Select applications for train and test sets based on their app_start_date
    train_apps = filter_by_time_range(apps_df,'app_start_date',apps_train_st_date,apps_train_end_date).sort_values('app_start_date')
    test_apps = filter_by_time_range(apps_df,'app_start_date',apps_test_st_date,apps_test_end_date).sort_values('app_start_date')
    
    #Select movements for train and test sets based on their movement_event_date
    train_movs = filter_by_time_range(movs_df[movs_df['application_id'].isin(train_apps['table_index'])],'movement_event_date',apps_train_st_date,apps_train_end_date).sort_values('movement_event_date')
    test_movs = filter_by_time_range(movs_df[movs_df['application_id'].isin(test_apps['table_index'])],'movement_event_date',apps_test_st_date,apps_test_end_date).sort_values('movement_event_date')    
    
    #Remove people who exited the system before the end date
    train_exitted = apps_exited_before_date(train_movs,action_date)
    train_apps = train_apps[np.logical_not(train_apps['table_index'].isin(train_exitted))]
    train_movs = train_movs[np.logical_not(train_movs['application_id'].isin(train_exitted))]
    
    test_exitted = apps_exited_before_date(test_movs,action_date)
    test_apps = test_apps[np.logical_not(test_apps['table_index'].isin(test_exitted))]
    test_movs = test_movs[np.logical_not(test_movs['application_id'].isin(test_exitted))]
    
    return (train_apps,train_movs,test_apps,test_movs)

In [48]:
train_apps,train_movs,test_apps,test_movs = split_train_test_on_action_date(apps,movs,pd.to_datetime('2015-05-01'),pd.Timedelta('182.5D'))

In [49]:
#Assert that no application id appears in both sets
assert sum(train_movs['application_id'].isin(test_movs['application_id'])) == 0

In [50]:
#Assert that applications which exitted the system before the action date do not figure in the train/test sets
action_date = pd.to_datetime('2015-05-01')
train_st_date = pd.to_datetime('2015-05-01') - 2*pd.Timedelta('182.5D')
train_end_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
selected_train_apps = filter_by_time_range(apps,'app_start_date',train_st_date,train_end_date)
selected_train_movs = filter_by_time_range(movs[movs['application_id'].isin(selected_train_apps['table_index'])],'movement_event_date',train_st_date,train_end_date)
exitted_train_apps = apps_exited_before_date(selected_train_movs,action_date)

assert sum(train_movs['application_id'].isin(exitted_train_apps)) == 0

test_st_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
test_end_date = pd.to_datetime('2015-05-01')
selected_test_apps = filter_by_time_range(apps,'app_start_date',test_st_date,test_end_date)
selected_test_movs = filter_by_time_range(movs[movs['application_id'].isin(selected_test_apps['table_index'])],'movement_event_date',test_st_date,test_end_date)
exitted_test_apps = apps_exited_before_date(selected_test_movs,action_date)

assert sum(test_movs['application_id'].isin(exitted_test_apps)) == 0

In [51]:
#Assert that applications dates match expected dates in train set
train_app_start_date = pd.to_datetime('2015-05-01') - 2*pd.Timedelta('182.5D')
train_app_end_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
assert sum(train_movs[train_movs['movement_type'].isin(['application'])]['movement_event_date'] < train_app_start_date) == 0
assert sum(train_movs[train_movs['movement_type'].isin(['application'])]['movement_event_date'] >= train_app_end_date) == 0

In [52]:
#Assert that movements dates match expected dates in train set
train_mov_start_date = pd.to_datetime('2015-05-01') - 2*pd.Timedelta('182.5D')
train_mov_end_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
assert sum(train_movs['movement_event_date'] < train_mov_start_date) == 0
assert sum(train_movs['movement_event_date'] >= train_mov_end_date) == 0

In [53]:
#Assert that applications dates match expected dates in test set
test_app_start_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
test_app_end_date = pd.to_datetime('2015-05-01')
assert sum(test_movs[test_movs['movement_type'].isin(['application'])]['movement_event_date'] < test_app_start_date) == 0
assert sum(test_movs[test_movs['movement_type'].isin(['application'])]['movement_event_date'] >= test_app_end_date) == 0

In [54]:
#Assert that movements dates match expected dates in test set
test_mov_start_date = pd.to_datetime('2015-05-01') - pd.Timedelta('182.5D')
test_mov_end_date = pd.to_datetime('2015-05-01')
assert sum(test_movs['movement_event_date'] < test_mov_start_date) == 0
assert sum(test_movs['movement_event_date'] >= test_mov_end_date) == 0

In [55]:
print(train_apps['app_start_date'].head())
print(train_apps['app_start_date'].tail())
print "Max diff:",(train_apps['app_start_date'].iloc[-1]-train_apps['app_start_date'].iloc[1])

87555   2014-05-02
87464   2014-05-02
88025   2014-05-02
87309   2014-05-02
87465   2014-05-02
Name: app_start_date, dtype: datetime64[ns]
92518   2014-10-30 00:00:00
92517   2014-10-30 00:00:00
93233   2014-10-30 00:00:00
92778   2014-10-30 10:02:35
92779   2014-10-30 10:02:35
Name: app_start_date, dtype: datetime64[ns]
Max diff: 181 days 10:02:35


In [56]:
print(train_movs['movement_event_date'].head())
print(train_movs['movement_event_date'].tail())
print "Max diff:",(train_movs['movement_event_date'].iloc[-1]-train_movs['movement_event_date'].iloc[1])

1161098   2014-05-02
179786    2014-05-02
179787    2014-05-02
179788    2014-05-02
296838    2014-05-02
Name: movement_event_date, dtype: datetime64[ns]
1134859   2014-10-30
1134858   2014-10-30
1134857   2014-10-30
1134856   2014-10-30
1182144   2014-10-30
Name: movement_event_date, dtype: datetime64[ns]
Max diff: 181 days 00:00:00


In [57]:
print(test_apps['app_start_date'].head())
print(test_apps['app_start_date'].tail())
print "Max diff:",(test_apps['app_start_date'].iloc[-1]-test_apps['app_start_date'].iloc[1])

93627   2014-10-30 12:23:57
93667   2014-10-31 00:00:00
93668   2014-10-31 00:00:00
93670   2014-10-31 00:00:00
92596   2014-10-31 00:00:00
Name: app_start_date, dtype: datetime64[ns]
99318    2015-04-30 00:00:00
99262    2015-04-30 00:00:00
99241    2015-04-30 00:00:00
100228   2015-04-30 00:00:00
100606   2015-04-30 10:59:22
Name: app_start_date, dtype: datetime64[ns]
Max diff: 181 days 10:59:22


In [58]:
print(test_movs['movement_event_date'].head())
print(test_movs['movement_event_date'].tail())
print "Max diff:",(test_movs['movement_event_date'].iloc[-1]-test_movs['movement_event_date'].iloc[1])

1138929   2014-10-31
598007    2014-10-31
598006    2014-10-31
783576    2014-10-31
1122546   2014-10-31
Name: movement_event_date, dtype: datetime64[ns]
1037211   2015-04-30
175934    2015-04-30
175935    2015-04-30
639656    2015-04-30
1199309   2015-04-30
Name: movement_event_date, dtype: datetime64[ns]
Max diff: 181 days 00:00:00
