In [3]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../../")
from utils import model_utils
from utils import db_utils
from utils import feature_utils
from utils import iefp_data_utils
from utils import train_test_utils
import os

  from pandas.core import datetools


# Define functions

In [8]:
def generate_cumsum(movs):
    """Build running movement counts per application, one row per movement date.

    The detail columns (subtype, result, index, ute_id) are discarded,
    ``movement_type`` is one-hot encoded, the indicators are summed for each
    (application_id, movement_event_date) pair and those daily totals are then
    accumulated within every application.

    Returns a flat DataFrame holding ``application_id``,
    ``movement_event_date``, the running total ``movements`` and the running
    per-type counts; the known movement types are renamed to ``*_so_far`` and
    the cancellation count is removed when present. ``movs`` is not modified.
    """
    unused_columns = ['movement_subtype', 'movement_result', 'movement_index', 'ute_id']
    so_far_names = {
        'movement_type_application': 'apps_so_far',
        'movement_type_cancellation': 'cancellations_so_far',
        'movement_type_category_change': "cat_changes_so_far",
        'movement_type_convocation': 'convocations_so_far',
        'movement_type_intervention': 'interventions_so_far',
        'movement_type_interview': 'interviews_so_far',
    }

    # Every movement contributes 1 to the overall total and 1 to its own type.
    indicators = pd.get_dummies(
        movs.drop(unused_columns, axis=1).assign(movements=1),
        columns=['movement_type'],
    )

    # Totals per application and day first, then the running sum inside each
    # application (index level 0 is application_id).
    daily_totals = indicators.groupby(['application_id', 'movement_event_date']).sum()
    running = daily_totals.groupby(level=[0]).cumsum().reset_index(level=[0, 1])
    running = running.rename(columns=so_far_names)

    # Remove because test set should not have cancellations
    if 'cancellations_so_far' in running.columns:
        running = running.drop(['cancellations_so_far'], axis=1)

    return running

def generate_movs_so_far(extended_data, movs):
    """Attach the most recent running movement counts to every observation row.

    Parameters
    ----------
    extended_data : DataFrame
        Observation rows. Must contain ``application_id``, ``ref_date``,
        ``app_start_date`` and ``ltu``; the last two are dropped here.
    movs : DataFrame
        Raw movements, in the shape expected by ``generate_cumsum``.

    Returns
    -------
    DataFrame
        A copy of the observation rows (``entry_type == 'observation'``) with
        the cumulative movement columns carried forward from the latest
        movement of the same application on or before the observation date,
        plus the helper columns ``entry_type`` and ``sort_date``.

    Notes
    -----
    The forward fill is done per ``application_id``. A single fill over the
    whole frame would let the first observation of an application pick up the
    counts of the previous application in sort order whenever it has no
    earlier movement of its own. Such observations now keep NaN counts.
    """
    movs_cumsum = generate_cumsum(movs)

    extended_trimmed = extended_data.drop(['app_start_date','ltu'], axis=1)
    extended_trimmed['entry_type'] = 'observation'
    movs_cumsum['entry_type'] = 'mov_date'

    # Combine the extended data (has observation dates) with the movements
    # (has cumsum data); each row has exactly one of ref_date / movement_event_date.
    movs_so_far = pd.concat([extended_trimmed, movs_cumsum])
    movs_so_far['sort_date'] = movs_so_far['ref_date'].fillna(movs_so_far['movement_event_date'])

    # Sort by application and date. On equal dates 'mov_date' sorts before
    # 'observation', so a same-day movement is already counted for the observation.
    movs_so_far = movs_so_far.sort_values(['application_id','sort_date','entry_type'])
    movs_so_far = movs_so_far.drop(['movement_event_date','ref_date'], axis=1)

    # Forward fill inside each application only, so values never leak from one
    # application into the next. Reassigning (instead of an in-place fillna
    # with method=) also avoids the deprecated keyword.
    fill_columns = [col for col in movs_so_far.columns if col != 'application_id']
    movs_so_far[fill_columns] = movs_so_far.groupby('application_id')[fill_columns].ffill()

    # Return an independent frame so callers can add columns without a
    # SettingWithCopyWarning.
    return movs_so_far[movs_so_far['entry_type']=="observation"].copy()

# Define variables

In [11]:
# Passed unchanged to the split and labelling helpers.
# Presumably the long-term-unemployment threshold in months -- TODO confirm.
ltu_length = 12

# Offset added to the action date when the test labels are generated.
test_window_size = pd.Timedelta('365D')

# Length of the training window that ends on the action date.
train_timedelta = pd.Timedelta('730D')

# Reference date handed to the split, labelling and extension helpers.
action_date = pd.to_datetime('2015-04-30')

# First day of the training window.
train_st_date = action_date - train_timedelta

# Pipeline

In [4]:
#Connect to DB
# The connection is opened by the project helper without arguments, so any
# connection settings are resolved inside db_utils rather than in this notebook.
conn = db_utils.connect_to_db()
# Load the two working frames; `apps` and `movs` are the inputs of the cells
# that follow (presumably applications and movements -- confirm in iefp_data_utils).
apps,movs = iefp_data_utils.get_clean_data(conn) 

In [12]:
# Split applications and movements into train and test sets.
# The helper receives the action date, the start of the training window and
# the LTU length.
train_apps, test_apps, train_movs, test_movs = train_test_utils.split_train_test_apps(
    apps,
    movs,
    action_date,
    train_st_date,
    ltu_length,
)

In [13]:
# Generate LTU labels: train labels as of the action date, test labels as of
# the action date plus the test window.
# NOTE(review): both calls receive the full `movs` frame, not train_movs /
# test_movs -- confirm that this is intended.
train_labels = train_test_utils.get_ltu_label_on_date(
    train_apps, movs, action_date, ltu_length
)
test_labels = train_test_utils.get_ltu_label_on_date(
    test_apps, movs, action_date + test_window_size, ltu_length
)

Generating LTU/Non-LTU labels
Generating LTU/Non-LTU labels


In [15]:
# Extend data: build the extended train and test frames from the applications,
# the movements and the labels generated above.
# NOTE(review): the last argument is a 30-day Timedelta for train and None for
# test; its meaning is defined in train_test_utils.extend_data.
extended_train = train_test_utils.extend_data(
    train_apps, movs, train_labels, action_date, pd.Timedelta('30D')
)
extended_test = train_test_utils.extend_data(
    test_apps, movs, test_labels, action_date, None
)

# Breakout: generate_cumsum step by step

In [17]:
# Step 1: drop the detail columns and give every movement a unit weight so
# that sums become counts.
movs_count = movs.drop(
    ['movement_subtype', 'movement_result', 'movement_index', 'ute_id'], axis=1
).assign(movements=1)

movs_count.head()

Unnamed: 0,movement_event_date,application_id,movement_type,movements
5,2016-12-06,120500,application,1
6,2016-12-06,120500,intervention,1
7,2017-01-10,120500,convocation,1
8,2017-01-16,120500,intervention,1
9,2017-02-03,120500,convocation,1


In [18]:
# Step 2: one-hot encode movement_type so that every movement type gets its
# own movement_type_<type> indicator column; the other columns are kept as is.
# Note: this rebinds movs_count, so the cell cannot be re-run on its own
# (movement_type no longer exists after the first run).
movs_count = pd.get_dummies(movs_count, columns=['movement_type'])

movs_count.head()

Unnamed: 0,movement_event_date,application_id,movements,movement_type_application,movement_type_cancellation,movement_type_category_change,movement_type_convocation,movement_type_intervention,movement_type_interview
5,2016-12-06,120500,1,1,0,0,0,0,0
6,2016-12-06,120500,1,0,0,0,0,1,0
7,2017-01-10,120500,1,0,0,0,1,0,0
8,2017-01-16,120500,1,0,0,0,0,1,0
9,2017-02-03,120500,1,0,0,0,1,0,0


In [19]:
# Step 3: total the indicators per application and day, then accumulate those
# daily totals inside each application (index level 0 is application_id).
movs_cumsum = (
    movs_count
    .groupby(['application_id', 'movement_event_date']).sum()
    .groupby(level=[0]).cumsum()
)

movs_cumsum.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,movements,movement_type_application,movement_type_cancellation,movement_type_category_change,movement_type_convocation,movement_type_intervention,movement_type_interview
application_id,movement_event_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2007-01-02,1,1,0,0,0,0,0
1,2007-02-21,2,1,0,0,1,0,0
1,2007-03-05,4,1,0,0,1,2,0
1,2007-07-02,5,1,0,0,1,3,0
1,2007-09-17,6,1,0,0,1,4,0


In [20]:
# Step 4: move both index levels back into columns and give the per-type
# running counts their final *_so_far names.
column_names = {
    'movement_type_application': 'apps_so_far',
    'movement_type_cancellation': 'cancellations_so_far',
    'movement_type_category_change': "cat_changes_so_far",
    'movement_type_convocation': 'convocations_so_far',
    'movement_type_intervention': 'interventions_so_far',
    'movement_type_interview': 'interviews_so_far',
}
movs_cumsum = movs_cumsum.reset_index(level=[0, 1]).rename(columns=column_names)

movs_cumsum.head()

Unnamed: 0,application_id,movement_event_date,movements,apps_so_far,cancellations_so_far,cat_changes_so_far,convocations_so_far,interventions_so_far,interviews_so_far
0,1,2007-01-02,1,1,0,0,0,0,0
1,1,2007-02-21,2,1,0,0,1,0,0
2,1,2007-03-05,4,1,0,0,1,2,0
3,1,2007-07-02,5,1,0,0,1,3,0
4,1,2007-09-17,6,1,0,0,1,4,0


In [21]:
# Step 5: remove the cancellation count because the test set should not have
# cancellations; errors='ignore' makes this a no-op when the column is absent.
movs_cumsum = movs_cumsum.drop(['cancellations_so_far'], axis=1, errors='ignore')

movs_cumsum.head()

Unnamed: 0,application_id,movement_event_date,movements,apps_so_far,cat_changes_so_far,convocations_so_far,interventions_so_far,interviews_so_far
0,1,2007-01-02,1,1,0,0,0,0
1,1,2007-02-21,2,1,0,1,0,0
2,1,2007-03-05,4,1,0,1,2,0
3,1,2007-07-02,5,1,0,1,3,0
4,1,2007-09-17,6,1,0,1,4,0


In [1]:
def generate_apps_cumsum(apps_df):
    """Number each application within its ``ute_id`` with a running count.

    Parameters
    ----------
    apps_df : DataFrame
        Must contain ``table_index``, ``ute_id`` and ``app_start_date``.
        It is not modified.

    Returns
    -------
    DataFrame
        One row per (ute_id, table_index, app_start_date) with the columns
        ``ute_id``, ``table_index``, ``app_start_date``, ``apps_cumsum`` (the
        running count within the ute_id, including the row itself),
        ``date_type`` (always ``'app_start_date'``) and ``prev_apps_cumsum``
        (the running count excluding the row itself).

    Notes
    -----
    ``.assign`` builds the counter on a fresh frame. Setting the column on the
    column-subset selection directly raised a SettingWithCopyWarning.

    NOTE(review): groupby sorts by its keys, so the running count inside a
    ute_id follows ``table_index`` order, not ``app_start_date`` order. Confirm
    that table_index increases with the start date.
    """
    apps_count = apps_df[['table_index','ute_id','app_start_date']].assign(apps=1)

    apps_cumsum = (
        apps_count
        .groupby(['ute_id','table_index','app_start_date']).sum()
        .groupby(level=[0]).cumsum()          # accumulate within each ute_id
        .rename(columns={'apps':'apps_cumsum'})
        .reset_index()
    )

    # Marks these rows as application start dates when merged with the markers
    # built by generate_markers.
    apps_cumsum['date_type'] = 'app_start_date'
    apps_cumsum['prev_apps_cumsum'] = apps_cumsum['apps_cumsum'] - 1
    return apps_cumsum

In [33]:
# Running application count per ute_id, computed over the full applications
# frame; the first rows are shown as a sanity check.
apps_cumsum = generate_apps_cumsum(apps)
apps_cumsum.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,ute_id,table_index,app_start_date,apps_cumsum,date_type,prev_apps_cumsum
0,710,120500,2016-12-06,1,app_start_date,0
1,818,71481,2013-02-18,1,app_start_date,0
2,820,23430,2009-03-20,1,app_start_date,0
3,820,61578,2012-05-17,2,app_start_date,1
4,832,2483,2007-03-27,1,app_start_date,0


In [85]:
def generate_markers(apps_df, historical_limit):
    """Create two dated marker rows for every (ute_id, table_index) pair.

    For each pair the latest ``app_start_date`` is taken. One marker row keeps
    that date (``date_type == 'app_start_date'``) and the other is shifted back
    by ``historical_limit`` (``date_type == 'historical_st_date'``).

    ``historical_limit`` is anything ``pd.Timedelta`` accepts, e.g. ``'730D'``.

    Returns a DataFrame with the columns ``ute_id``, ``table_index``,
    ``app_st_date``, ``date_type`` and ``date``, sorted by ``ute_id`` and
    ``date``.
    """
    lookback = pd.Timedelta(historical_limit)

    # Latest start date per (ute_id, table_index); a copy of it goes into the
    # index so that it survives the stack below as an identifying column.
    latest_start = apps_df.groupby(['ute_id', 'table_index'])['app_start_date'].max()
    wide = latest_start.to_frame()
    wide = wide.assign(app_st_date=wide['app_start_date'])
    wide = wide.set_index(['app_st_date'], append=True)
    wide['historical_st_date'] = wide['app_start_date'] - lookback

    # Wide -> long: the two date columns become two rows per application. The
    # unnamed stacked level and the unnamed value column get their final names.
    long_form = wide.stack().to_frame().reset_index(level=[0, 1, 2, 3])
    long_form = long_form.rename(columns={'level_3': 'date_type', 0: 'date'})

    return long_form.sort_values(['ute_id', 'date'])

In [86]:
# Marker rows for every application with a 730-day look-back: one row at the
# start date and one row 730 days earlier.
apps_markers = generate_markers(apps, '730D')
apps_markers.head()

Unnamed: 0,ute_id,table_index,app_st_date,date_type,date
1,710,120500,2016-12-06,historical_st_date,2014-12-07
0,710,120500,2016-12-06,app_start_date,2016-12-06
3,818,71481,2013-02-18,historical_st_date,2011-02-19
2,818,71481,2013-02-18,app_start_date,2013-02-18
5,820,23430,2009-03-20,historical_st_date,2007-03-21


In [100]:
def generate_historical(apps_df, historical_limit):
    """Count the earlier applications of the same ``ute_id`` inside a look-back window.

    Parameters
    ----------
    apps_df : DataFrame
        Must contain ``ute_id``, ``table_index`` and ``app_start_date``.
    historical_limit : str or Timedelta
        Length of the look-back window, e.g. ``'730D'``.

    Returns
    -------
    DataFrame
        Indexed by ``app_st_date`` with the columns ``ute_id``,
        ``table_index`` and ``prev_apps_within_limit``.

    How it works: every application contributes a marker at its start date and
    one at ``start - historical_limit``. Only start-date markers match a row of
    the cumulative counts. A window-start marker takes the ``prev_apps_cumsum``
    of the next start-date marker in (ute_id, date) order. The difference of
    the two values is the number of applications inside the window.
    """
    apps_cumsum = generate_apps_cumsum(apps_df)
    apps_markers = generate_markers(apps_df, historical_limit)

    # Left merge keeps the (ute_id, date) order of the markers. Rows with
    # date_type 'historical_st_date' find no partner and get NaN.
    apps_historical = pd.merge(
        apps_markers,
        apps_cumsum,
        how='left',
        left_on=['ute_id','table_index','date_type','app_st_date'],
        right_on=['ute_id','table_index','date_type', 'app_start_date'],
    ).drop(['apps_cumsum'], axis=1)

    # Back-fill the window-start markers from the next start-date marker.
    # Assign the result back: an in-place fillna on the selected column does
    # not update the frame under pandas Copy-on-Write, and fillna(method=...)
    # is deprecated.
    apps_historical['prev_apps_cumsum'] = apps_historical['prev_apps_cumsum'].bfill()

    # One row per application with the two counts side by side.
    apps_historical = pd.pivot_table(
        apps_historical,
        values="prev_apps_cumsum",
        index=['ute_id', 'table_index', 'app_st_date'],
        columns='date_type',
    )
    apps_historical['prev_apps_within_limit'] = apps_historical['app_start_date'] - apps_historical['historical_st_date']
    apps_historical = apps_historical.reset_index(level=[0,1]).drop(['historical_st_date','app_start_date'], axis=1)
    return apps_historical

In [101]:
# Number of earlier applications per ute_id within the 730 days before each
# application; the first ten rows are shown as a sanity check.
apps_historical = generate_historical(apps, '730D')
apps_historical.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


date_type,ute_id,table_index,prev_apps_within_limit
app_st_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-12-06,710,120500,0.0
2013-02-18,818,71481,0.0
2009-03-20,820,23430,0.0
2012-05-17,820,61578,0.0
2007-03-27,832,2483,0.0
2007-12-10,832,9285,1.0
2007-11-05,836,8343,0.0
2011-02-15,836,46058,0.0
2011-07-27,836,50813,1.0
2016-01-26,836,110006,0.0
