In [1]:
import numpy as np
import pandas as pd
import pickle
import time

In [2]:
# Directories used by this notebook.
# PKL_DIR holds the pickled input frames; CSV_DIR is not referenced by the cells shown here.
PKL_DIR = 'Data/pickle'
CSV_DIR = 'Data'
# Output directory for the feature-augmented pickles written by the final cell.
# NOTE(review): SAVE_AS_DIR was used by that cell but never defined in this notebook,
# so a fresh kernel raised NameError. Defaulting to PKL_DIR - confirm the intended location.
SAVE_AS_DIR = PKL_DIR

# File names (with a leading '/' because they are appended directly to the directory string)
train_interpolated = '/train_withInterpolations.pkl'
test_interpolated = '/test_withleak.pkl'

In [3]:
# Load the train and test frames from their pickles (same directory, two file names)
train_data, test_data = (
    pd.read_pickle(PKL_DIR + pkl_name)
    for pkl_name in (train_interpolated, test_interpolated)
)

In [4]:
# Perform further preprocessing to get variations on date

In [5]:
# Earliest and latest activity dates in the training set.
# Series.min()/.max() are vectorised and skip NaT; the Python builtins iterate
# element by element and give order-dependent results when NaT is present.
min_date_train = train_data['date'].min()
max_date_train = train_data['date'].max()

In [6]:
# Earliest and latest people dates in the training set.
# Series.min()/.max() are vectorised and skip NaT; the Python builtins iterate
# element by element and give order-dependent results when NaT is present.
min_pdate_train = train_data['people_date'].min()
max_pdate_train = train_data['people_date'].max()

In [7]:
# Get reference dates from beginning of record time
def getReference(df, min_date=None, min_pdate=None):
    """Add elapsed-time features measured from reference dates.

    The frame is modified in place and also returned. Columns written:

    * ``adays_till_activity`` - ``date`` minus ``min_date`` (a timedelta).
    * ``pdays_till_activity`` - ``people_date`` minus ``min_pdate`` (a timedelta).
    * ``worked_for_day`` - business days from ``people_date`` (inclusive) to
      ``date`` (exclusive), as counted by ``np.busday_count``.

    Parameters
    ----------
    df : DataFrame
        Must contain datetime columns ``date`` and ``people_date``.
    min_date, min_pdate : optional
        Reference dates to subtract. When omitted, the notebook-level
        ``min_date_train`` / ``min_pdate_train`` are used, which is what the
        function always did before these parameters existed.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the three columns added.
    """
    # Fall back to the notebook globals only when the caller gives no reference date
    if min_date is None:
        min_date = min_date_train
    if min_pdate is None:
        min_pdate = min_pdate_train

    df['adays_till_activity'] = df['date'] - min_date
    df['pdays_till_activity'] = df['people_date'] - min_pdate

    # np.busday_count requires day-resolution datetime64 arrays
    start_days = df['people_date'].values.astype('datetime64[D]')
    end_days = df['date'].values.astype('datetime64[D]')
    df['worked_for_day'] = np.busday_count(start_days, end_days)
    return df

In [8]:
# Generate outcome for prev day and curr day per group_1
def getPreviousAndNextDatesOutcomes(df):
    """Attach neighbouring-date outcome features per ``people_group_1``.

    For every (``people_group_1``, ``date``) pair the mean of ``outcome_filled``
    is computed. Within each group, ordered by date, three columns are derived
    and left-merged back onto the rows:

    * ``next_outcome`` - mean outcome on the group's next recorded date.
    * ``prev_outcome`` - mean outcome on the group's previous recorded date.
    * ``diff_date``    - gap (timedelta) between this date and the previous one.

    The first date of a group has no previous entry and the last has no next
    entry, so those values are NaN / NaT.

    Parameters
    ----------
    df : DataFrame
        Must contain ``people_group_1``, ``date`` and ``outcome_filled``.

    Returns
    -------
    DataFrame
        A new frame sorted by (``people_group_1``, ``date``) with the three
        columns added. The input frame is left untouched.
    """
    keys = ['people_group_1', 'date']

    # Sort a copy instead of using inplace=True, so the caller's frame is not
    # reordered as a side effect; the returned row order is the same as before.
    df = df.sort_values(keys)

    # One row per (group, date) with the mean outcome; groupby sorts by the keys,
    # so rows within a group are in date order for the shifts below.
    per_day = df.groupby(keys)['outcome_filled'].mean().reset_index(name='avg_outcome')

    avg_by_group = per_day.groupby('people_group_1')['avg_outcome']
    prev_date = per_day.groupby('people_group_1')['date'].shift(1)

    per_day = per_day.assign(
        next_outcome=avg_by_group.shift(-1),
        prev_outcome=avg_by_group.shift(1),
        # Index-aligned subtraction replaces groupby.apply(lambda x: x - x.shift(1)):
        # the index apply returns depends on the pandas version's group_keys handling
        # and cannot always be assigned back to the frame.
        diff_date=per_day['date'] - prev_date,
    ).drop('avg_outcome', axis=1)

    return pd.merge(df, per_day, on=keys, how='left')

In [9]:
# Get people per group
def getPeopleperGroup(df):
    """Add distinct-count features per group and per group/date.

    Columns added (each through an inner merge on the grouping keys):

    * ``people_per_group``          - distinct ``people_id`` per ``people_group_1``.
    * ``people_per_group_date``     - distinct ``people_id`` per (``people_group_1``, ``date``).
    * ``activities_per_group_date`` - distinct ``activity_id`` per (``people_group_1``, ``date``).

    Parameters
    ----------
    df : DataFrame
        Must contain ``people_group_1``, ``date``, ``people_id`` and ``activity_id``.

    Returns
    -------
    DataFrame
        A new frame with the three count columns appended.
    """
    group_date = ['people_group_1', 'date']
    df = _mergeDistinctCount(df, 'people_group_1', 'people_id', 'people_per_group')
    df = _mergeDistinctCount(df, group_date, 'people_id', 'people_per_group_date')
    df = _mergeDistinctCount(df, group_date, 'activity_id', 'activities_per_group_date')
    return df


def _mergeDistinctCount(df, keys, counted, name):
    """Inner-merge onto ``df`` the number of distinct ``counted`` values per ``keys`` group, as column ``name``."""
    # Vectorised nunique instead of a per-group Python lambda; dropna=False counts
    # NaN as one distinct value, which matches len(x.unique()).
    counts = df.groupby(keys)[counted].nunique(dropna=False).reset_index(name=name)
    return pd.merge(df, counts, on=keys)

In [10]:
# Whether to keep rows that are identical in every column except activity_id.
# The cell previously hard-coded `if False:` (keep everything); the flag gives that
# choice a name. Set it to False to drop such duplicates.
KEEP_DUPLICATES = True

if KEEP_DUPLICATES:
    df = train_data
else:
    # duplicated() marks every repeat after the first occurrence
    is_dup = train_data.drop('activity_id', axis=1).duplicated()
    df = train_data[~is_dup]

In [11]:
# Sanity check: (rows, columns) of the training frame before the test set is appended
df.shape

(1398166, 72)

In [12]:
# Sanity check: (rows, columns) of the test frame
test_data.shape

(498687, 72)

In [13]:
# Stack train and test rows so the feature functions called below run over both sets together
df = pd.concat([df,test_data])

In [14]:
# Add adays_till_activity, pdays_till_activity and worked_for_day
df = getReference(df)

In [15]:
# Add next_outcome, prev_outcome and diff_date per people_group_1
df = getPreviousAndNextDatesOutcomes(df)

In [16]:
# Cast people_id to 32-bit integers (assumes the column is already numeric - TODO confirm)
df['people_id'] = df['people_id'].astype('int32')

In [17]:
# Add people_per_group, people_per_group_date and activities_per_group_date
df = getPeopleperGroup(df)

In [150]:
# Columns present in df but not in train_data, i.e. the features added above (set order is arbitrary)
list(set(df.columns).difference(set(train_data.columns)))

['pdays_till_activity',
 'adays_till_activity',
 'people_per_group',
 'next_outcome',
 'people_per_group_date',
 'prev_outcome',
 'diff_date',
 'activities_per_group_date',
 'worked_for_day']

In [164]:
# Columns to inspect after feature generation: activity_id (not itself a new
# column) followed by the nine features added by the functions above.
columns_added = [
    'activity_id',
    'adays_till_activity',
    'pdays_till_activity',
    'worked_for_day',
    'next_outcome',
    'prev_outcome',
    'diff_date',
    'people_per_group',
    'people_per_group_date',
    'activities_per_group_date',
]

In [151]:
# Shape of the subset of rows whose outcome is 0
df[df['outcome']==0].shape

(588273, 81)

In [152]:
# Shape of the subset of rows with no outcome (these become test_added in the split below)
df[df['outcome'].isnull()].shape

(333083, 81)

In [171]:
# Convert the timedelta column to whole days.
# `.dt.days` replaces `.astype('<m8[D]').values.astype(int)`: pandas >= 2.0 rejects
# day-resolution timedelta casts (only s/ms/us/ns are supported).
# Not idempotent: run once per kernel, since the column is numeric afterwards.
df['adays_till_activity'] = df['adays_till_activity'].dt.days

In [175]:
# Convert the remaining timedelta columns to whole days.
# Each column is converted from ITSELF; the previous version read
# 'adays_till_activity' on both right-hand sides (copy-paste slip), which
# overwrote both features with the activity-day offset.
# Not idempotent: run once per kernel, since the columns are numeric afterwards.
df['pdays_till_activity'] = df['pdays_till_activity'].dt.days
# diff_date is NaT for the first date of each group, so `.dt.days` yields NaN
# there (float column) rather than an arbitrary integer.
df['diff_date'] = df['diff_date'].dt.days

In [177]:
# df[columns_added][10:30]

In [178]:
### Split the df back into train and test parts
# NOTE(review): the split is by outcome presence, not by source frame, so any test
# row that already carries an outcome lands in train_added - confirm this is intended.
has_outcome = df['outcome'] >= 0
missing_outcome = df['outcome'].isnull()
train_added = df[has_outcome]
test_added = df[missing_outcome]

In [179]:
## Pickle results ##
######### Saving state of code here ##############
# Write the feature-augmented train/test frames under SAVE_AS_DIR
train_added.to_pickle(f"{SAVE_AS_DIR}/train_withDateFeatures_without17304.pkl")
test_added.to_pickle(f"{SAVE_AS_DIR}/test_withDateFeatures_without17304.pkl")