In [1]:
# Interpolation stuff
import numpy as np
import pandas as pd
import pickle
import time
import datetime
import pickle
from itertools import product
from scipy import interpolate ## For other interpolation functions.

In [2]:
# Path to people.csv from ReadHatKaggle data set
FEATURE_FILE ='Data/act_train_features.csv'
# Path to act_train.csv from RedHatKaggle data set
OUTPUT ='Data/act_train_output.csv'
# Path to the test file
TEST_FILE = 'Data/act_test_features.csv'

# Path to the pickle files
SAVE_AS_DIR = 'Data/pickle'

In [3]:
# Read the train data set
train_data_df = pd.read_csv(FEATURE_FILE, parse_dates = ["date","people_date"])
train_data_df.sort_values(by = ['activity_id'], ascending = True, inplace = True)

In [4]:
# Read the train data output
train_output = pd.read_csv(OUTPUT)
train_output.sort_values(by = 'activity_id',ascending = True, inplace = True)

In [5]:
# Read the test data set
test_data_df = pd.read_csv(TEST_FILE, parse_dates = ["date","people_date"])

# Setting the outcome of group 17304 to 0
test = test_data_df[test_data_df['people_group_1'] == 17304]
test['outcome'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
train_data_df.set_index('people_group_1', inplace = True)
train_data_df.drop(17304, axis = 0, inplace = True)
train_data_df.reset_index(inplace = True)

In [7]:
test_data_df.set_index('people_group_1',inplace = True)
test_data_df.drop(17304,axis = 0,inplace = True)
test_data_df.reset_index(inplace = True)

In [8]:
# Merge train and outcome
train_data = pd.merge(train_data_df, train_output, on='activity_id')

In [9]:
# drop duplicates
duplicate_train = train_data.drop('activity_id',axis=1).duplicated()

In [10]:
## Dataframe and their contents
# train_data: contains all train data, 17304 dropped
# test_data: contains all test data, 17304 dropped
# test: contains 17304 test data, with outcome set to 0
# df: merged data frame of train and test, with duplicates dropped from train

In [11]:
def interpolateFun0(x):
    
    x = x.reset_index(drop=True)
    g = x['outcome'].copy()
    
    # This will never run for us
    if (g.shape[0] < 3): ## If we have at most two rows.
        x['outcome_filled'] = g 
        return x

    # If all items already present
    if np.sum(g.isnull()) == 0:
        x['outcome_filled'] = g
        return x
    
    # All outcomes
    out = g.values.copy()
    # indices where outcome is not null
    value_locs = np.where(~g.isnull())[0]
    
    # if everything is null (group is only from test)
    if len(value_locs) == 0:
        x['outcome_filled'] = np.full_like(out, np.nan)
        return x
    
    # If group has only 1 value
    # So here if group==date==0 or 1 can be reset
    if len(value_locs) == 1:
        fillval = .89 if (g[value_locs[0]] == 1) else .13
        g[g.isnull()] = fillval
        x['outcome_filled'] = g
        return x        
    
    # If the first non-null item is not at location '0' in the group
    # Meaning that if the outcome on the min_date is null
    # Fill everything until there with 0.89 or 0.13
    if value_locs[0]:
        fillval = .89 if (g[value_locs[0]] == 1) else .13
        out[0:value_locs[0]] = fillval

    # Interpolate holes in the middle
    for i in range(0, len(value_locs) - 1):
        beg = value_locs[i]
        end = value_locs[i + 1]
        
        if g[beg] != g[end]:
            out[beg+1:end] = np.interp(range(beg+1, end), [beg, end], [g[beg], g[end]])
        else:
            out[beg+1:end] = g[beg]

    # If the last value is a null, fill all values after it with either 0.89 or 0.13
    if end < (len(g) - 1):
        beg = value_locs[-1]
        fillval = .89 if (g[beg] == 1) else .13

        out[beg+1:] = fillval

    x['outcome_filled'] = out    
    return x

In [12]:
minactivdate = pd.Timestamp('2022-07-17 00:00:00')
maxactivdate = pd.Timestamp('2023-08-31 00:00:00')

day_range = [maxactivdate - datetime.timedelta(days=x) for x in range(0, (maxactivdate - minactivdate).days+1)][::-1]

In [13]:
train = train_data[~duplicate_train]
groups = train.people_group_1.unique()

allGroupsAndDays = pd.DataFrame.from_records(product(groups, day_range))
allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

group_date_outcome_mean = train.groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
group_date_outcome_mean = group_date_outcome_mean.to_frame().reset_index()
group_date_outcome_mean.rename(columns={'date': 'adate_mean'}, inplace=True)

allGroupsAndDays = pd.merge(allGroupsAndDays, group_date_outcome_mean, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
interpolated_values = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
interpolated_values = interpolated_values.rename(columns={'outcome': 'outcome_leak'})
interpolated_values = interpolated_values.drop(['adate_mean'], axis=1)

In [14]:
test_interpolated = pd.merge(test_data_df, interpolated_values, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
test_interpolated.drop('actdate_leak', axis=1, inplace=True)

In [15]:
test_interpolated['outcome_filled_nona'] = test_interpolated['outcome_filled'].fillna(train['outcome'].mean())

In [16]:
########## Uncomment the lines below to obtain pure interpolation methods ###################

In [17]:
# test_out =  test_interpolated[['activity_id', 'outcome_filled_nona']].copy()
# test_out.rename(columns={'outcome_filled_nona':'outcome'}, inplace=True)

In [18]:
# # test set contains the 17304 group and it was created earlier
# test_out = pd.concat([test_out,test])

In [19]:
# test_out[['outcome','activity_id']].set_index('activity_id').to_csv("InterpolationPure.csv")

In [21]:
test_interpolated = pd.concat([test_interpolated,test])

In [22]:
test_interpolated.to_pickle(SAVE_AS_DIR+'/test_withleak.pkl')

In [24]:
# This is not float, correct it
train['people_id'] = train['people_id'].astype('int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [25]:
######### Saving state of code here ##############
train_data.to_pickle(SAVE_AS_DIR+'/train_without17304.pkl')
train.to_pickle(SAVE_AS_DIR+'/traindedup_without17304.pkl')

In [23]:
train_data = pd.read_pickle(SAVE_AS_DIR+'/train_without17304.pkl')
train = pd.read_pickle(SAVE_AS_DIR+'/traindedup_without17304.pkl')

In [26]:
## Gives better distribution ##
from sklearn.model_selection import StratifiedKFold

KfoldOnPId=train_data

y = KfoldOnPId['outcome'].values
X = KfoldOnPId


X_train, X_test = {}, {}
y_train, y_test = {}, {}

skf = StratifiedKFold(5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in skf.split(X,y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train[i], X_test[i] = X.iloc[train_index], X.iloc[test_index]
#     X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
#                             train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1

('TRAIN:', array([      0,       1,       2, ..., 1398162, 1398164, 1398165]), 'TEST:', array([      6,       7,      19, ..., 1398151, 1398156, 1398163]))
('TRAIN:', array([      0,       1,       2, ..., 1398160, 1398163, 1398165]), 'TEST:', array([     10,      22,      28, ..., 1398161, 1398162, 1398164]))
('TRAIN:', array([      0,       2,       3, ..., 1398163, 1398164, 1398165]), 'TEST:', array([      1,       8,      11, ..., 1398142, 1398149, 1398158]))
('TRAIN:', array([      0,       1,       2, ..., 1398162, 1398163, 1398164]), 'TEST:', array([      5,      12,      15, ..., 1398153, 1398160, 1398165]))
('TRAIN:', array([      1,       5,       6, ..., 1398163, 1398164, 1398165]), 'TEST:', array([      0,       2,       3, ..., 1398154, 1398157, 1398159]))


In [27]:
# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

(0, 279634, 0.6976977048570632)
(1, 279634, 0.6976977048570632)
(2, 279633, 0.6976966237890377)
(3, 279633, 0.6976966237890377)
(4, 279632, 0.6976991188419065)


In [28]:
for k in range(len(X_train)):
    # Get unique groups and expand date range
    groups = X_train[k].people_group_1.unique()
    allGroupsAndDays = pd.DataFrame.from_records(product(groups, day_range))
    allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

    group_date_outcome_mean = X_train[k].groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
    group_date_outcome_mean = group_date_outcome_mean.to_frame().reset_index()
    group_date_outcome_mean.rename(columns={'date': 'adate_mean'}, inplace=True)

    allGroupsAndDays = pd.merge(allGroupsAndDays, group_date_outcome_mean, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
    interpolated_values = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
    interpolated_values = interpolated_values.rename(columns={'outcome': 'outcome_leak'})
    interpolated_values = interpolated_values.drop(['adate_mean'], axis=1)
    X_test[k] = pd.merge( X_test[k], interpolated_values, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
    X_test[k].drop('actdate_leak', axis=1, inplace=True)

In [29]:
import sklearn
total_features = []

for k in range(len(X_test)):
        
    X_test[k]['outcome_filled_nona'] = X_test[k].outcome_filled.fillna(X_train[k].outcome.mean())

    total_features.append(X_test[k])
    
    print(sklearn.metrics.roc_auc_score(X_test[k].outcome.values, X_test[k].outcome_filled_nona.values))

final_trained_new_features = pd.concat(total_features)
print('cv:', sklearn.metrics.roc_auc_score(final_trained_new_features.outcome.values, final_trained_new_features.outcome_filled_nona.values))

0.999863077697
0.999842562658
0.999807603352
0.99980856197
0.999848951121
('cv:', 0.9998338593764815)


In [30]:
train_interpolated_features = pd.merge(train_data,\
                                       final_trained_new_features[['activity_id', 'outcome_filled', 'outcome_filled_nona', 'outcome_leak']],\
                                       on='activity_id', how='left')

In [31]:
train_interpolated_features.shape

(1398166, 72)

In [32]:
train.shape # with duplicates dropped

(818977, 69)

In [33]:
train_data.shape #original df

(1398166, 69)

In [34]:
train_interpolated_features.to_pickle(SAVE_AS_DIR + '/train_withInterpolations.pkl')

In [35]:
################### END OF CODE - Discarded analysis can be found below #################################