In [1]:
# Interpolation stuff
import numpy as np
import pandas as pd
import pickle
import time
import datetime
import pickle
from itertools import product
from scipy import interpolate ## For other interpolation functions.

In [5]:
# Path to people.csv from ReadHatKaggle data set
FEATURE_FILE ='Data/act_train_features.csv'
# Path to act_train.csv from RedHatKaggle data set
OUTPUT ='Data/act_train_output.csv'
# Path to the test file
TEST_FILE = 'Data/act_test_features.csv'

# Path to the pickle files
SAVE_AS_DIR = 'Data/pickle'

In [48]:
# Read the train data set
train_data_df=pd.read_csv(FEATURE_FILE,parse_dates=["date","people_date"])
train_data_df.sort_values(by=['activity_id'],ascending=True, inplace=True)

In [49]:
# Read the train data output
train_output = pd.read_csv(OUTPUT)
train_output.sort_values(by='activity_id',ascending=True, inplace=True)

In [76]:
# Read the test data set
test_data_df=pd.read_csv(TEST_FILE,parse_dates=["date","people_date"])
test = test_data_df[test_data_df['people_group_1']==17304]
test['outcome'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
train_data_df.set_index('people_group_1',inplace=True)
train_data_df.drop(17304,axis=0,inplace=True)
train_data_df.reset_index(inplace=True)

In [81]:
test_data_df.set_index('people_group_1',inplace=True)
test_data_df.drop(17304,axis=0,inplace=True)
test_data_df.reset_index(inplace=True)

In [50]:
# Merge test and outcome
train_data = pd.merge(train_data_df,train_output,on='activity_id')

In [10]:
# drop duplicates
is_dup = train_data.drop('activity_id',axis=1).duplicated()
df = train_data[~is_dup]
df = pd.concat([df,test_data_df])

In [134]:
## Dataframe and their contents
# train_data: contains all train data, 17304 dropped
# test_data: contains all test data, 17304 dropped
# test: contains 17304 test data, with outcome set to 0
# df: merged data frame of train and test, with duplicates dropped from train

In [2]:
fv = []

def interpolateFun0(x):
    """Modified the original script from Kaggle - author's function rewritten in Python.
    The author interpolates between two known val nues by averaging them. We
    can think of this as 0th order interpolation. """


    x = x.reset_index(drop=True)
    g = x['outcome'].copy()
    
    global fv

    # This will never run for us
    if (g.shape[0] < 3): ## If we have at most two rows.
        x['outcome_filled'] = g 
        return x

    # If all items already present
    if np.sum(g.isnull()) == 0:
        x['outcome_filled'] = g
        return x
    
    # All outcomes
    out = g.values.copy()
    # location where outcome is not null
    value_locs = np.where(~g.isnull())[0]
    
    # if everything is null (group is only from test)
    if len(value_locs) == 0:
        x['outcome_filled'] = np.full_like(out, np.nan)
        return x
    
    # If group has only 1 value
    # So here if group==date==0 or 1 can be reset
    if len(value_locs) == 1:
        fillval = .89 if (g[value_locs[0]] == 1) else .13
        fv.append((g[value_locs[0]], fillval))
        g[g.isnull()] = fillval
        x['outcome_filled'] = g
        return x        
    
    # If the first non-null item is not at location '0' in the group
    # Meaning that if the outcome on the min_date is null
    # Fill everything until there with 0.89 or 0.13
    if value_locs[0]:
        fillval = .89 if (g[value_locs[0]] == 1) else .13
        fv.append((g[value_locs[0]], fillval))
        out[0:value_locs[0]] = fillval

    # Interpolate holes in the middle
    for i in range(0, len(value_locs) - 1):
        beg = value_locs[i]
        end = value_locs[i + 1]
        
        if g[beg] != g[end]:
            out[beg+1:end] = np.interp(range(beg+1, end), [beg, end], [g[beg], g[end]])
        else:
            out[beg+1:end] = g[beg]

    # If the last value is a null basicall, fill upto it
    if end < (len(g) - 1):
        beg = value_locs[-1]
        fillval = .89 if (g[beg] == 1) else .13
        fv.append((g[beg], fillval))

        out[beg+1:] = fillval

    x['outcome_filled'] = out    
    return x

In [3]:
minactivdate = pd.Timestamp('2022-07-17 00:00:00')
maxactivdate = pd.Timestamp('2023-08-31 00:00:00')

alldays = [maxactivdate - datetime.timedelta(days=x) for x in range(0, (maxactivdate - minactivdate).days+1)][::-1]

In [20]:
train = train_data[~is_dup]
groups = train.people_group_1.unique()

allGroupsAndDays = pd.DataFrame.from_records(product(groups, alldays))
allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

meanbycomdate = train.groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
meanbycomdate = meanbycomdate.to_frame().reset_index()
meanbycomdate.rename(columns={'date': 'adate_mean'}, inplace=True)

allGroupsAndDays = pd.merge(allGroupsAndDays, meanbycomdate, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
agad2 = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
agad2 = agad2.rename(columns={'outcome': 'outcome_leak'})
agad2 = agad2.drop(['adate_mean'], axis=1)

In [49]:
test_interpolated = pd.merge(test_data_df, agad2, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
test_interpolated.drop('actdate_leak', axis=1, inplace=True)

In [53]:
test_interpolated['outcome_filled_nona'] = test_interpolated['outcome_filled'].fillna(train['outcome'].mean())

In [None]:
########## Uncomment the lines below to obtain pure interpolation methods ###################

In [56]:
# test_out =  test_interpolated[['activity_id', 'outcome_filled_nona']].copy()
# test_out.rename(columns={'outcome_filled_nona':'outcome'}, inplace=True)

In [72]:
# # test set contains the 17304 group and it was created earlier
# test_out = pd.concat([test_out,test])

In [73]:
# test_out[['outcome','activity_id']].set_index('activity_id').to_csv("InterpolationPure.csv")

In [78]:
test_interpolated = pd.concat([test_withleak,test])

In [80]:
test_interpolated.to_pickle(SAVE_AS_DIR+'/test_withleak.pkl')

In [102]:
# This is not float, correct it
train['people_id'] = train['people_id'].astype('int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [154]:
######### Saving state of code here ##############
train_data.to_pickle(SAVE_AS_DIR+'/train_without17304.pkl')
train.to_pickle(SAVE_AS_DIR+'/traindedup_without17304.pkl')

In [23]:
train_data = pd.read_pickle(SAVE_AS_DIR+'/train_without17304.pkl')
train = pd.read_pickle(SAVE_AS_DIR+'/traindedup_without17304.pkl')

In [28]:
############# Predicting the extra feature on train set with help of CV on people_id #######################

## Attempt A: Kfold -> on peopleID ##
## Use either this or stratified K-fold ########
from sklearn.cross_validation import KFold

# Split on people_id instead
KfoldOnPId=train_data.sort_values('people_id').groupby('people_id').mean()['outcome'].reset_index()

X = KfoldOnPId['people_id']
y = KfoldOnPId['outcome']

X_train, X_test = {}, {}
y_train, y_test = {}, {}

kf = KFold(len(KfoldOnPId),5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    people_ids_train, people_ids_test = X.iloc[train_index], X.iloc[test_index]
    X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
                            train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1



('TRAIN:', array([    0,     1,     2, ..., 89480, 89481, 89482]), 'TEST:', array([    3,     8,    10, ..., 89458, 89468, 89479]))
('TRAIN:', array([    0,     2,     3, ..., 89479, 89480, 89481]), 'TEST:', array([    1,    12,    14, ..., 89472, 89476, 89482]))
('TRAIN:', array([    0,     1,     2, ..., 89479, 89481, 89482]), 'TEST:', array([    5,     6,    15, ..., 89473, 89475, 89480]))
('TRAIN:', array([    1,     3,     4, ..., 89479, 89480, 89482]), 'TEST:', array([    0,     2,    11, ..., 89470, 89478, 89481]))
('TRAIN:', array([    0,     1,     2, ..., 89480, 89481, 89482]), 'TEST:', array([    4,     7,     9, ..., 89471, 89474, 89477]))


In [29]:
# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

(0, 270326, 0.7223315552333109)
(1, 270812, 0.732138900787262)
(2, 318855, 0.6041304041021781)
(3, 268374, 0.7281703890838904)
(4, 269799, 0.7187128195434379)


In [44]:
## Gives better distribution ##
from sklearn.model_selection import StratifiedKFold

KfoldOnPId=train_data

y = KfoldOnPId['outcome'].values
X = KfoldOnPId


X_train, X_test = {}, {}
y_train, y_test = {}, {}

skf = StratifiedKFold(5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in skf.split(X,y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train[i], X_test[i] = X.iloc[train_index], X.iloc[test_index]
#     X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
#                             train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1

('TRAIN:', array([      0,       1,       2, ..., 1398162, 1398164, 1398165]), 'TEST:', array([      6,       7,      19, ..., 1398151, 1398156, 1398163]))
('TRAIN:', array([      0,       1,       2, ..., 1398160, 1398163, 1398165]), 'TEST:', array([     10,      22,      28, ..., 1398161, 1398162, 1398164]))
('TRAIN:', array([      0,       2,       3, ..., 1398163, 1398164, 1398165]), 'TEST:', array([      1,       8,      11, ..., 1398142, 1398149, 1398158]))
('TRAIN:', array([      0,       1,       2, ..., 1398162, 1398163, 1398164]), 'TEST:', array([      5,      12,      15, ..., 1398153, 1398160, 1398165]))
('TRAIN:', array([      1,       5,       6, ..., 1398163, 1398164, 1398165]), 'TEST:', array([      0,       2,       3, ..., 1398154, 1398157, 1398159]))


In [45]:
# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

(0, 279634, 0.6976977048570632)
(1, 279634, 0.6976977048570632)
(2, 279633, 0.6976966237890377)
(3, 279633, 0.6976966237890377)
(4, 279632, 0.6976991188419065)


In [46]:
for k in range(len(X_train)):
    # Get unique groups and expand date raneg
    groups = X_train[k].people_group_1.unique()
    allGroupsAndDays = pd.DataFrame.from_records(product(groups, alldays))
    allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

    meanbycomdate = X_train[k].groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
    meanbycomdate = meanbycomdate.to_frame().reset_index()
    meanbycomdate.rename(columns={'date': 'adate_mean'}, inplace=True)

    allGroupsAndDays = pd.merge(allGroupsAndDays, meanbycomdate, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
    agad2 = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
    agad2 = agad2.rename(columns={'outcome': 'outcome_leak'})
    agad2 = agad2.drop(['adate_mean'], axis=1)
    X_test[k] = pd.merge( X_test[k], agad2, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
    X_test[k].drop('actdate_leak', axis=1, inplace=True)

In [47]:
import sklearn
total_features = []

for k in range(len(X_test)):
        
    X_test[k]['outcome_filled_nona'] = X_test[k].outcome_filled.fillna(X_train[k].outcome.mean())

    total_features.append(X_test[k])
    
    print(sklearn.metrics.roc_auc_score(X_test[k].outcome.values, X_test[k].outcome_filled_nona.values))

final_trained_new_features = pd.concat(total_features)
print('cv:', sklearn.metrics.roc_auc_score(final_trained_new_features.outcome.values, final_trained_new_features.outcome_filled_nona.values))

0.999863077697
0.999842562658
0.999807603352
0.99980856197
0.999848951121
('cv:', 0.9998338593764815)


In [52]:
train_interpolated_features = pd.merge(train_data,\
                                       final_trained_new_features[['activity_id', 'outcome_filled', 'outcome_filled_nona', 'outcome_leak']],\
                                       on='activity_id', how='left')

In [53]:
train_interpolated_features.shape

(1398166, 72)

In [54]:
train.shape # with duplicates dropped

(818977, 69)

In [55]:
train_data.shape #original df

(1398166, 69)

In [56]:
train_interpolated_features.to_pickle(SAVE_AS_DIR+'/train_withInterpolations.pkl')

In [None]:
################### END OF CODE - Discarded analysis can be found below #################################

In [None]:
### SUMAARY ###
### without 17304, with duplicates ###
Kfold on peopleId = 0.94 AUC overall
Stratified on activityId = 0.99 AUC overall <- using this 
## without 17304, without duplicates ##

## with 17304, with duplicates ##
Kfold on peopleId = 0.94 AUC overall
Stratified on activityId = 0.9998338593764815 AUC overall <- using this 

## with 17304, without duplicates ##
SKf = 0.99993755377355209

In [None]:
############### DELETE STUFF BELOW HERE ################

In [62]:
## without 17304, without dups ##
## with 17304, with duplicates ##
### SKFOLD ###
KfoldOnPId=train

y = KfoldOnPId['outcome'].values
X = KfoldOnPId


X_train, X_test = {}, {}
y_train, y_test = {}, {}

skf = StratifiedKFold(5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in skf.split(X,y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train[i], X_test[i] = X.iloc[train_index], X.iloc[test_index]
#     X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
#                             train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1

# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

for k in range(len(X_train)):
    # Get unique groups and expand date raneg
    groups = X_train[k].people_group_1.unique()
    allGroupsAndDays = pd.DataFrame.from_records(product(groups, alldays))
    allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

    meanbycomdate = X_train[k].groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
    meanbycomdate = meanbycomdate.to_frame().reset_index()
    meanbycomdate.rename(columns={'date': 'adate_mean'}, inplace=True)

    allGroupsAndDays = pd.merge(allGroupsAndDays, meanbycomdate, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
    agad2 = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
    agad2 = agad2.rename(columns={'outcome': 'outcome_leak'})
    agad2 = agad2.drop(['adate_mean'], axis=1)
    X_test[k] = pd.merge( X_test[k], agad2, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
    X_test[k].drop('actdate_leak', axis=1, inplace=True)

total_features = []

for k in range(len(X_test)):
        
    X_test[k]['outcome_filled_nona'] = X_test[k].outcome_filled.fillna(X_train[k].outcome.mean())

    total_features.append(X_test[k])
    
    print(sklearn.metrics.roc_auc_score(X_test[k].outcome.values, X_test[k].outcome_filled_nona.values))

final_trained_new_features = pd.concat(total_features)
print('cv:', sklearn.metrics.roc_auc_score(final_trained_new_features.outcome.values, final_trained_new_features.outcome_filled_nona.values))

('TRAIN:', array([     0,      1,      2, ..., 818971, 818972, 818976]), 'TEST:', array([    11,     27,     34, ..., 818973, 818974, 818975]))
('TRAIN:', array([     0,      1,      2, ..., 818973, 818974, 818975]), 'TEST:', array([     5,      6,     10, ..., 818958, 818972, 818976]))
('TRAIN:', array([     0,      1,      2, ..., 818974, 818975, 818976]), 'TEST:', array([     3,     12,     13, ..., 818960, 818962, 818971]))
('TRAIN:', array([     1,      3,      5, ..., 818974, 818975, 818976]), 'TEST:', array([     0,      2,      4, ..., 818965, 818969, 818970]))
('TRAIN:', array([     0,      2,      3, ..., 818974, 818975, 818976]), 'TEST:', array([     1,      8,      9, ..., 818964, 818967, 818968]))
(0, 163796, 0.720420523089697)
(1, 163796, 0.720420523089697)
(2, 163795, 0.720424921395647)
(3, 163795, 0.720424921395647)
(4, 163795, 0.720424921395647)
0.999497641152
0.999440363969
0.999562401399
0.999618962542
0.999287302954
('cv:', 0.99948174654159239)


In [63]:
### KFOLD ###
## without 17304, without dups ##
# Split on people_id instead
KfoldOnPId=train.sort_values('people_id').groupby('people_id').mean()['outcome'].reset_index()

X = KfoldOnPId['people_id']
y = KfoldOnPId['outcome']

X_train, X_test = {}, {}
y_train, y_test = {}, {}

kf = KFold(len(KfoldOnPId),5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    people_ids_train, people_ids_test = X.iloc[train_index], X.iloc[test_index]
    X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
                            train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1
    
# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

for k in range(len(X_train)):
    # Get unique groups and expand date raneg
    groups = X_train[k].people_group_1.unique()
    allGroupsAndDays = pd.DataFrame.from_records(product(groups, alldays))
    allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

    meanbycomdate = X_train[k].groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
    meanbycomdate = meanbycomdate.to_frame().reset_index()
    meanbycomdate.rename(columns={'date': 'adate_mean'}, inplace=True)

    allGroupsAndDays = pd.merge(allGroupsAndDays, meanbycomdate, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
    agad2 = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
    agad2 = agad2.rename(columns={'outcome': 'outcome_leak'})
    agad2 = agad2.drop(['adate_mean'], axis=1)
    X_test[k] = pd.merge( X_test[k], agad2, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
    X_test[k].drop('actdate_leak', axis=1, inplace=True)

total_features = []

for k in range(len(X_test)):
        
    X_test[k]['outcome_filled_nona'] = X_test[k].outcome_filled.fillna(X_train[k].outcome.mean())

    total_features.append(X_test[k])
    
    print(sklearn.metrics.roc_auc_score(X_test[k].outcome.values, X_test[k].outcome_filled_nona.values))

final_trained_new_features = pd.concat(total_features)
print('cv:', sklearn.metrics.roc_auc_score(final_trained_new_features.outcome.values, final_trained_new_features.outcome_filled_nona.values))

('TRAIN:', array([    0,     1,     2, ..., 89480, 89481, 89482]), 'TEST:', array([    3,     8,    10, ..., 89458, 89468, 89479]))
('TRAIN:', array([    0,     2,     3, ..., 89479, 89480, 89481]), 'TEST:', array([    1,    12,    14, ..., 89472, 89476, 89482]))
('TRAIN:', array([    0,     1,     2, ..., 89479, 89481, 89482]), 'TEST:', array([    5,     6,    15, ..., 89473, 89475, 89480]))
('TRAIN:', array([    1,     3,     4, ..., 89479, 89480, 89482]), 'TEST:', array([    0,     2,    11, ..., 89470, 89478, 89481]))
('TRAIN:', array([    0,     1,     2, ..., 89480, 89481, 89482]), 'TEST:', array([    4,     7,     9, ..., 89471, 89474, 89477]))
(0, 270326, 0.7223315552333109)
(1, 270812, 0.732138900787262)
(2, 318855, 0.6041304041021781)
(3, 268374, 0.7281703890838904)
(4, 269799, 0.7187128195434379)
0.958096801912
0.958024364282
0.938319764321
0.958038189498
0.956913363592
('cv:', 0.9454443780956242)


In [57]:
## with 17304, with duplicates ##
### SKFOLD ###
KfoldOnPId=train_data_i

y = KfoldOnPId['outcome'].values
X = KfoldOnPId


X_train, X_test = {}, {}
y_train, y_test = {}, {}

skf = StratifiedKFold(5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in skf.split(X,y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train[i], X_test[i] = X.iloc[train_index], X.iloc[test_index]
#     X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
#                             train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1

# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

for k in range(len(X_train)):
    # Get unique groups and expand date raneg
    groups = X_train[k].people_group_1.unique()
    allGroupsAndDays = pd.DataFrame.from_records(product(groups, alldays))
    allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

    meanbycomdate = X_train[k].groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
    meanbycomdate = meanbycomdate.to_frame().reset_index()
    meanbycomdate.rename(columns={'date': 'adate_mean'}, inplace=True)

    allGroupsAndDays = pd.merge(allGroupsAndDays, meanbycomdate, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
    agad2 = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
    agad2 = agad2.rename(columns={'outcome': 'outcome_leak'})
    agad2 = agad2.drop(['adate_mean'], axis=1)
    X_test[k] = pd.merge( X_test[k], agad2, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
    X_test[k].drop('actdate_leak', axis=1, inplace=True)

total_features = []

for k in range(len(X_test)):
        
    X_test[k]['outcome_filled_nona'] = X_test[k].outcome_filled.fillna(X_train[k].outcome.mean())

    total_features.append(X_test[k])
    
    print(sklearn.metrics.roc_auc_score(X_test[k].outcome.values, X_test[k].outcome_filled_nona.values))

final_trained_new_features = pd.concat(total_features)
print('cv:', sklearn.metrics.roc_auc_score(final_trained_new_features.outcome.values, final_trained_new_features.outcome_filled_nona.values))

('TRAIN:', array([      0,       1,       2, ..., 2197286, 2197287, 2197288]), 'TEST:', array([     14,      31,      34, ..., 2197281, 2197289, 2197290]))
('TRAIN:', array([      0,       2,       3, ..., 2197288, 2197289, 2197290]), 'TEST:', array([      1,       4,      15, ..., 2197278, 2197283, 2197286]))
('TRAIN:', array([      0,       1,       3, ..., 2197288, 2197289, 2197290]), 'TEST:', array([      2,       5,       6, ..., 2197279, 2197280, 2197282]))
('TRAIN:', array([      0,       1,       2, ..., 2197286, 2197289, 2197290]), 'TEST:', array([      8,       9,      11, ..., 2197285, 2197287, 2197288]))
('TRAIN:', array([      1,       2,       4, ..., 2197288, 2197289, 2197290]), 'TEST:', array([      0,       3,       7, ..., 2197261, 2197273, 2197284]))
(0, 439459, 0.44395495370444116)
(1, 439459, 0.44395495370444116)
(2, 439458, 0.44395368840708327)
(3, 439458, 0.44395368840708327)
(4, 439457, 0.44395469863945736)
0.999931390082
0.999944062406
0.999929486143
0.99993760

In [58]:
## with 17304, with duplicates ##
### KFOLD ###
# Split on people_id instead
KfoldOnPId=train_data_i.sort_values('people_id').groupby('people_id').mean()['outcome'].reset_index()

X = KfoldOnPId['people_id']
y = KfoldOnPId['outcome']

X_train, X_test = {}, {}
y_train, y_test = {}, {}

kf = KFold(len(KfoldOnPId),5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    people_ids_train, people_ids_test = X.iloc[train_index], X.iloc[test_index]
    X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
                            train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1
    
# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

for k in range(len(X_train)):
    # Get unique groups and expand date raneg
    groups = X_train[k].people_group_1.unique()
    allGroupsAndDays = pd.DataFrame.from_records(product(groups, alldays))
    allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

    meanbycomdate = X_train[k].groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
    meanbycomdate = meanbycomdate.to_frame().reset_index()
    meanbycomdate.rename(columns={'date': 'adate_mean'}, inplace=True)

    allGroupsAndDays = pd.merge(allGroupsAndDays, meanbycomdate, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
    agad2 = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
    agad2 = agad2.rename(columns={'outcome': 'outcome_leak'})
    agad2 = agad2.drop(['adate_mean'], axis=1)
    X_test[k] = pd.merge( X_test[k], agad2, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
    X_test[k].drop('actdate_leak', axis=1, inplace=True)

total_features = []

for k in range(len(X_test)):
        
    X_test[k]['outcome_filled_nona'] = X_test[k].outcome_filled.fillna(X_train[k].outcome.mean())

    total_features.append(X_test[k])
    
    print(sklearn.metrics.roc_auc_score(X_test[k].outcome.values, X_test[k].outcome_filled_nona.values))

final_trained_new_features = pd.concat(total_features)
print('cv:', sklearn.metrics.roc_auc_score(final_trained_new_features.outcome.values, final_trained_new_features.outcome_filled_nona.values))

('TRAIN:', array([     0,      2,      3, ..., 151290, 151292, 151293]), 'TEST:', array([     1,      7,      8, ..., 151289, 151291, 151294]))
('TRAIN:', array([     0,      1,      2, ..., 151292, 151293, 151294]), 'TEST:', array([    13,     29,     31, ..., 151262, 151269, 151285]))
('TRAIN:', array([     0,      1,      4, ..., 151292, 151293, 151294]), 'TEST:', array([     2,      3,     12, ..., 151275, 151277, 151280]))
('TRAIN:', array([     1,      2,      3, ..., 151291, 151293, 151294]), 'TEST:', array([     0,      4,      5, ..., 151286, 151290, 151292]))
('TRAIN:', array([     0,      1,      2, ..., 151291, 151292, 151294]), 'TEST:', array([     9,     16,     19, ..., 151274, 151278, 151293]))
(0, 263162, 0.7322485769222)
(1, 322258, 0.5961031223429675)
(2, 274696, 0.7361228412499636)
(3, 269626, 0.7251118215602352)
(4, 268424, 0.7189334783774923)
0.95535559915
0.94038966679
0.947760633414
0.957864609794
0.960203048365
('cv:', 0.94204630996618932)


In [59]:
# drop duplicates
is_dup = train_data_i.drop('activity_id',axis=1).duplicated()
train_data_i = train_data_i[~is_dup]

(2197291, 69)

In [61]:
## with 17304, without duplicates ##
### SKFOLD ###
KfoldOnPId=train_data_i

y = KfoldOnPId['outcome'].values
X = KfoldOnPId


X_train, X_test = {}, {}
y_train, y_test = {}, {}

skf = StratifiedKFold(5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in skf.split(X,y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train[i], X_test[i] = X.iloc[train_index], X.iloc[test_index]
#     X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
#                             train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1

# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

for k in range(len(X_train)):
    # Get unique groups and expand date raneg
    groups = X_train[k].people_group_1.unique()
    allGroupsAndDays = pd.DataFrame.from_records(product(groups, alldays))
    allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

    meanbycomdate = X_train[k].groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
    meanbycomdate = meanbycomdate.to_frame().reset_index()
    meanbycomdate.rename(columns={'date': 'adate_mean'}, inplace=True)

    allGroupsAndDays = pd.merge(allGroupsAndDays, meanbycomdate, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
    agad2 = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
    agad2 = agad2.rename(columns={'outcome': 'outcome_leak'})
    agad2 = agad2.drop(['adate_mean'], axis=1)
    X_test[k] = pd.merge( X_test[k], agad2, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
    X_test[k].drop('actdate_leak', axis=1, inplace=True)

total_features = []

for k in range(len(X_test)):
        
    X_test[k]['outcome_filled_nona'] = X_test[k].outcome_filled.fillna(X_train[k].outcome.mean())

    total_features.append(X_test[k])
    
    print(sklearn.metrics.roc_auc_score(X_test[k].outcome.values, X_test[k].outcome_filled_nona.values))

final_trained_new_features = pd.concat(total_features)
print('cv:', sklearn.metrics.roc_auc_score(final_trained_new_features.outcome.values, final_trained_new_features.outcome_filled_nona.values))

('TRAIN:', array([      0,       1,       2, ..., 2197286, 2197287, 2197288]), 'TEST:', array([     14,      31,      34, ..., 2197281, 2197289, 2197290]))
('TRAIN:', array([      0,       2,       3, ..., 2197288, 2197289, 2197290]), 'TEST:', array([      1,       4,      15, ..., 2197278, 2197283, 2197286]))
('TRAIN:', array([      0,       1,       3, ..., 2197288, 2197289, 2197290]), 'TEST:', array([      2,       5,       6, ..., 2197279, 2197280, 2197282]))
('TRAIN:', array([      0,       1,       2, ..., 2197286, 2197289, 2197290]), 'TEST:', array([      8,       9,      11, ..., 2197285, 2197287, 2197288]))
('TRAIN:', array([      1,       2,       4, ..., 2197288, 2197289, 2197290]), 'TEST:', array([      0,       3,       7, ..., 2197261, 2197273, 2197284]))
(0, 439459, 0.44395495370444116)
(1, 439459, 0.44395495370444116)
(2, 439458, 0.44395368840708327)
(3, 439458, 0.44395368840708327)
(4, 439457, 0.44395469863945736)
0.999931390082
0.999944062406
0.999929486143
0.99993760

In [60]:
## with 17304, without duplicates ##
### KFOLD ###
# Split on people_id instead
KfoldOnPId=train_data_i.sort_values('people_id').groupby('people_id').mean()['outcome'].reset_index()

X = KfoldOnPId['people_id']
y = KfoldOnPId['outcome']

X_train, X_test = {}, {}
y_train, y_test = {}, {}

kf = KFold(len(KfoldOnPId),5, shuffle=True, random_state=12345)
i=0
for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    people_ids_train, people_ids_test = X.iloc[train_index], X.iloc[test_index]
    X_train[i], X_test[i] = train_data[train_data.people_id.isin(people_ids_train)], \
                            train_data[train_data.people_id.isin(people_ids_test)]
    y_train[i], y_test[i] = y[train_index], y[test_index]
    i +=1
    
# The mean should be nearly equal across all folds, something is messing up one of the groups :/
for k in range(len(X_train)):
    print(k, len(X_test[k]), X_test[k].outcome.mean())

for k in range(len(X_train)):
    # Get unique groups and expand date raneg
    groups = X_train[k].people_group_1.unique()
    allGroupsAndDays = pd.DataFrame.from_records(product(groups, alldays))
    allGroupsAndDays.columns = ['people_group_1', 'actdate_leak']

    meanbycomdate = X_train[k].groupby(['people_group_1', 'date'])['outcome'].agg('mean')

## Convert the calculation into a proper DataFrame.
    meanbycomdate = meanbycomdate.to_frame().reset_index()
    meanbycomdate.rename(columns={'date': 'adate_mean'}, inplace=True)

    allGroupsAndDays = pd.merge(allGroupsAndDays, meanbycomdate, left_on=['people_group_1', 'actdate_leak'], right_on=['people_group_1', 'adate_mean'], how='left')
    agad2 = allGroupsAndDays.groupby('people_group_1').apply(interpolateFun0)
    agad2 = agad2.rename(columns={'outcome': 'outcome_leak'})
    agad2 = agad2.drop(['adate_mean'], axis=1)
    X_test[k] = pd.merge( X_test[k], agad2, left_on=['people_group_1', 'date'], right_on=['people_group_1', 'actdate_leak'], how='left')
    X_test[k].drop('actdate_leak', axis=1, inplace=True)

total_features = []

for k in range(len(X_test)):
        
    X_test[k]['outcome_filled_nona'] = X_test[k].outcome_filled.fillna(X_train[k].outcome.mean())

    total_features.append(X_test[k])
    
    print(sklearn.metrics.roc_auc_score(X_test[k].outcome.values, X_test[k].outcome_filled_nona.values))

final_trained_new_features = pd.concat(total_features)
print('cv:', sklearn.metrics.roc_auc_score(final_trained_new_features.outcome.values, final_trained_new_features.outcome_filled_nona.values))

('TRAIN:', array([     0,      2,      3, ..., 151290, 151292, 151293]), 'TEST:', array([     1,      7,      8, ..., 151289, 151291, 151294]))
('TRAIN:', array([     0,      1,      2, ..., 151292, 151293, 151294]), 'TEST:', array([    13,     29,     31, ..., 151262, 151269, 151285]))
('TRAIN:', array([     0,      1,      4, ..., 151292, 151293, 151294]), 'TEST:', array([     2,      3,     12, ..., 151275, 151277, 151280]))
('TRAIN:', array([     1,      2,      3, ..., 151291, 151293, 151294]), 'TEST:', array([     0,      4,      5, ..., 151286, 151290, 151292]))
('TRAIN:', array([     0,      1,      2, ..., 151291, 151292, 151294]), 'TEST:', array([     9,     16,     19, ..., 151274, 151278, 151293]))
(0, 263162, 0.7322485769222)
(1, 322258, 0.5961031223429675)
(2, 274696, 0.7361228412499636)
(3, 269626, 0.7251118215602352)
(4, 268424, 0.7189334783774923)
0.95535559915
0.94038966679
0.947760633414
0.957864609794
0.960203048365
('cv:', 0.94204630996618932)
