In [406]:
import numpy as np
import pandas as pd
import catboost as cb
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
import config

%load_ext autoreload
%autoreload 2

# Feature tables live under the project-configured data directory.
DATA_PATH = config.get_data_path()

TRAIN_FEATURES = DATA_PATH / 'train_features.csv'
TEST_FEATURES = DATA_PATH / 'test_features.csv'

# Both tables are read the same way: first column as index, 'event_dttm' parsed as datetime.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0, parse_dates=['event_dttm'])
    for csv_path in (TRAIN_FEATURES, TEST_FEATURES)
)

def add_this_story_counter(full_df, mode='after'):
    """Count, for every row, the other rows with the same (customer_id, story_id).

    With ``mode='after'`` the rows are visited from last to first, so each row
    receives the number of later rows (in the frame's current order) that share
    its pair. Any other ``mode`` visits rows first to last and counts the
    earlier rows instead.

    The pair itself is used as the dictionary key. The former
    ``customer_id * 20000 + story_id`` encoding merged distinct pairs whenever
    ``story_id`` reached 20000 (e.g. ``(0, 20000)`` and ``(1, 0)``).

    Args:
        full_df: DataFrame with ``customer_id`` and ``story_id`` columns,
            already in the row order the counts should respect.
        mode: ``'after'`` for the reverse scan; any other value scans forward.
            The value is also embedded in the output column name.

    Returns:
        The same ``full_df`` object, with the float column
        ``this_story_{mode}_counter`` added in place.
    """
    # Pull the columns out once instead of doing two .iloc lookups per row.
    customers = full_df['customer_id'].tolist()
    stories = full_df['story_id'].tolist()
    length = len(customers)

    result = np.zeros(length)
    already_seen = defaultdict(int)

    positions = range(length - 1, -1, -1) if mode == 'after' else range(length)
    for i in positions:
        pair = (customers[i], stories[i])
        result[i] = already_seen[pair]
        already_seen[pair] += 1

    full_df[f'this_story_{mode}_counter'] = result
    return full_df

def add_story_counter(full_df, mode='after'):
    """Attach a per-customer running row count to ``full_df``.

    Each row receives the number of rows with the same ``customer_id`` that
    were visited before it. Rows are visited last-to-first when ``mode`` is
    ``'after'`` and first-to-last for any other value.

    Args:
        full_df: DataFrame with a ``customer_id`` column, in the row order the
            counts should respect.
        mode: Scan direction selector; also embedded in the column name.

    Returns:
        The same ``full_df`` object, with the float column
        ``stories_{mode}_counter`` added in place.
    """
    n_rows = full_df.shape[0]
    counts = np.zeros(n_rows)
    visits_per_customer = defaultdict(int)
    customer_column = full_df['customer_id']

    if mode == 'after':
        visit_order = reversed(range(n_rows))
    else:
        visit_order = range(n_rows)

    for pos in visit_order:
        customer = customer_column.iloc[pos]
        counts[pos] = visits_per_customer[customer]
        visits_per_customer[customer] += 1

    full_df[f'stories_{mode}_counter'] = counts
    return full_df

def near_story_time(full_df, mode='after'):
    """Add the gap in seconds to the same customer's neighbouring event time.

    Rows are visited last-to-first when ``mode`` is ``'after'`` and
    first-to-last otherwise. For each customer the function remembers the last
    distinct ``event_dttm`` it visited and the distinct one before that. A row
    gets ``(remembered time - its own time)`` in seconds:

    * the first visited row of a customer gets NaN;
    * a row whose time equals the remembered time uses the distinct time
      before it instead, or NaN if there is none yet;
    * otherwise the remembered time is used and then replaced by this row's.

    The value is signed: on a frame sorted ascending by ``event_dttm`` the
    forward scan yields negative numbers and the reverse scan positive ones.

    Fixes over the previous version: the output column is filled from the
    per-row array (it used to receive only the scalar computed for the last
    visited row), and ``np.nan`` replaces the ``np.NaN`` alias that NumPy 2.0
    removed.

    Args:
        full_df: DataFrame with ``customer_id`` and datetime ``event_dttm``
            columns, in the row order the scan should respect.
        mode: Scan direction selector; also embedded in the column name.

    Returns:
        The same ``full_df`` object, with the float column
        ``nearest_story_seconds_{mode}`` added in place.
    """
    customer_ids = full_df['customer_id'].tolist()
    event_times = full_df['event_dttm'].tolist()
    length = len(customer_ids)

    # NaN is the default for rows that have no usable neighbour.
    result_array = np.full(length, np.nan)

    # customer_id -> (last distinct time visited, distinct time before it or None)
    last_times = {}

    positions = range(length - 1, -1, -1) if mode == 'after' else range(length)
    for i in positions:
        current_id = customer_ids[i]
        curtime = event_times[i]

        if current_id not in last_times:
            last_times[current_id] = (curtime, None)
            continue

        latest, previous = last_times[current_id]
        gap = (latest - curtime).total_seconds()
        if gap == 0:
            # Same timestamp as the remembered one: fall back to the distinct
            # time before it, and leave the remembered pair unchanged.
            if previous is not None:
                result_array[i] = (previous - curtime).total_seconds()
        else:
            result_array[i] = gap
            last_times[current_id] = (curtime, latest)

    full_df[f'nearest_story_seconds_{mode}'] = result_array

    return full_df

def add_story_counters_features(train_df, test_df):
    """Compute the story counter / time-gap features over train and test together.

    The two frames are stacked, sorted by ``event_dttm``, passed through
    ``add_story_counter``, ``add_this_story_counter`` and ``near_story_time``
    in both ``'before'`` and ``'after'`` modes, and then given the two
    ``*_at_all`` totals (before + after).

    The frames are separated again with a provenance flag recorded at
    concatenation time. The previous version split on
    ``event_dttm <= train max``, which moves any test row dated at or before
    the last train event into the train frame.

    Args:
        train_df: Train rows; needs ``customer_id``, ``story_id`` and
            ``event_dttm`` columns.
        test_df: Test rows with the same required columns.

    Returns:
        ``(train_df, test_df)``: new frames with the feature columns added,
        each sorted by index.
    """
    # Temporary column name; removed again before returning.
    origin_col = '__from_train__'

    full_df = pd.concat([train_df, test_df], sort=False)
    # concat keeps train rows first, so a positional flag marks their origin
    # and travels with the rows through the sorts below.
    full_df[origin_col] = np.arange(len(full_df)) < len(train_df)
    full_df = full_df.sort_values('event_dttm')

    # Same call order as before, so the feature columns keep their order.
    for add_feature in (add_story_counter, add_this_story_counter, near_story_time):
        for mode in ('before', 'after'):
            full_df = add_feature(full_df, mode=mode)

    full_df = full_df.sort_index()
    full_df['stories_at_all'] = (
        full_df['stories_before_counter'] + full_df['stories_after_counter']
    )
    full_df['this_story_at_all'] = (
        full_df['this_story_before_counter'] + full_df['this_story_after_counter']
    )

    from_train = full_df[origin_col].to_numpy()
    full_df = full_df.drop(columns=origin_col)

    train_df = full_df[from_train]
    test_df = full_df[~from_train]

    return train_df.sort_index(), test_df.sort_index()

# Rebinds train_df / test_df to the feature-augmented frames, so running this
# cell again feeds the already-augmented frames back into the helper.
train_df, test_df = add_story_counters_features(train_df, test_df)

# Writing the result back to the feature CSVs is left disabled here.
# train_df.to_csv(TRAIN_FEATURES)
# test_df.to_csv(TEST_FEATURES)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [388]:
# Latest train timestamp; a later cell uses it to split the combined frame again.
dttm_thresh = train_df['event_dttm'].max()
# Stack train and test so the group features are computed over both.
full_df = pd.concat([train_df, test_df], sort=False)

In [389]:
# Chronologically ordered working frame plus a constant 1.0 column that the
# group-size aggregation sums up.
temp_df = (
    full_df
    .sort_values('event_dttm')
    .assign(ones=lambda frame: np.ones(frame.shape[0]))
)

In [391]:
# Number of distinct story_id values per (customer_id, event_dttm) group.
# After reset_index the count sits in a column that is still named 'story_id'.
unique_stories = (
    temp_df
    .groupby(['customer_id', 'event_dttm'])['story_id']
    .nunique()
).reset_index()

In [392]:
# One row per (customer_id, event_dttm) group, ordered by time; 'ones' holds
# the number of rows in the group.
# `.sum()` replaces `.apply(sum)`: same values, but it uses the native groupby
# reduction instead of passing the Python builtin, which recent pandas versions
# warn about.
a = (
    temp_df
    .groupby(['customer_id', 'event_dttm'])['ones']
    .sum()
    .reset_index()
    .sort_values('event_dttm')
)

In [393]:
# Constant 1.0 per group row; cumulated per customer in the next cell.
a['ones_for_group'] = np.ones(a.shape[0])

In [394]:
# Running 1-based ordinal of each (customer_id, event_dttm) group within its
# customer, following the row order of `a` (sorted by event_dttm when built).
a['group_number'] = a.groupby('customer_id')['ones_for_group'].cumsum()

In [395]:
# Attach the per-group columns of `a` to every event row. Both frames carry an
# 'ones' column, so the merge produces 'ones_x' / 'ones_y'.
# NOTE(review): merging on columns yields a fresh integer index, so the index
# temp_df had before this cell is not carried over — confirm this is intended.
temp_df = pd.merge(temp_df, a, on=['customer_id', 'event_dttm']).sort_values('event_dttm')

In [396]:
# Attach the distinct-story count per group. Both frames carry a 'story_id'
# column, so the merge produces 'story_id_x' (original id) / 'story_id_y' (count).
temp_df = pd.merge(temp_df, unique_stories, on=['customer_id', 'event_dttm']).sort_values('event_dttm')

In [397]:
# Remove the helper columns; note that this also discards 'ones_y', the group size.
temp_df = temp_df.drop(['ones_x', 'ones_y', 'ones_for_group'], axis=1)
# Restore the original story id under its own name and expose the per-group
# distinct-story count as 'unique_in_group'.
temp_df['story_id'] = temp_df['story_id_x']
temp_df['unique_in_group'] = temp_df['story_id_y']
# NOTE(review): 'group_count' is not created by any cell visible here —
# presumably it comes from the loaded CSV; drop raises KeyError if it is absent.
temp_df = temp_df.drop(['story_id_y', 'story_id_x', 'group_count'], axis=1)

In [399]:
# Continue with the enriched working frame as the combined frame.
full_df = temp_df

In [400]:
# Split back by time: rows up to the last train timestamp go to train, later rows to test.
# NOTE(review): assumes every test event is strictly later than the latest
# train event — TODO confirm; otherwise such rows end up in train_df.
train_df = full_df[full_df['event_dttm'] <= dttm_thresh]
test_df = full_df[full_df['event_dttm'] > dttm_thresh]

In [401]:
# Writes the frames, sorted by index, back to the same CSV paths the features
# were loaded from, overwriting those files.
train_df.sort_index().to_csv(TRAIN_FEATURES)
test_df.sort_index().to_csv(TEST_FEATURES)

In [198]:
# Show the column names of the train feature frame.
train_df.columns

Index(['customer_id', 'story_id', 'event_dttm', 'event', 'weekday', 'day',
       'hour', 'minute', 'is_weeked', 'product_0', 'product_1', 'product_2',
       'product_3', 'product_4', 'product_5', 'product_6', 'gender_cd', 'age',
       'marital_status_cd', 'children_cnt', 'job_position_cd', 'prod_not_nan',
       'prod_sum_opn', 'prod_sum_utl', 'prod_sum_cls', 'num_pages', 'num_urls',
       'num_elements', 'mean_font_size', 'text_amount', 'clusters', 'neutral',
       'negative', 'skip', 'speech', 'positive', 'num_messages', 'text_len',
       'num_guids', 'sum_amount', 'mean_amount', 'num_trans', 'std_amount',
       'std_amount_normalized', 'sum_amount_if_retail_shops',
       'sum_amount_if_retail_shops_percentage', 'sum_amount_if_other_shops',
       'sum_amount_if_other_shops_percentage',
       'sum_amount_if_professional_service',
       'sum_amount_if_professional_service_percentage',
       'sum_amount_if_transport', 'sum_amount_if_transport_percentage',
       'sum_amount_