In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
tqdm.pandas()

In [2]:
# Only the customer key and the age are read from the customers file.
customers = pd.read_csv(
    './data/customers.csv',
    usecols=['customer_id', 'age'],
)

In [3]:
# Bucket edges for pd.cut; with its default right-closed intervals these give
# (-1, 19], (19, 29], (29, 39], (39, 49], (49, 59], (59, 69], (69, 119].
age_bins = [-1, 19, 29, 39, 49, 59, 69, 119]

# Ages that are missing or fall outside the edges end up as NaN in 'age_bins'.
customers['age_bins'] = pd.cut(customers['age'], bins=age_bins)

In [4]:
# Load the purchase log: date, customer and article only.
# Both identifiers are read as str (presumably so article ids keep their exact
# textual form -- confirm against the data); 't_dat' is parsed as a datetime.
transactions = pd.read_csv(
    './data/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    parse_dates=['t_dat'],
    dtype={'article_id': str, 'customer_id': str},
)

In [5]:
# Only the customer_id column of the sample submission is read; the per-bin
# loop below merges the predictions onto it.
submission = pd.read_csv(
    './data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'},
)

In [6]:
# Number of popular articles kept by get_general_prediction_using_growth_factor.
N = 12

# Distinct values of the 'age_bins' column, in order of first appearance;
# includes NaN when some customers have no bucket.
bins = pd.unique(customers['age_bins']).tolist()

In [7]:
def calculate_growth_factor(df):
    """Attach weekly sales counts and a growth factor to every transaction.

    Each transaction is assigned to a weekly cycle that ends on a Tuesday
    (``dayofweek == 1``): the cycle end is the transaction date itself when it
    is a Tuesday, otherwise the next Tuesday. The growth factor of a row is
    the number of sales of its article in the most recent cycle divided by the
    number of sales of that article in the row's own cycle.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with a datetime column ``t_dat`` and an ``article_id``
        column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus ``last_day_sale_cycle``,
        ``count`` (sales of the article in the row's cycle), ``count_targ``
        (sales of the article in the most recent cycle) and ``growth_factor``.
        Rows whose article has no sale in the most recent cycle are dropped
        by the inner join.
    """
    # Days from each transaction to the Tuesday that closes its cycle:
    # Mon -> 1, Tue -> 0, Wed -> 6, ..., Sun -> 2.
    days_to_cycle_end = (1 - df['t_dat'].dt.dayofweek) % 7
    cycle_end = df['t_dat'] + pd.to_timedelta(days_to_cycle_end, unit='D')

    # Use the cycle that contains the newest transaction as the target week.
    # Comparing cycle ends against the raw max date only matches when that
    # date happens to be a Tuesday; otherwise the join below yields nothing.
    target_cycle = cycle_end.max()

    # assign() returns a new frame, so the caller's frame keeps its columns.
    df = df.assign(last_day_sale_cycle=cycle_end)

    weekly_sales = (
        df.groupby(['last_day_sale_cycle', 'article_id'], as_index=False)['t_dat']
        .count()
        .rename(columns={'t_dat': 'count'})
    )

    df = df.merge(weekly_sales, on=['last_day_sale_cycle', 'article_id'], how='left')

    target_sales = weekly_sales.loc[
        weekly_sales['last_day_sale_cycle'] == target_cycle,
        ['article_id', 'count'],
    ]
    # Inner join: only articles sold in the target cycle survive, so
    # 'count_targ' is never missing and needs no fill value.
    df = df.merge(target_sales, on='article_id', how='inner', suffixes=("", "_targ"))

    df['growth_factor'] = df['count_targ'] / df['count']
    return df

In [8]:
def get_general_prediction_using_growth_factor(df, n=None):
    """Return the articles with the largest summed growth factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``article_id`` and ``growth_factor`` columns.
    n : int, optional
        Number of articles to return. Defaults to the notebook-level ``N``.

    Returns
    -------
    list
        Up to ``n`` article ids, ordered by descending total growth factor.
    """
    if n is None:
        n = N
    # Group directly on the two needed columns; dropping 'customer_id' first
    # would copy the whole frame without changing the result.
    total_growth = df.groupby('article_id')['growth_factor'].sum()
    return total_growth.nlargest(n).index.tolist()

In [9]:
def week_decay(df, top_n=12):
    """Score each (customer, article) pair with a time-decayed growth factor.

    Every transaction contributes ``growth_factor * decay_factor``, where the
    decay depends on the number of days between the transaction and the newest
    transaction in ``df`` (at least 1 day). Contributions are summed per
    customer and article, and the best-scoring articles are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``t_dat`` (datetime), ``customer_id``, ``article_id`` and
        ``growth_factor`` columns. The frame is not modified.
    top_n : int, default 12
        Highest dense rank to keep per customer. Ties share a rank, so a
        customer can end up with more than ``top_n`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``article_id``, ``value`` and ``rank`` for
        pairs with a strictly positive value and ``rank <= top_n``.
    """
    last_ts = df['t_dat'].max()
    # Same-day purchases count as one day old so the square root below never
    # divides by zero.
    diff_days = (last_ts - df['t_dat']).dt.days.clip(lower=1)

    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    decay_factor = a / np.sqrt(diff_days) + b * np.exp(-c * diff_days) - d
    # The curve dips below zero for old purchases; those contribute nothing.
    decay_factor = decay_factor.clip(lower=0)

    # Work on a fresh frame so no helper columns leak into the caller's data.
    scored = df[['customer_id', 'article_id']].assign(
        value=df['growth_factor'] * decay_factor
    )
    scored = scored.groupby(['customer_id', 'article_id'], as_index=False).agg({'value': 'sum'})

    # Explicit copy: 'rank' is added to the filtered rows, not to a view.
    scored = scored.loc[scored['value'] > 0].copy()
    scored['rank'] = scored.groupby('customer_id')['value'].rank(method='dense', ascending=False)
    return scored.loc[scored['rank'] <= top_n]

In [10]:
def _complete_prediction(personal, fallback, n):
    """Join up to ``n`` article ids: personal picks first, then unseen fallback ones."""
    # Customers without any personal ranking arrive as NaN from the left merge.
    personal = personal if isinstance(personal, list) else []
    already_picked = set(personal)
    padded = personal + [article for article in fallback if article not in already_picked]
    # Slice the combined list: dense-rank ties can give more than n personal
    # items, and a negative pad length would otherwise add articles.
    return ' '.join(padded[:n])


# Build one partial submission file per age bucket.
for _bin in bins:
    # The NaN bucket holds the customers that pd.cut could not place.
    if pd.isna(_bin):
        in_bin = customers['age_bins'].isnull()
    else:
        in_bin = customers['age_bins'] == _bin
    bin_customers = customers.loc[in_bin].drop(['age_bins'], axis=1)

    # merge() already returns a new frame, so `transactions` needs no copy.
    df = transactions.merge(bin_customers[['customer_id']], on='customer_id', how='inner')

    df = calculate_growth_factor(df)

    # Bucket-wide popular articles, used to pad short personal lists.
    general_pred = get_general_prediction_using_growth_factor(df)

    df = week_decay(df)

    # Best-valued articles first, collected into one list per customer.
    prediction_df = df.sort_values(['customer_id', 'value'], ascending=False).reset_index(drop=True)
    prediction_df = prediction_df.groupby('customer_id').agg({'article_id': list}).reset_index()
    prediction_df = prediction_df.rename(columns={'article_id': 'prediction'})

    # Select only 'customer_id' so the cell still works when `submission`
    # already carries a 'prediction' column from an earlier run.
    partial_submission = (
        submission[['customer_id']]
        .merge(bin_customers[['customer_id']], on='customer_id', how='inner')
        .merge(prediction_df, on='customer_id', how='left')
    )

    partial_submission['prediction'] = [
        _complete_prediction(personal, general_pred, N)
        for personal in partial_submission['prediction']
    ]
    partial_submission.to_csv('./misc/submission_' + str(_bin) + '.csv', index=False)

In [11]:
# Re-assemble the per-bucket files into the final submission.
partial_frames = []
for _bin in bins:
    # dtype=str keeps every identifier exactly as written; without it pandas
    # infers column types and may turn an all-numeric column into integers.
    partial_submission = pd.read_csv('./misc/submission_' + str(_bin) + '.csv', dtype=str)
    partial_frames.append(partial_submission)

# Concatenate once instead of growing the frame inside the loop;
# ignore_index avoids repeated row labels across the partial frames.
submission = pd.concat(partial_frames, axis=0, ignore_index=True)

submission.to_csv('./submission/age_bins.csv', index=False)