# **<a id="Content">HnM RecSys Notebook 9417</a>**

## **<a id="Content">Table of Contents</a>**
* [**<span>1. Imports</span>**](#Imports)  
* [**<span>2. Helper Functions/Decorators</span>**](#Helper-Functions)
* [**<span>5. LightGBM Model</span>**](#LightGBM-Model) 

## Imports

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
import re
import warnings
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve

In [71]:
# LightGBM imports

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [72]:
import pickle

# open user_item_matrix_200
with open('user_item_matrix_200.pkl', 'rb') as f:
    user_item_matrix = pickle.load(f)

# open customer and articels incides map
with open('lightgbm/customer_id_indices_map.pkl', 'rb') as f:
    customer_id_indices_map = pickle.load(f)

with open('lightgbm/article_id_indices_map.pkl', 'rb') as f:
    article_id_indices_map = pickle.load(f)

# load df from pickle file for time-based split
with open('lightgbm/df.pkl', 'rb') as f:
    df = pickle.load(f)

# load final_df from pickle file for clean processing
with open('lightgbm/final_df_with_binary_targets.pkl', 'rb') as f:
    final_df = pickle.load(f)

In [73]:
final_df.head()

Unnamed: 0,price,sales_channel_1,sales_channel_2,quantity,article_engagement_ratio,user_index,item_index,FN,Active,club_member_status,...,garment_group_no_1019.0,garment_group_no_1020.0,garment_group_no_1021.0,garment_group_no_1023.0,garment_group_no_1025.0,index_group_no_1.0,index_group_no_2.0,index_group_no_3.0,index_group_no_4.0,index_group_no_26.0
0,0.042358,False,True,1.0,1.0,5,11563,1.0,1.0,2.0,...,False,False,False,False,False,True,False,False,False,False
1,0.050842,False,True,1.0,1.0,5,9899,1.0,1.0,2.0,...,False,False,False,False,False,True,False,False,False,False
2,0.06781,False,True,1.0,1.0,5,14438,1.0,1.0,2.0,...,False,False,False,False,False,True,False,False,False,False
3,0.016937,False,True,1.0,0.5,10,10307,0.0,0.0,2.0,...,False,False,False,False,False,False,True,False,False,False
4,0.016937,False,True,1.0,0.166667,10,13608,0.0,0.0,2.0,...,False,False,False,True,False,True,False,False,False,False


In [74]:
# only get top 50 customers by number of total pruchase quantity from final_df

# Compute the total quantity for each user_index
user_quantity = final_df.groupby('user_index')['quantity'].sum()

# Get the top 50 user_indices by total quantity
top_50_users = user_quantity.nlargest(50).index

# Filter the final_df to include only the data for the top 50 users
final_df_top_50 = final_df[final_df['user_index'].isin(top_50_users)].copy()
# print the shape of final_df_top_50
print(final_df_top_50.shape)

print(final_df_top_50['user_index'].nunique())


(1952211, 56)
50


In [75]:
def time_based_train_test_split(final_df, test_size=0.2):

    # Convert days, months, and years columns to datetime object
    final_df['date'] = pd.to_datetime(final_df[['day', 'month', 'year']])

    # Sort dataframe by date in ascending order
    final_df = final_df.sort_values(by='date')

    # Calculate cutoff index
    cutoff_index = int(len(final_df) * (1-test_size))

    # Create train and test dataframes
    train_df = final_df[:cutoff_index]
    test_df = final_df[cutoff_index:]

    # Drop date column from train and test dataframes
    train_df = train_df.drop('date', axis=1)
    test_df = test_df.drop('date', axis=1)

    # split train_df into X_train and y_train
    X_train = train_df.drop('target', axis=1)
    y_train = train_df['target']

    # split test_df into X_test and y_test
    X_test = test_df.drop('target', axis=1)
    y_test = test_df['target']

    return X_train, X_test, y_train, y_test

In [76]:
# function to insert random dates for each user in the df where there is no purchase (target = 0)
# random dates are between user's first purchase date and last purchase date
# if user has only one purchase, then the random date is the same as the purchase date
# # if the user has no purchase, random date range is between df min and max dates

def insert_random_dates(df):
    # Group the DataFrame by user ID
    grouped = final_df.groupby('user_index')

    default_min_date = df['date'].min()
    default_max_date = df['date'].max()
    
    print("df min date: ", default_min_date)
    print("df max date: ", default_max_date)
    
    for user_id, group_df in grouped:
    

        # Find the missing dates for this user
        missing_dates = group_df.loc[group_df['target'] == 0, 'date']

        # If there are no missing dates, continue to the next user
        if len(missing_dates) == 0:
            continue

        # Find the minimum and maximum purchase dates for this user
        min_date = group_df['date'].min()
        max_date = group_df['date'].max()
    
        # If the user has no purchase dates, use the default minimum and maximum dates
        if pd.isna(min_date):
            min_date = default_min_date
        if pd.isna(max_date):
            max_date = default_max_date

        # Generate a list of random dates between the minimum and maximum purchase dates
        random_dates = pd.date_range(start=min_date, end=max_date, freq='D').strftime('%Y-%m-%d')
        # Only sample from random_dates if it has enough values
        if len(random_dates) >= len(missing_dates):
            random_dates = np.random.choice(random_dates, size=len(missing_dates), replace=False)
        else:
            random_dates = np.random.choice(pd.date_range(start=min_date, end=max_date, freq='D'), size=len(missing_dates), replace=True)

        # Replace the missing dates with the random dates
        group_df.loc[group_df['target'] == 0, 'date'] = random_dates
    
    return df


In [77]:
final_df['date'] = pd.to_datetime(final_df[['day', 'month', 'year']])

final_df = insert_random_dates(final_df)
final_df['date'].isnull().sum()

df min date:  2018-09-20 00:00:00
df max date:  2020-09-22 00:00:00


7676429

In [66]:
final_df.head()

# check how many null values in date column
final_df['date'].isnull().sum()

7676429

In [38]:
# 80/20 time-based split to curb data leakage
X_train, X_test, y_train, y_test = time_based_train_test_split(final_df_top_50, test_size=0.2)
# final_df_top_50 = final_df_top_50.drop('date', axis=1)
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(final_df_top_50.drop(['target'], axis=1), final_df_top_50['target'], test_size=0.2, random_state=42)

Memory usage of dataframe is 1393.06 MB
Memory usage after optimization is: 726.30 MB
Memory usage decreased by 47.9%
Memory usage of dataframe is 348.27 MB
Memory usage after optimization is: 401.84 MB
Memory usage decreased by -15.4%
(6242440, 55)
(1560611, 55)
(6242440,)
(1560611,)
