In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd 
import time
import gc
import numpy as np
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from skimage import io
from sklearn.decomposition import PCA



# Evaluation Method

In [26]:
# Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=10):
    """
    Average precision at k for a single pair of lists.

    Only the first ``k`` entries of ``predicted`` are scored. A prediction
    counts as a hit when it appears in ``actual`` and has not already been
    seen earlier in ``predicted`` (repeated predictions are not rewarded
    twice).

    Parameters
    ----------
    actual : list
             The relevant elements (order does not matter)
    predicted : list
                The ranked predictions (order matters)
    k : int, optional
        Maximum number of predictions taken into account

    Returns
    -------
    score : double
            The average precision at k; 0.0 when ``actual`` is empty
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    hits = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(predicted, start=1):
        already_seen = candidate in predicted[:rank - 1]
        if candidate in actual and not already_seen:
            hits += 1.0
            precision_sum += hits / rank

    # The emptiness check intentionally follows the loop, as in the reference
    # implementation this was taken from.
    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.

    The average precision at k is computed for every (actual, predicted)
    pair and the results are averaged. Pairs whose ``actual`` entry is empty
    are skipped, so they do not pull the mean down.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists, or 0.0
            when there is no pair with a non-empty ``actual``
    """
    scores = [apk(a, p, k) for a, p in zip(actual, predicted) if a]
    # np.mean([]) would emit a RuntimeWarning and return NaN; report 0.0
    # instead, consistent with apk() returning 0.0 for an empty `actual`.
    if not scores:
        return 0.0
    return np.mean(scores)


# UUCF

**Data Processing**

In [16]:
# Root folder of the competition data; every CSV path below is derived from it.
base_path = '../input/h-and-m-personalized-fashion-recommendations/'
csv_train = base_path + 'transactions_train.csv'
csv_users = base_path + 'customers.csv'
csv_items = base_path + 'articles.csv'

In [17]:
# Load the three competition tables.
# article_id is read as a string in both transactions and articles so the two
# columns share a dtype (presumably also to keep leading zeros intact — TODO confirm).
# t_dat is parsed to datetime so it can be compared against the split date later.
transactions = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])
users = pd.read_csv(csv_users)
items = pd.read_csv(csv_items, dtype={'article_id': str})

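1. **Map each `customer_id` in `users` to an integer `user_id` (and back), then attach `user_id` to `transactions`**
2. **Map each `article_id` in `items` to an integer `item_id` (and back), then attach `item_id` to `transactions`**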

In [20]:
# Distinct raw ids; the position of an id in its list becomes its integer id.
user_list = users['customer_id'].unique().tolist()
item_list = items['article_id'].unique().tolist()

# Integer user_id <-> raw customer_id
user_to_customer_map = dict(enumerate(user_list))
customer_to_user_map = {raw_id: int_id for int_id, raw_id in user_to_customer_map.items()}

# Integer item_id <-> raw article_id
item_to_article_map = dict(enumerate(item_list))
article_to_item_map = {raw_id: int_id for int_id, raw_id in item_to_article_map.items()}

In [21]:
# Attach the integer ids to every transaction row.
# Series.map with a dict yields NaN for any id that is missing from the lookup.
transactions['user_id'] = transactions['customer_id'].map(customer_to_user_map)
transactions['item_id'] = transactions['article_id'].map(article_to_item_map)

In [22]:
# Temporal split: rows strictly before val_start_date are used for training,
# rows on or after it form the validation set.
val_start_date = '2020-09-16'
train_df = transactions.query(f"t_dat < '{val_start_date}'").reset_index(drop=True)
valid_df = transactions.query(f"t_dat >= '{val_start_date}'").reset_index(drop=True)

# Most recent purchases first, per customer.
train_df = train_df.sort_values(["customer_id", "t_dat"], ascending=False)

# Ground truth: one row per customer with the list of articles bought in the
# validation window (chronological). valid_df is sorted only once here; the
# earlier descending sort was immediately overwritten by this ascending one.
valid_cust = (
    valid_df
    .sort_values(['customer_id', 't_dat'], ascending=[True, True])
    .groupby('customer_id')['article_id']
    .apply(list)
    .rename('valid_true')
    .reset_index()
)

# The raw validation rows are no longer needed; release the memory.
del valid_df
_ = gc.collect()

**Helper Method**

In [23]:
# When True, uucf() keeps only the first TEST_SIZE eligible users (toy execution).
TEST_RUN = True
# Number of users kept when TEST_RUN is enabled.
TEST_SIZE = 10000

def flatten(l):
    """Concatenate the elements of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def compare_user(user_1, user_2):
    """Similarity between two users' item collections.

    Computed as ``|set(user_1) & set(user_2)| / sqrt(len(user_1) * len(user_2))``.

    Parameters
    ----------
    user_1, user_2 : sequence
        Items bought by each user.

    Returns
    -------
    float
        The similarity score; 0.0 when either collection is empty (the
        formula would otherwise divide by zero and yield NaN).
    """
    if len(user_1) == 0 or len(user_2) == 0:
        return 0.0
    overlap = len(set(user_1) & set(user_2))
    return overlap / np.sqrt(len(user_1) * len(user_2))

def recommend_item(user_id, user_vec, transaction_history, n_neighbors=30, n_recs=12):
    """Recommend items for one user from the purchases of their most similar users.

    Every user in ``transaction_history`` is scored against ``user_vec`` with
    ``compare_user``. The ``n_neighbors`` best-scoring other users vote for
    the items they bought, each vote weighted by that user's similarity.
    Items the target user already owns are excluded.

    Parameters
    ----------
    user_id : hashable
        Index label of the target user; this user is never its own neighbour.
    user_vec : list
        Items already bought by the target user.
    transaction_history : pandas.Series
        Indexed by user id; each value is the list of items that user bought.
    n_neighbors : int, optional
        Number of similar users that contribute votes (default 30, which is
        what the former ``head(31)`` minus the user itself amounted to).
    n_recs : int, optional
        Maximum number of items returned (default 12).

    Returns
    -------
    list
        Up to ``n_recs`` item ids, highest accumulated score first.
    """
    similarities = transaction_history.apply(lambda other_vec: compare_user(user_vec, other_vec))

    # Rank first, then drop the target user, so exactly n_neighbors *other*
    # users are kept whether or not the target appears in the history.
    ranked = similarities.sort_values(ascending=False)
    neighbours = ranked[ranked.index != user_id].head(n_neighbors)

    # One (item, weight) pair per item of every neighbour.
    weighted_items = [
        (item, score)
        for neighbour, score in neighbours.items()
        for item in transaction_history.loc[neighbour]
    ]

    recs = (
        pd.DataFrame(weighted_items, columns=['item', 'score'])
        .groupby('item')['score']
        .sum()
        .sort_values(ascending=False)
    )
    # Never recommend something the user has already bought.
    recs = recs[~recs.index.isin(user_vec)]
    return recs.head(n_recs).index.tolist()

def recommend_list_user(user_ids, transaction_history):
    """Compute recommendations for a batch of users.

    Parameters
    ----------
    user_ids : sequence
        Index labels (user ids) to look up in ``transaction_history``.
        May be empty, in which case an empty Series is returned.
    transaction_history : pandas.Series
        Indexed by user id; each value is the list of items that user bought.

    Returns
    -------
    pandas.Series
        Named ``item_id`` and indexed by ``user_id``; each value is the list
        returned by ``recommend_item`` for that user.
    """
    user_vectors = transaction_history.loc[user_ids]
    # A plain comprehension (rather than DataFrame.apply(axis=1)) keeps the
    # result a Series of lists even for an empty batch.
    recommendations = [
        recommend_item(user_id, user_vec, transaction_history)
        for user_id, user_vec in user_vectors.items()
    ]
    return pd.Series(
        recommendations,
        index=user_vectors.index.rename('user_id'),
        name='item_id',
        dtype=object,
    )

def get_recommendations(users, transaction_history):
    """Compute recommendations for ``users`` in parallel, one batch per worker.

    Parameters
    ----------
    users : sequence
        User ids to produce recommendations for.
    transaction_history : pandas.Series
        Indexed by user id; each value is the list of items that user bought.

    Returns
    -------
    pandas.DataFrame
        A single ``item_id`` column holding the recommended item lists,
        indexed by user id. Empty when ``users`` is empty.
    """
    time_start = time.time()

    if len(users) == 0:
        # pd.concat([]) would raise; return an empty frame of the same layout.
        recommendation = pd.DataFrame({'item_id': pd.Series(dtype=object)})
    else:
        # Never start more workers than there are users, so no worker is
        # handed an empty chunk.
        n_workers = min(mp.cpu_count(), len(users))

        # Split into approximately evenly sized chunks.
        # We will send just one batch to each worker.
        user_chunks = np.array_split(users, n_workers)

        f = partial(recommend_list_user, transaction_history=transaction_history)
        with Pool(n_workers) as p:
            res = p.map(f, user_chunks)

        recommendation = pd.DataFrame(pd.concat(res))

    elapsed = (time.time() - time_start) / 60
    print(f"Finished get_recommendations({len(users)}). It took {elapsed:5.2f} mins")
    return recommendation

def uucf(transactions, start_date):
    """Run user-user collaborative filtering on a transaction table.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain the ``user_id`` and ``item_id`` columns.
    start_date : str
        Only echoed in the log line; no date filtering happens in this
        function, so the caller is expected to pass the rows it wants used.

    Returns
    -------
    pandas.DataFrame
        Indexed by user id with the columns ``item_id`` (recommended item
        ids), ``customer_id`` and ``prediction`` (recommended article ids).
    """
    # The former message claimed the data had been cut at start_date, which
    # this function never did.
    print(f"uucf(start_date={start_date}): no date filtering applied here. Total rows: {len(transactions)}")

    # One de-duplicated item list per user; users with fewer than 3 distinct
    # items are dropped.
    transaction_history = transactions.groupby("user_id")['item_id'].apply(lambda items: list(set(items)))
    transaction_history = transaction_history[transaction_history.str.len() >= 3]
    if TEST_RUN:
        print("WARNING: TEST_RUN is True. It will be a toy execution.")
        transaction_history = transaction_history.head(TEST_SIZE)

    users = transaction_history.index.tolist()
    n_users = len(users)
    print(f"Total users in the time frame with at least 3 purchases: {n_users}")

    recommendation = get_recommendations(users, transaction_history)
    # Translate the integer ids back to the raw competition ids.
    recommendation['customer_id'] = recommendation.index.map(user_to_customer_map)
    recommendation['prediction'] = recommendation['item_id'].map(
        lambda item_ids: [item_to_article_map[i] for i in item_ids]
    )
    # A bare `recommendation.reset_index(drop=True)[[...]]` expression used to
    # sit here; its result was discarded, so it is removed and the returned
    # frame keeps the same layout as before.
    return recommendation

def _uu_show_articles(article_ids, title):
    """Draw the product image of every article id side by side in one figure."""
    path = "../input/h-and-m-personalized-fashion-recommendations/images"
    fig = plt.figure(figsize=(20, 6))
    plt.title(title)
    plt.axis("off")

    for position, item in enumerate(article_ids, start=1):
        # Images are stored in a sub-folder named after the first three
        # characters of the article id.
        image = plt.imread(f"{path}/{item[:3]}/{item}.jpg")
        fig.add_subplot(1, len(article_ids), position)
        plt.xticks(())
        plt.yticks(())
        plt.imshow(image)


def uu_plot_prev(index):
    """Plot the distinct articles in ``valid_true`` for row ``index`` of ``eval_uucf``."""
    # dict.fromkeys de-duplicates while keeping first-seen order, so the plot
    # is deterministic (set() ordering is arbitrary).
    prev_items = list(dict.fromkeys(eval_uucf.iloc[index]["valid_true"]))
    _uu_show_articles(prev_items, "Purchased items")


def uu_plot_rcmd(index):
    """Plot the distinct articles in ``prediction`` for row ``index`` of ``eval_uucf``."""
    # Keep the ranking order of the recommendations; set() used to scramble it.
    rcmd_items = list(dict.fromkeys(eval_uucf.iloc[index]["prediction"]))
    _uu_show_articles(rcmd_items, "Recommended items")
    
    
def uu_plot(index):
    """Show two figures for row ``index``: first via uu_plot_prev, then via uu_plot_rcmd."""
    uu_plot_prev(index)
    uu_plot_rcmd(index)

**Train Model**

In [24]:
recommendation = uucf(train_df,val_start_date)

# Pair every validation customer with its prediction; customers for whom no
# recommendation was produced end up with NaN after the left merge and are dropped.
eval_uucf = (
    valid_cust
    .merge(recommendation, on='customer_id', how='left')
    .dropna(axis=0)
)


**Evaluation**

In [27]:
# MAP@12 of the UUCF predictions against the validation-window purchases.
mapk(
    eval_uucf['valid_true'], 
    eval_uucf['prediction'],
    k=12
)

**Visualization**

In [28]:
# Purchased vs. recommended articles for the row at position 4 of eval_uucf.
uu_plot(4)

# Content-based Filtering with PCA

**Data Processing for CBF**

In [7]:
# Only the first rows of the transaction log are used for the CBF experiment.
CBF_SAMPLE_ROWS = 100000

# nrows reads the same leading rows as taking the first chunk of a chunked
# reader, without leaving an open reader object behind.
p_users = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', nrows=CBF_SAMPLE_ROWS)
articles = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv')
p_df = p_users.merge(articles, on='article_id')

# Categorical article attributes that describe an item.
features = ['product_group_name', 
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name',
       'perceived_colour_master_name',
       'department_name', 'index_name',
       'index_group_name', 'section_name',
       'garment_group_name']

# customer_id and article_id must stay the first two columns: a later cell
# takes one_hot_df.columns[2:] as the feature columns.
df1 = p_df[['customer_id', 'article_id'] + features]

one_hot_df = pd.get_dummies(df1, columns=features) # one hot encoding

# Storing item and user information in one_hot df
MIN_PURCHASE = 2
groupby_customer = one_hot_df.groupby('customer_id')

# l stores, for each kept customer, the sum of the one-hot vectors of all items bought
l = []
cutomer_ids = []
article_ids = []
for key, temp in groupby_customer:
    # Keep only customers with at least MIN_PURCHASE distinct articles.
    if temp.article_id.nunique() >= MIN_PURCHASE:
        l.append(temp.drop('article_id', axis=1).sum(numeric_only=True).values)
        cutomer_ids.append(key)
        article_ids.extend(temp.article_id.values.tolist())

In [8]:
# One row per kept customer, one column per one-hot feature.
user_feature = pd.DataFrame(l, columns = one_hot_df.columns[2:])

# Scale every row so that its entries sum to 1.
normalized_user_feature = user_feature.div(user_feature.sum(axis=1), axis=0)
normalized_user_feature.insert(0, 'customer_id', cutomer_ids)
normalized_user_feature = normalized_user_feature.set_index('customer_id')

# random_state pins the result when scikit-learn selects a randomized SVD
# solver, so a re-run reproduces the same components.
pca = PCA(n_components=100, random_state=42)
pca.fit(normalized_user_feature)

# Share of the total variance captured by the retained components.
pca.explained_variance_ratio_.sum()

In [9]:
# drop duplicates and items boughts already
item_feature = one_hot_df.drop_duplicates(subset='article_id')
item_feature = item_feature[item_feature.article_id.isin(article_ids)].drop('customer_id', axis=1)
item_feature = item_feature.set_index('article_id')
user_feature_pca = pd.DataFrame(pca.transform(normalized_user_feature), columns=['component_{}'.format(i) for i in range(1, 101)]).set_index(normalized_user_feature.index)
item_feature_pca = pd.DataFrame(pca.transform(item_feature), columns=['component_{}'.format(i) for i in range(1, 101)]).set_index(item_feature.index)
scores_pca = user_feature_pca.dot(item_feature_pca.T)

**Helper method**

In [31]:
def get_rcmnd(customer_id, scores):
    """Rank the articles a customer has not bought yet by their score.

    Parameters
    ----------
    customer_id : hashable
        Row label in ``scores`` and group key in ``groupby_customer``.
    scores : pandas.DataFrame
        Score matrix with customers as rows and articles as columns.

    Returns
    -------
    tuple
        ``(ordered, customer_prev_items)``: the customer's scores for the
        not-yet-bought articles sorted from highest to lowest, and the
        ``article_id`` values of the customer's previous purchases.
    """
    score_row = scores.loc[customer_id]
    customer_prev_items = groupby_customer.get_group(customer_id)['article_id']
    # Remove what the customer already owns, then rank what is left.
    remaining = score_row.drop(customer_prev_items.values)
    return remaining.sort_values(ascending=False), customer_prev_items


def _cbf_show_articles(article_ids, title):
    """Draw the product image of every article id side by side in one figure."""
    path = "../input/h-and-m-personalized-fashion-recommendations/images"
    article_ids = list(article_ids)
    fig = plt.figure(figsize=(20, 5))
    plt.title(title)
    plt.axis("off")
    for position, item in enumerate(article_ids, start=1):
        # Ids are handled as integers in the CBF section, so the leading zero
        # of the file name is lost. zfill(10) restores it; for a 9-digit id
        # this equals the former '0' + str(item).
        # NOTE(review): assumes file names are 10 characters long — confirm.
        item = str(item).zfill(10)
        image = plt.imread(f"{path}/{item[:3]}/{item}.jpg")
        fig.add_subplot(1, len(article_ids), position)
        plt.xticks(())
        plt.yticks(())
        plt.imshow(image)


def cbf_plot_prev(prev):
    """Plot the images of the previously purchased articles in ``prev``."""
    _cbf_show_articles(prev, "Purchased items")


def cbf_plot_rcmnd(rcmnds):
    """Plot the images of the recommended articles in ``rcmnds``."""
    _cbf_show_articles(rcmnds, "Recommended items")
                        
def cbf_plot(index):
    """Plot purchased and recommended articles for the customer at row ``index`` of ``scores_pca``."""
    # `scores` is not defined anywhere in the notebook; the customer must be
    # looked up in the same matrix that is used for ranking.
    customer_id = scores_pca.index[index]
    rcmnds_pca, prev_items = get_rcmnd(customer_id, scores_pca)
    # Keep the 12 best-scoring articles.
    rcmnds_pca = rcmnds_pca.index.values[:12]
    cbf_plot_prev(prev_items)
    cbf_plot_rcmnd(rcmnds_pca)

**Visualization**

In [33]:
# Purchased vs. recommended articles for the customer at position 1 of the score matrix.
cbf_plot(1)

**Evaluation**

In [None]:
# eval_cbf was never defined, so this cell could not run. Build it the same
# way eval_uucf is built: one row per validation customer with the true
# purchases and the top-12 CBF predictions.
cbf_customers = scores_pca.index.intersection(valid_cust['customer_id'])
cbf_predictions = pd.DataFrame({
    'customer_id': cbf_customers,
    'prediction': [
        # valid_true holds article ids as strings, while the CBF section
        # handles them as integers; zfill(10) restores the leading zero.
        # NOTE(review): assumes article ids are 10 characters long — confirm.
        [str(article_id).zfill(10) for article_id in get_rcmnd(customer_id, scores_pca)[0].index[:12]]
        for customer_id in cbf_customers
    ],
})
eval_cbf = valid_cust.merge(cbf_predictions, on='customer_id', how='inner')

# MAP@12 of the CBF predictions against the validation-window purchases.
mapk(
    eval_cbf['valid_true'], 
    eval_cbf['prediction'],
    k=12
)