In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [2]:
articles = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
transactions = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [3]:
transactions = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', dtype={'article_id':str})
transactions.drop(['sales_channel_id', 'price'], inplace=True, axis=1)
transactions['bought'] = 1
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])

In [4]:
transactions["t_dat"].describe()

count                         31788324
mean     2019-09-15 17:27:46.894452992
min                2018-09-20 00:00:00
25%                2019-03-28 00:00:00
50%                2019-08-25 00:00:00
75%                2020-03-29 00:00:00
max                2020-09-22 00:00:00
Name: t_dat, dtype: object

Filter older transactions and articles which are not sold over n times

In [5]:
# We only consider transactions after 1st of September 2020
start_date = datetime.datetime(2020,9,1)
transactions = transactions.loc[transactions["t_dat"] >= start_date]

# Filter transactions by number of times an article has been bought
article_bought_count = transactions[['article_id', 't_dat']].groupby('article_id').count().reset_index().rename(columns={'t_dat': 'count'})
most_bought_articles = article_bought_count[article_bought_count['count']>10]['article_id'].values
transactions = transactions[transactions['article_id'].isin(most_bought_articles)]

In [6]:
transactions["t_dat"].describe()

count                           742995
mean     2020-09-11 05:20:27.094664192
min                2020-09-01 00:00:00
25%                2020-09-06 00:00:00
50%                2020-09-11 00:00:00
75%                2020-09-17 00:00:00
max                2020-09-22 00:00:00
Name: t_dat, dtype: object

In [7]:
# Generate negative samples
np.random.seed(42)

negative_transactions = pd.DataFrame({
    'article_id': np.random.choice(transactions.article_id.unique(), transactions.shape[0]),
    'customer_id': np.random.choice(transactions.customer_id.unique(), transactions.shape[0]),
    'bought': np.zeros(transactions.shape[0])
})
positive_transactions = transactions[transactions['bought']==1]

Variables

In [8]:

num_components = 1000

learning_rate = 0.001

lmbda = 0.1

n_epochs = 20

transactions = pd.concat([transactions, negative_transactions])
customers = transactions.customer_id.values
articles = transactions.article_id.values
bought = transactions.bought.values


customer_id2index = {c: i for i, c in enumerate(np.unique(customers))}
article_id2index = {a: i for i, a in enumerate(np.unique(articles))}

training_indices = None
customers_latent_matrix = None
articles_latent_matrix = None

n_samples = transactions.shape[0]

customers_latent_matrix = np.random.normal(scale=1., size=(len(np.unique(customers)), num_components))
articles_latent_matrix = np.random.normal(scale=1., size=(len(np.unique(articles)), num_components))

In [9]:
for epoch in range(n_epochs):
    print('Epoch: {}'.format(epoch))
    training_indices = np.arange(n_samples)

    np.random.shuffle(training_indices)
    
    for idx in training_indices:
        customer_id = customers[idx]
        article_id = articles[idx]
        bought_val = bought[idx]

        customer_index = customer_id2index[customer_id]
        article_index = article_id2index[article_id]

        prediction = np.dot(customers_latent_matrix[customer_index], articles_latent_matrix[article_index])
        prediction = np.clip(prediction, 0, 1)
        error = (bought_val - prediction) # error

        # Update latent factors in terms of the learning rate and the observed error
        # c = c + alpha x (e x a - lamda x c)
        customers_latent_matrix[customer_index] += learning_rate * \
                                (error * articles_latent_matrix[article_index] - \
                                 lmbda * customers_latent_matrix[customer_index])
        # a = a + alpha x (e x c - lamda x a)
        articles_latent_matrix[article_index] += learning_rate * \
                                (error * customers_latent_matrix[customer_index] - \
                                 lmbda * articles_latent_matrix[article_index])

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19


In [10]:
np.save('customers_latent_matrix.npy', np.empty(5))

with open('articles_latent_matrix.pkl', 'wb') as file:
    pickle.dump(articles_latent_matrix, file)

In [11]:
articles_latent_matrix = pickle.load(open('/kaggle/working/articles_latent_matrix.pkl', 'rb'))

In [12]:
customers_sub = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv').customer_id.unique()

recommendations = []

def get_default_remommendations(positive_transactions):
    # Get default recommendation (time decay popularity)
    # Calculate time decaying popularity
    # Calculate time decaying popularity. This leads to items bought more recently having more weight in the popularity list.
    positive_transactions['pop_factor'] = positive_transactions['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days)
    transactions_by_article = positive_transactions[['article_id', 'pop_factor']].groupby('article_id').sum().reset_index()

    default_recommendation = transactions_by_article.sort_values(by='pop_factor', ascending=False)['article_id'].values[:12]

    return default_recommendation

# Compute similarity matrix (cosine)
# ouputs a matrix of size: (self.articles_latent_matrix, self.articles_latent_matrix)
# (i, j) th item represents similarity score of ith and jjth item. 1 is good. 0 is bad. 
similarity_matrix = cosine_similarity(articles_latent_matrix, articles_latent_matrix, dense_output=False)

# Convert similarity matrix into a matrix containing the 12 most similar items' index for each item
# out size : (len(articles), 12)
# the has closesst 12 atrcles for each article
similarity_matrix = np.argsort(similarity_matrix, axis=1)
similarity_matrix = similarity_matrix[:, -12:]

# Group articles by user and articles to compute the number of times each article has been bought by each user
# transactions_by_customer -> all customer, atricle combinations and their counts avialable in positive transactions
transactions_by_customer = positive_transactions[['customer_id', 'article_id', 'bought']].groupby(['customer_id', 'article_id']).count().reset_index()
# most bought article for each customer
most_bought_article = transactions_by_customer.loc[transactions_by_customer.groupby('customer_id').bought.idxmax()]['article_id'].values

default_recommendation = get_default_remommendations(positive_transactions)

default_recommendation_count = 0
model_recommendation_count = 0

# Make predictions
for customer in customers_sub:
    try:
        rec_1 = []
        rec_2 = []
        aux = []

        
        # 1.0
        # Return the half of the default recommendation
        rec_2 =  default_recommendation
        # 2.0
        # Retrieve the most bought article by customer
        user_most_bought_article_id = most_bought_article[customer_id2index[customer]]
        # Using the similarity matrix, get the 12 most similar articles
        rec_1 = articles[similarity_matrix[article_id2index[user_most_bought_article_id]]]
        

        # Merge half of both recommendation lists
        for rec_idx in range(6):
            aux.append(rec_2[rec_idx])
            aux.append(rec_1[rec_idx])

        recommendations.append(' '.join(aux))
        model_recommendation_count += 1
    except Exception as e:
        # If anything goes wrong Return the default recommendations
        # print(f'Error: {e}')
        # print(f'Recommending only the default recommendations')
        recommendations.append(' '.join(default_recommendation))
        default_recommendation_count += 1

prediction_df =  pd.DataFrame({'customer_id': customers_sub, 'prediction': recommendations})

In [13]:
prediction_df.to_csv('submission.csv', index=False)

In [14]:
prediction_df

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243001 0781758057 0751471001 0309864012 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0751471001 0918522001 0924243002 04...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0924243001 0914319002 0751471001 0894668003 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0751471001 0918522001 0924243002 04...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0751471001 0918522001 0924243002 04...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0924243001 0896851001 0751471001 0372860002 09...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0924243001 0751471001 0918522001 0924243002 04...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0924243001 0751664001 0751471001 0902023002 09...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0751471001 0918522001 0924243002 04...
