In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

## Data Description

- images/ - a folder of images corresponding to each article_id; images are placed in subfolders starting with the first three digits of the article_id; note, not all article_id values have a corresponding image.
- articles.csv - detailed metadata for each article_id available for purchase
- customers.csv - metadata for each customer_id in dataset
- transactions_train.csv - the training data, consisting of the purchases made by each customer on each date, as well as additional information. Duplicate rows correspond to multiple purchases of the same item. Your task is to predict the article_ids each customer will purchase during the 7-day period immediately after the training data period.

In [None]:
# items: load the article metadata table and preview its first rows
items = pd.read_csv(
    '../data/articles.csv',
)
items.head(5)

In [None]:
# Per-column profile of the items table: number of distinct values and the
# percentage of rows that are null.
n_rows = len(items)
for col in items.columns:
    column = items[col]
    n_distinct = len(column.unique())  # a NaN entry, if present, is counted as one value
    null_percentage = column.isnull().sum() * 100 / n_rows
    print(f'''Unique values in column '{col}', {n_distinct}, null value {null_percentage}''')

# 105,542 items
# minimal missing values

In [None]:
# developing mental models for values in features and understanding what they represent
print(items['garment_group_name'].unique())

# articles.csv is read without an explicit dtype, so pandas parses article_id as a
# number and the leading zero is lost; comparing that column directly with the
# string '0706016001' matches no rows. Build a zero-padded 10-character string key
# first so the lookup works whether the column holds integers or strings.
article_key = items['article_id'].astype(str).str.zfill(10)
print(items[article_key == '0706016001'])

# feature values

# article_id: Unique identifier for each article/item.
# product_code: A code assigned to the product, possibly for internal categorization.

# prod_name: The name of the product.
# product_type_no: A numerical code for the product type.

# product_type_name: The name of the product type (e.g., shirt, pants, dress).

# product_group_name: The name of the broader group the product belongs to.

# graphical_appearance_no: A numerical code for the product's visual appearance.
# graphical_appearance_name: Description of the product's visual appearance (e.g., solid, patterned).

# colour_group_code: A code representing the color group.
# colour_group_name: The name of the color group (e.g., blue, red, green).

# perceived_colour_value_id: An ID for the perceived color value.
# perceived_colour_value_name: The name of the perceived color value (e.g., light, dark, bright).

# perceived_colour_master_id: An ID for the master perceived color.
# perceived_colour_master_name: The name of the master perceived color.

# department_no: A numerical code for the department.
# department_name: The name of the department (e.g., women's wear, men's wear).

# index_code: A code for indexing purposes.
# index_name: The name associated with the index code.

# index_group_no: A numerical code for the index group.
# index_group_name: The name of the index group.

# section_no: A numerical code for the section.
# section_name: The name of the section.

# garment_group_no: A numerical code for the garment group.
# garment_group_name: The name of the garment group (e.g., tops, bottoms).

# detail_desc: A detailed description of the product.

In [None]:
# Why are there more numbers than names at times? - checking data quality
# Lists every distinct (section_no, section_name) pair ordered by name; the
# original row labels are kept as an extra `index` column by reset_index().
(
    items[['section_no', 'section_name']]
    .drop_duplicates()
    .sort_values(by=['section_name'])
    .reset_index()
)

In [None]:
# Load the customer metadata table.
users = pd.read_csv("../data/customers.csv")
# A bare `users.head()` is only rendered when it is the last expression of a cell;
# the loop below follows it, so the preview has to be printed explicitly.
print(users.head())

# Per-column profile: number of distinct values and percentage of null rows.
for col in users.columns:
    unique_values = users[col].unique()
    null_percentage = users[col].isnull().sum() * 100 / len(users)
    print(f'''Unique values in column '{col}', {len(unique_values)}, null value {null_percentage}''')


# 1,371,980 customers
# quite a few missing values

# customer_id: A unique alphanumeric identifier assigned to each individual customer in the H&M database to track and distinguish their interactions and purchases.
# FN: A binary flag indicating whether the customer has opted in to receive fashion-related notifications or communications from H&M.
# Active: A status indicator representing whether the customer has made a purchase or engaged with H&M's services within a specific recent time frame.
# club_member_status: A categorical variable representing the customer's current membership level or participation in H&M's loyalty or rewards program.
# fashion_news_frequency: A categorical feature describing how often the customer wishes to receive fashion-related updates, newsletters, or promotional content from H&M.
# age: A numerical representation of the customer's age, which can be used for demographic segmentation and targeted marketing strategies.
# postal_code: A geographical identifier representing the customer's residential or primary shipping address, useful for regional analysis and localized marketing efforts.

In [None]:
# Load the purchase history; article_id is kept as text instead of being parsed
# as a number.
interaction = pd.read_csv(
    "../data/transactions_train.csv",
    dtype={'article_id': str},
)
interaction.head(5)

# t_dat: The date of the transaction, likely in a standard date format.
# customer_id: A unique identifier for the customer who made the purchase, corresponding to the customer_id in the customers.csv file.
# article_id: A unique identifier for the purchased article, matching the article_id in the articles.csv file.
# price: The price of the article at the time of purchase, presumably in the relevant currency.
# sales_channel_id: An identifier representing the sales channel through which the purchase was made, possibly distinguishing between online and in-store purchases.

In [None]:
# Distinct sales-channel codes that occur in the transactions.
channel_ids = interaction['sales_channel_id'].unique()
print(channel_ids)

In [None]:
# IF YOU ARE HAVING MEMORY ISSUES - USE THIS
# memory tricks (Ref: https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/308635)

# Replace the hex-string customer_id by the integer value of its last 16 hex digits.
# The conversion is guarded so that re-running this cell is a no-op; without the
# guard a second run fails while trying to slice an already-converted integer.
if not pd.api.types.is_integer_dtype(interaction['customer_id']):
    interaction['customer_id'] = (
        interaction['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
    )

# Downcast the remaining columns to narrower dtypes; these casts are safe to repeat.
interaction['article_id'] = interaction['article_id'].astype('int32')
interaction['price'] = interaction['price'].astype('float32')
interaction['sales_channel_id'] = interaction['sales_channel_id'].astype('int8')
interaction.head()

In [9]:
# understand selling patterns - this can inform recommendation system
# Example -> https://www.kaggle.com/code/vanguarde/h-m-eda-first-look

# TODO: HW1
# Some questions to answer
# - What are the most popular items?
# - What are the most popular departments? The most popular items within each department?
# - Are there trends in selling patterns over time?
# - Explore other groups possible - garment_group_name, product_type_name, product_group_name

In [33]:
# Baseline (Popularity recommender)
# Ref: https://www.kaggle.com/code/julian3833/h-m-content-based-12-most-popular-items-0-007
# Every customer receives the same prediction: the 12 most purchased articles
# among the recent transactions.
df = pd.read_csv('../data/transactions_train.csv', dtype={'article_id': str})
df_sub = pd.read_csv('../data/sample_submission.csv')

# Keep transactions dated after 2020-09-01. t_dat is compared as text here, which
# orders correctly only for ISO-style YYYY-MM-DD dates - TODO confirm the format.
recent = df[df['t_dat'] > '2020-09-01']

# Number of purchase rows per article, most purchased first, top 12 article ids.
top_articles = (
    recent.groupby('article_id')['customer_id']
    .count()
    .sort_values(ascending=False)
    .head(12)
    .index.tolist()
)
df_sub['prediction'] = ' '.join(top_articles)

# to_csv does not create missing directories, so make sure ../preds exists first.
Path('../preds').mkdir(parents=True, exist_ok=True)
df_sub.to_csv('../preds/baseline_pred.csv', index=False)

# Should get performance > 0.005 MAP@12

In [None]:
# Metrics => MAP@k (mean average precision at k)
# Reference for the code and for understanding the metric:
# https://www.kaggle.com/code/nandeshwar/mean-average-precision-map-k-metric-explained-code
# Kaggle is a great community for discussions like this! So get started today!

def apk(actual, predicted, k=12):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two collections
    of items.

    Parameters
    ----------
    actual : list-like
             The elements that are to be predicted (order doesn't matter).
             Any iterable of items is accepted (list, tuple, set, numpy array,
             pandas Series, ...); ``None`` is treated as empty.
    predicted : list-like
                The predicted elements (order does matter). Any iterable of
                items is accepted.
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            ``actual`` is empty or ``None``.
    """
    # Materialise both inputs as plain lists. Truth-testing a numpy array or a
    # pandas Series raises "truth value is ambiguous", and `in` on a Series tests
    # the index rather than the values; plain lists avoid both problems while
    # leaving the result for list inputs unchanged.
    actual = [] if actual is None else list(actual)
    if not actual:
        return 0.0

    predicted = list(predicted)
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # first condition checks whether it is a valid prediction
        # second condition checks that the prediction is not a repeat of an
        # earlier position, so each relevant item is credited at most once
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            # precision at this cut-off: hits so far / positions seen so far
            score += num_hits / (i + 1.0)

    # normalise by the best achievable number of hits within the first k positions
    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.

    Pairs up the entries of ``actual`` and ``predicted`` position by position,
    scores each pair with ``apk`` and averages the scores.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_pair_scores = []
    # zip stops at the shorter of the two inputs
    for relevant_items, ranked_items in zip(actual, predicted):
        per_pair_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_pair_scores)

In [None]:
# TODO: HW2
## create a validation set

# 1. Take about last 2 weeks of interaction history
# 2. For each user see how many items are being interacted with
# 3. Use this as the user-item relevant set

# We will cover this in the next sessions

In [None]:
# TODO: HW3

# Based on the content-based recommender theory you have heard, how can you develop a content-based recommender?
# We will cover this in the next session as well!