In [23]:
import pandas as pd
import numpy as np

## Data Description

- images/ - a folder of images corresponding to each article_id; images are placed in subfolders starting with the first three digits of the article_id; note, not all article_id values have a corresponding image.
- articles.csv - detailed metadata for each article_id available for purchase
- customers.csv - metadata for each customer_id in dataset
- transactions_train.csv - the training data, consisting of the purchases each customer for each date, as well as additional information. Duplicate rows correspond to multiple purchases of the same item. Your task is to predict the article_ids each customer will purchase during the 7-day period immediately after the training data period.

In [24]:
# items
items = pd.read_csv('../data/articles.csv')
items.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [25]:
for col in items.columns:
    unique_values = items[col].unique()
    null_percentage = items[col].isnull().sum() * 100 / len(items)
    print(f'''Unique values in column '{col}', {len(unique_values)}, null value {null_percentage}''')

# 105,542 items

Unique values in column 'article_id', 105542, null value 0.0
Unique values in column 'product_code', 47224, null value 0.0
Unique values in column 'prod_name', 45875, null value 0.0
Unique values in column 'product_type_no', 132, null value 0.0
Unique values in column 'product_type_name', 131, null value 0.0
Unique values in column 'product_group_name', 19, null value 0.0
Unique values in column 'graphical_appearance_no', 30, null value 0.0
Unique values in column 'graphical_appearance_name', 30, null value 0.0
Unique values in column 'colour_group_code', 50, null value 0.0
Unique values in column 'colour_group_name', 50, null value 0.0
Unique values in column 'perceived_colour_value_id', 8, null value 0.0
Unique values in column 'perceived_colour_value_name', 8, null value 0.0
Unique values in column 'perceived_colour_master_id', 20, null value 0.0
Unique values in column 'perceived_colour_master_name', 20, null value 0.0
Unique values in column 'department_no', 299, null value 0.0
Un

In [26]:
# developing mental models for values in features and understanding what they represent
print(items['garment_group_name'].unique())

# feature values

# article_id: Unique identifier for each article/item.
# product_code: A code assigned to the product, possibly for internal categorization.

# prod_name: The name of the product.
# product_type_no: A numerical code for the product type.

# product_type_name: The name of the product type (e.g., shirt, pants, dress).

# product_group_name: The name of the broader group the product belongs to.

# graphical_appearance_no: A numerical code for the product's visual appearance.
# graphical_appearance_name: Description of the product's visual appearance (e.g., solid, patterned).

# colour_group_code: A code representing the color group.
# colour_group_name: The name of the color group (e.g., blue, red, green).

# perceived_colour_value_id: An ID for the perceived color value.
# perceived_colour_value_name: The name of the perceived color value (e.g., light, dark, bright).

# perceived_colour_master_id: An ID for the master perceived color.
# perceived_colour_master_name: The name of the master perceived color.

# department_no: A numerical code for the department.
# department_name: The name of the department (e.g., women's wear, men's wear).

# index_code: A code for indexing purposes.
# index_name: The name associated with the index code.

# index_group_no: A numerical code for the index group.
# index_group_name: The name of the index group.

# section_no: A numerical code for the section.
# section_name: The name of the section.

# garment_group_no: A numerical code for the garment group.
# garment_group_name: The name of the garment group (e.g., tops, bottoms).

# detail_desc: A detailed description of the product.

['Jersey Basic' 'Under-, Nightwear' 'Socks and Tights' 'Jersey Fancy'
 'Accessories' 'Trousers Denim' 'Outdoor' 'Shoes' 'Swimwear' 'Knitwear'
 'Shirts' 'Trousers' 'Dressed' 'Shorts' 'Dresses Ladies' 'Skirts'
 'Special Offers' 'Blouses' 'Unknown' 'Woven/Jersey/Knitted mix Baby'
 'Dresses/Skirts girls']


In [27]:
# why more numbers that names at times? - checking data quality
items[['section_no','section_name']].drop_duplicates().sort_values(by=['section_name']).reset_index()

Unnamed: 0,index,section_no,section_name
0,2494,41,Baby Boy
1,11,44,Baby Essentials & Complements
2,3768,40,Baby Girl
3,216,72,Boys Underwear & Basics
4,11195,97,Collaborations
5,322,21,Contemporary Casual
6,921,20,Contemporary Smart
7,242,55,Contemporary Street
8,1627,56,Denim Men
9,124,52,Divided Accessories


In [28]:
users = pd.read_csv("../data/customers.csv")
users.head()

for col in users.columns:
    unique_values = users[col].unique()
    null_percentage = users[col].isnull().sum() * 100 / len(users)
    print(f'''Unique values in column '{col}', {len(unique_values)}, null value {null_percentage}''')


# 1,371,980 customers

# customer_id: A unique alphanumeric identifier assigned to each individual customer in the H&M database to track and distinguish their interactions and purchases.
# FN: A binary flag indicating whether the customer has opted in to receive fashion-related notifications or communications from H&M.
# Active: A status indicator representing whether the customer has made a purchase or engaged with H&M's services within a specific recent time frame.
# club_member_status: A categorical variable representing the customer's current membership level or participation in H&M's loyalty or rewards program.
# fashion_news_frequency: A categorical feature describing how often the customer wishes to receive fashion-related updates, newsletters, or promotional content from H&M.
# age: A numerical representation of the customer's age, which can be used for demographic segmentation and targeted marketing strategies.
# postal_code: A geographical identifier representing the customer's residential or primary shipping address, useful for regional analysis and localized marketing efforts.

Unique values in column 'customer_id', 1371980, null value 0.0
Unique values in column 'FN', 2, null value 65.23783145526903
Unique values in column 'Active', 2, null value 66.15081852505139
Unique values in column 'club_member_status', 4, null value 0.441843175556495
Unique values in column 'fashion_news_frequency', 4, null value 1.1669995189434248
Unique values in column 'age', 85, null value 1.1560664149623172
Unique values in column 'postal_code', 352899, null value 0.0


In [29]:
interaction = pd.read_csv("../data/transactions_train.csv")
interaction.head()

# t_dat: The date of the transaction, likely in a standard date format.
# customer_id: A unique identifier for the customer who made the purchase, corresponding to the customer_id in the customers.csv file.
# article_id: A unique identifier for the purchased article, matching the article_id in the articles.csv file.
# price: The price of the article at the time of purchase, presumably in the relevant currency.
# sales_channel_id: An identifier representing the sales channel through which the purchase was made, possibly distinguishing between online and in-store purchases.

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [30]:
print(interaction['sales_channel_id'].unique())

[2 1]


In [31]:
# understand selling patterns
# Example -> https://www.kaggle.com/code/vanguarde/h-m-eda-first-look

# TODO

In [32]:
# Metrics => 
# Referece for code and understanding metrics:
# https://www.kaggle.com/code/nandeshwar/mean-average-precision-map-k-metric-explained-code
# Kaggle is a great community to for discussion like this! So get started today!

def apk(actual, predicted, k=12):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [34]:
# baseline (popularity recommender)
popular_items = list(interaction.groupby('article_id')['sales_channel_id'].\
    count().reset_index().sort_values(by=['sales_channel_id']).head(12)['article_id'])

print(popular_items)

[621245001, 620887001, 537346015, 512568001, 679622001, 494472001, 770221001, 597069001, 609648001, 537439001, 537461001, 581238002]


In [None]:
def create_sample_sub()
    cids = list(pd.read_csv('../data/sample_submission.csv')['customer_id'])
    