In [1]:
import re

import pandas as pd
import plotly.express as px

In [2]:
# Article master data; article_id is read as text rather than parsed as a number.
df_article = pd.read_csv(
    '/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv',
    dtype={'article_id': str},
)

# Department-name patterns for the mixed-gender index groups.
# "men" / "women" are anchored at the start of a word so that "men" no longer
# fires inside "women", "complements", "garments", etc.; "boy", "girl" and
# "ladies" remain plain substring matches.
_MALE_DEPT_RE = re.compile(r'boy|\bmen')
_FEMALE_DEPT_RE = re.compile(r'girl|ladies|\bwomen')


def set_gender_flg(x):
    """Add 0/1 target-audience flags to one article row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series passed by ``DataFrame.apply(axis=1)``)
        Must provide ``index_group_name``, ``department_name`` and ``section_name``.

    Returns
    -------
    The same object ``x``, mutated in place, with three added keys:
    ``is_for_male``, ``is_for_female`` and ``is_for_mama``.

    Rules
    -----
    * 'Ladieswear' / 'Divided' index groups -> female.
    * 'Menswear' index group -> male.
    * 'Baby/Children' / 'Sport' index groups -> decided from keywords in the
      lower-cased department name (both flags may be set, or neither).
    * section 'Mama' -> mama (independent of the flags above).
    """
    x['is_for_male'] = 0
    x['is_for_female'] = 0
    x['is_for_mama'] = 0

    group = x['index_group_name']
    if group in ('Ladieswear', 'Divided'):
        x['is_for_female'] = 1
    elif group == 'Menswear':
        x['is_for_male'] = 1
    elif group in ('Baby/Children', 'Sport'):
        dept = x['department_name']
        # A missing department (NaN / None) carries no gender hint; treat as empty.
        dept = dept.lower() if isinstance(dept, str) else ''
        if _MALE_DEPT_RE.search(dept):
            x['is_for_male'] = 1
        if _FEMALE_DEPT_RE.search(dept):
            x['is_for_female'] = 1

    if x['section_name'] == 'Mama':
        x['is_for_mama'] = 1
    return x

# Run the flagging function once per article row (axis='columns' == axis=1),
# then preview the first rows of the enriched table.
df_article = df_article.apply(set_gender_flg, axis='columns')
df_article.head()

# Product sales seasonality

In [3]:
# Composite product-type key: "<index group>_<index>_<product type>".
df_article['idxgrp_idx_prdtyp'] = df_article['index_group_name'] + '_' + df_article['index_name'] + '_' + df_article['product_type_name']

df_trans = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',dtype={'article_id': str})
# Downsampling due to memory limits. The seed is fixed so that every figure and
# every derived feature below is reproducible across kernel restarts.
df_trans = df_trans.sample(frac=0.1, random_state=0)
df_trans['t_dat'] = pd.to_datetime(df_trans['t_dat'])
df_trans['YYYY_MM'] = df_trans['t_dat'].dt.year.astype(str) + '_' + df_trans['t_dat'].dt.month.astype(str)
df_trans['year'] = df_trans['t_dat'].dt.year
df_trans['month'] = df_trans['t_dat'].dt.month
# Attach article attributes to each transaction; the source frames are then
# dropped to free memory.
df = pd.merge(df_trans, df_article, on='article_id', how='left')
del df_trans, df_article

# dfgrp1: total sales per product type over the whole (sampled) period.
# dfgrp2: sales per product type and calendar month.
dfgrp1 = df.groupby(['idxgrp_idx_prdtyp'])[['price']].sum().reset_index()
dfgrp2 = df.groupby(['idxgrp_idx_prdtyp', 'year', 'month'])[['price']].sum().reset_index()
# After the merge, price_x is the monthly sum and price_y the overall sum.
dfgrp2 = pd.merge(dfgrp2, dfgrp1, on='idxgrp_idx_prdtyp', how='left')
# Share (in %) of a product type's total sales that falls into each month.
dfgrp2['monthsales/ttl-sales'] = dfgrp2['price_x'] / dfgrp2['price_y'] * 100
dfgrp2['ym_date'] = dfgrp2['year'].astype(str) + '-' + dfgrp2['month'].astype(str) + '-1'
dfgrp2['ym_date'] = pd.to_datetime(dfgrp2['ym_date'])
# Wide layout: one row per month, one column per product type; months without
# sales for a product type become 0.
dfgrp2 = pd.pivot_table(dfgrp2, index='ym_date', columns='idxgrp_idx_prdtyp', values='monthsales/ttl-sales').reset_index().fillna(0)
display(dfgrp2.head())

fig = px.line(dfgrp2, x='ym_date', y=['Ladieswear_Ladieswear_Jacket',
                                      'Divided_Divided_Jacket',
                                      'Menswear_Menswear_Jacket',
                                      'Ladieswear_Ladieswear_Sweater',
                                      'Divided_Divided_Sweater',
                                      'Menswear_Menswear_Sweater'], title="MonthlySales / TTL sales")
fig.show()

In [4]:
# The ten product types whose monthly sales share has the lowest correlation
# with ladies' jackets. The datetime column is dropped explicitly before
# corr(): newer pandas versions no longer silently discard non-numeric columns.
display(
    dfgrp2.drop(columns='ym_date')
    .corr()[['Ladieswear_Ladieswear_Jacket']]
    .reset_index()
    .sort_values(by='Ladieswear_Ladieswear_Jacket')
    .head(10)
)

fig = px.line(dfgrp2, x='ym_date', y=['Ladieswear_Ladieswear_Shorts',
                                      'Divided_Divided_Shorts',
                                      'Menswear_Menswear_Shorts',
                                      'Ladieswear_Lingeries/Tights_Swimwear bottom',
                                      'Ladieswear_Lingeries/Tights_Swimsuit',
                                      'Menswear_Menswear_Swimwear bottom'], title="MonthlySales / TTL sales")
fig.show()

# Classify seasonal products by GMM clustering

In [5]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Every pivoted column except the date axis is one product type's monthly
# sales-share series.
feat_cols = [col for col in dfgrp2.columns if col != 'ym_date']

# Standardise each product-type column, then fit a 5-component PCA where the
# rows (months) are the samples and the product types are the features.
df_pca = StandardScaler().fit_transform(dfgrp2[feat_cols])
# random_state only has an effect if scikit-learn selects a randomized SVD
# solver; it is pinned so the components are reproducible either way.
model_pca = PCA(n_components=5, random_state=0)
model_pca.fit(df_pca)
feature = model_pca.transform(df_pca)

# components_.T has one row per product type and one column per principal
# component, i.e. the loadings of each product type.
df_eigen = model_pca.components_.T
df_eigen = pd.DataFrame(df_eigen,
                        index=None,
                        columns=['PC1','PC2','PC3','PC4','PC5'])
df_eigen['idxgrp_idx_prdtyp'] = feat_cols
# Correlation of each product type with ladies' jackets, used to colour the
# scatter. ym_date is dropped before corr() because newer pandas versions no
# longer silently discard non-numeric columns.
df_eigen = pd.merge(
    df_eigen,
    dfgrp2.drop(columns='ym_date').corr()[['Ladieswear_Ladieswear_Jacket']].reset_index().rename(columns={'Ladieswear_Ladieswear_Jacket': 'autumn_sales_indicator'}),
    on='idxgrp_idx_prdtyp',
    how='left'
)
px.scatter(df_eigen, x='PC1', y='PC2', hover_name='idxgrp_idx_prdtyp', color='autumn_sales_indicator')

In [6]:
from sklearn.mixture import GaussianMixture
# Cluster product types on their first three PCA loadings with a 4-component
# Gaussian mixture. random_state is fixed: without it the initialisation is
# random, so cluster ids and membership probabilities change between runs.
gmm = GaussianMixture(n_components=4, covariance_type='full', random_state=0)
gmm_features = df_eigen[['PC1','PC2','PC3']]
gmm.fit(gmm_features)
df_eigen['product_seasonal_type'] = gmm.predict(gmm_features)
# Compute the membership probabilities once and fan them out into
# prob_cluster1 .. prob_cluster4 (column k holds component k-1).
cluster_probs = gmm.predict_proba(gmm_features)
for cluster_idx in range(gmm.n_components):
    df_eigen[f'prob_cluster{cluster_idx + 1}'] = cluster_probs[:, cluster_idx]
px.scatter(df_eigen, x='PC1', y='PC2', hover_name='idxgrp_idx_prdtyp', color='product_seasonal_type')

In [7]:
# Line chart of three dfgrp2 product-type columns plotted over ym_date.
fig = px.line(
    data_frame=dfgrp2,
    x='ym_date',
    y=[
        'Ladieswear_Ladies Accessories_Hair string',
        'Divided_Divided_Belt',
        'Ladieswear_Lingeries/Tights_Unknown',
    ],
    title="Sales transition",
)
fig.show()

In [8]:
# Left-join the per-product-type columns from df_eigen (correlation indicator,
# cluster id, cluster probabilities) onto the transaction rows.
df = df.merge(
    df_eigen[[
        'idxgrp_idx_prdtyp',
        'autumn_sales_indicator',
        'product_seasonal_type',
        'prob_cluster1',
        'prob_cluster2',
        'prob_cluster3',
        'prob_cluster4',
    ]],
    how='left',
    on='idxgrp_idx_prdtyp',
)
# The monthly aggregates are released here.
del dfgrp1, dfgrp2
df.head()

# Detect products that will no longer be sold in late September 2020

In [9]:
# Per-article sales: overall, before 2019-09-01, and from 2019-09-01 onwards.
# The second bucket uses >= so that transactions dated exactly 2019-09-01 are
# not lost from both halves.
dfagg1 = df.groupby(['article_id'])[['price']].sum().reset_index().rename(columns={'price': 'article_ttl_sales'})
dfagg2 = df[df['t_dat'] < '2019-09-01'].groupby(['article_id'])[['price']].sum().reset_index().rename(columns={'price': 'before_201909_article_ttl_sales'})
dfagg3 = df[df['t_dat'] >= '2019-09-01'].groupby(['article_id'])[['price']].sum().reset_index().rename(columns={'price': 'after_201909_article_ttl_sales'})

# Left joins keep every article. An article with no sales in one period gets 0
# for that period instead of being dropped -- articles sold only before the
# cutoff are precisely the ones this section is trying to detect.
dfagg1 = pd.merge(dfagg1, dfagg2, on='article_id', how='left')
dfagg1 = pd.merge(dfagg1, dfagg3, on='article_id', how='left')
period_cols = ['before_201909_article_ttl_sales', 'after_201909_article_ttl_sales']
dfagg1[period_cols] = dfagg1[period_cols].fillna(0)
# Share (in %) of an article's total sales made before the cutoff; more than
# 95 % marks the article as no longer on sale.
dfagg1['before_201909_article_ttl_sales_ratio'] = dfagg1['before_201909_article_ttl_sales'] / dfagg1['article_ttl_sales'] * 100
dfagg1['not_for_sales_flg'] = (dfagg1['before_201909_article_ttl_sales_ratio'] > 95).astype(int)

display(dfagg1.sort_values(by='before_201909_article_ttl_sales_ratio', ascending=False).head(5))
# Sales timeline of one hard-coded example article.
dftmp = df[df['article_id']=='0698276001'].sort_values(by='t_dat')
fig = px.bar(dftmp, x='t_dat', y='price', title="Not for sales article example")
fig.show()

In [10]:
# Left-join the per-article pre-cutoff sales ratio and its flag onto the
# transaction rows.
df = df.merge(
    dfagg1[[
        'article_id',
        'before_201909_article_ttl_sales_ratio',
        'not_for_sales_flg',
    ]],
    how='left',
    on='article_id',
)
df.head()

# Classify products as "expensive" or "cheap" relative to the average price of their product type

In [11]:
# Mean transaction price per product type, as a two-column lookup table.
dfagg = (
    df.groupby('idxgrp_idx_prdtyp')['price']
    .mean()
    .rename('idxgrp_idx_prdtyp_mean_price')
    .reset_index()
)
# Left-join that mean back onto every transaction row.
df = df.merge(
    dfagg[['idxgrp_idx_prdtyp', 'idxgrp_idx_prdtyp_mean_price']],
    how='left',
    on='idxgrp_idx_prdtyp',
)
df.head()