In [65]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## EDA for articles.csv

In [None]:
articles_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv")

In [None]:
articles_df.head()

In [None]:
articles_df.describe()

In [None]:
len(articles_df.columns)

In [None]:
# iterating the columns
for col in articles_df.columns:
    print(col)

### Describe for each column

In [None]:
len(articles_df['article_id'].unique())

### Retrived only needed columns

In [None]:
reduced_article_df = articles_df[['article_id', 'product_code', 'prod_name', 'product_type_no', 'product_group_name', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id', 'index_code', 'index_group_no', 'section_no', 'garment_group_no', 'detail_desc']]

In [None]:
reduced_article_df.head()

## EDA for transactions_train.csv

In [None]:
transactions_train_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
transactions_train_df.head()

In [None]:
transactions_train_df['t_dat'].unique()

### Convert to datetime column

In [None]:
transactions_train_df['t_dat'] = pd.to_datetime(transactions_train_df['t_dat'])

### Retrived year 2018 data

In [None]:
df_filtered = transactions_train_df[transactions_train_df['t_dat'].dt.strftime('%Y') == '2018']

### Save to file for local train

In [None]:
df_filtered.to_csv("2020_data.csv")

In [None]:
len(df_filtered)

### Deal with SettingWithCopyWarning in Pandas

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'

### Drop uncessary columns

In [None]:
df_filtered.drop(['t_dat', 'price', 'sales_channel_id'], axis=1, inplace=True)

In [None]:
df_filtered.head()

### Merge DF (reduced_article_df + df_filtered) on article_id column

In [None]:
merged_df = pd.merge(reduced_article_df, df_filtered, how='right', on = 'article_id')

In [None]:
len(merged_df)

In [None]:
merged_df.head()

### Retrieved columns which only needed for content-based recommendation

In [None]:
contents_df = merged_df[['product_code', 'prod_name', 'product_group_name', 'detail_desc']]

### Drop duplicated rows

In [None]:
unique_product_df = contents_df.drop_duplicates()

### Drop Nan rows

In [None]:
unique_product_df = unique_product_df.dropna()

In [66]:
len(unique_product_df)

22237

### Merged all content columns into one ('detail_desc')

In [None]:
unique_product_df['detail_desc'] = unique_product_df['detail_desc']+unique_product_df['prod_name']+unique_product_df['product_group_name']

In [75]:
unique_product_df.head()

Unnamed: 0,product_code,prod_name,product_group_name,detail_desc
0,663713,Atlanta Push Body Harlow,Underwear,"Lace push-up body with underwired, moulded, pa..."
1,541518,Rae Push (Melbourne) 2p,Underwear,"Lace push-up bras with underwired, moulded, pa..."
2,505221,Inca Jumper,Garment Upper body,Jumper in rib-knit cotton with hard-worn detai...
3,685687,W YODA KNIT OL OFFER,Garment Upper body,V-neck knitted jumper with long sleeves and ri...
6,505221,Inca jumper,Garment Upper body,Jumper in rib-knit cotton with hard-worn detai...


### Drop uncessary columns

In [77]:
unique_product_df.drop(['prod_name', 'product_group_name'], axis=1, inplace=True)

### Merge row data with same product_code

In [83]:
unique_product_df= unique_product_df.assign(detail_desc=unique_product_df.groupby('product_code')['detail_desc'].ffill()).drop_duplicates('product_code', keep='last')

In [84]:
unique_product_df['detail_desc']

0          Lace push-up body with underwired, moulded, pa...
1          Lace push-up bras with underwired, moulded, pa...
3          V-neck knitted jumper with long sleeves and ri...
6          Jumper in rib-knit cotton with hard-worn detai...
7          Blouse in a soft weave with a narrow collar, c...
                                 ...                        
4407877    Long, oversized jacket in washed sweatshirt fa...
4408582    Hipster briefs in lace with a mid waist, lined...
4409421    Suede ankle boots with elastication at the fro...
4410674    Shirt in premium cotton with a grandad collar,...
4410807    Straight, knee-length dress in a soft knit wit...
Name: detail_desc, Length: 21249, dtype: object

### Load Universal Sentence Encoder Model (v4)

In [None]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

### Create Embedding

In [85]:
embeddings = embed(unique_product_df['detail_desc'].tolist())

print(embeddings)

tf.Tensor(
[[-0.01065604  0.01826968 -0.04593362 ...  0.05730451 -0.06756478
   0.04370683]
 [ 0.00455238 -0.0342091  -0.04517129 ...  0.01441727 -0.0663597
   0.00912434]
 [ 0.0027926   0.02062124 -0.04614712 ...  0.02384809 -0.04939636
   0.02162772]
 ...
 [-0.01820223  0.06077003  0.00524196 ... -0.03439008 -0.04590835
   0.00299222]
 [-0.05664279 -0.02204691 -0.06020565 ...  0.0289163  -0.04716483
   0.02219461]
 [-0.05621357  0.01494415 -0.05081894 ...  0.05227847 -0.0618983
  -0.04166974]], shape=(21249, 512), dtype=float32)


In [86]:
embeddings.shape

TensorShape([21249, 512])

### Retrieved unique product_code

In [87]:
product_code = set(unique_product_df['product_code'])

In [88]:
len(product_code)

21249

### Create Cosine Similarity Matrix using sklearn library

In [89]:
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(embeddings)

### Create DataFrame by setting product_code as DF index

In [90]:
sim = pd.DataFrame(data=sim,columns=list(product_code),index=list(product_code))
sim.head()

Unnamed: 0,655366,524302,524305,524307,524313,655403,655410,655434,524363,655437,...,655267,655268,655271,655272,655287,655291,655292,655331,655347,655351
655366,1.0,0.851624,0.538012,0.553357,0.614788,0.530881,0.519516,0.643793,0.611574,0.433494,...,0.364786,0.13304,0.471892,0.611268,0.588819,0.570915,0.512542,0.27179,0.518533,0.56167
524302,0.851624,1.0,0.533368,0.487942,0.515292,0.480066,0.506876,0.596636,0.550088,0.371653,...,0.332437,0.192776,0.436684,0.533683,0.449791,0.501745,0.497236,0.311,0.431884,0.503306
524305,0.538012,0.533368,1.0,0.668629,0.667619,0.671663,0.706352,0.556987,0.669205,0.452664,...,0.333453,0.136424,0.535852,0.774392,0.529257,0.633041,0.451008,0.337748,0.576588,0.665883
524307,0.553357,0.487942,0.668629,1.0,0.560401,0.638976,0.606606,0.591301,0.606068,0.498862,...,0.365387,0.147239,0.511885,0.699806,0.554632,0.579177,0.521845,0.342155,0.553013,0.588546
524313,0.614788,0.515292,0.667619,0.560401,1.0,0.670114,0.657604,0.583227,0.785059,0.563721,...,0.322796,0.169245,0.470381,0.784745,0.621126,0.662104,0.478761,0.284926,0.686417,0.680695


In [101]:
sim.index

Int64Index([655366, 524302, 524305, 524307, 524313, 655403, 655410, 655434,
            524363, 655437,
            ...
            655267, 655268, 655271, 655272, 655287, 655291, 655292, 655331,
            655347, 655351],
           dtype='int64', length=21249)

In [116]:
sim.loc[655366]

655366    1.000000
524302    0.851624
524305    0.538012
524307    0.553357
524313    0.614788
            ...   
655291    0.570915
655292    0.512542
655331    0.271790
655347    0.518533
655351    0.561670
Name: 655366, Length: 21249, dtype: float32

### Dump Similarity DF as pickle file for further use

In [91]:
import pickle
modelfile=sim.to_numpy()
with open('modelfile.pickle', 'wb') as f:
    pickle.dump(modelfile, f)

### Load Model pickle file

In [92]:
openpkl = pickle.load(open('modelfile.pickle', 'rb'))

In [123]:
openpkl

array([[1.0000002 , 0.8516239 , 0.5380116 , ..., 0.27178988, 0.5185333 ,
        0.5616703 ],
       [0.8516239 , 1.0000002 , 0.5333676 , ..., 0.31099987, 0.4318842 ,
        0.5033062 ],
       [0.5380116 , 0.5333676 , 1.        , ..., 0.337748  , 0.576588  ,
        0.6658827 ],
       ...,
       [0.27178988, 0.31099987, 0.337748  , ..., 1.        , 0.29526678,
        0.38893133],
       [0.5185333 , 0.4318842 , 0.576588  , ..., 0.29526678, 1.0000001 ,
        0.6272614 ],
       [0.5616703 , 0.5033062 , 0.6658827 , ..., 0.38893133, 0.6272614 ,
        0.99999976]], dtype=float32)

### User-defined function to find top 5 similar products

In [150]:
def recommended_products(product_id):
   
    show_cos_sim = sim.loc[product_id]
    rank = np.argsort(show_cos_sim.values)[::-1]
    arr= rank[1:6]
    arr=arr.flatten()
    return  arr

### Sample search with product code `655366`

In [151]:
recommended_products_dict = recommended_products(655366) 
recommended_products_dict

array([ 1429,  5386,  2882, 19257, 21135])

## EDA for customers.csv

In [None]:
customer_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv")
customer_df.head()

In [None]:
customer_df.head()

## EDA for sample_submission.csv

In [None]:
sample_submission_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
sample_submission_df.head()

In [None]:
sample_submission_df.describe()