In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## EDA for articles.csv

In [2]:
articles_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv")

In [None]:
articles_df.head()

In [None]:
articles_df.describe()

In [None]:
len(articles_df.columns)

In [None]:
# iterating the columns
for col in articles_df.columns:
    print(col)

### Describe for each column

In [None]:
len(articles_df['article_id'].unique())

### Retrived only needed columns

In [3]:
reduced_article_df = articles_df[['article_id', 'product_code', 'prod_name', 'product_type_no', 'product_group_name', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id', 'index_code', 'index_group_no', 'section_no', 'garment_group_no', 'detail_desc']]

In [None]:
reduced_article_df.head()

## EDA for transactions_train.csv

In [4]:
transactions_train_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
transactions_train_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [None]:
transactions_train_df['t_dat'].unique()

In [None]:
len(transactions_train_df)

### Convert to datetime column

In [5]:
transactions_train_df['t_dat'] = pd.to_datetime(transactions_train_df['t_dat'])

### Retrived year 2018 data

In [6]:
df_filtered = transactions_train_df[transactions_train_df['t_dat'].dt.strftime('%Y') == '2018']

In [7]:
len(df_filtered)

4411262

### Deal with SettingWithCopyWarning in Pandas

In [8]:
pd.options.mode.chained_assignment = None  # default='warn'

### Drop uncessary columns

In [9]:
df_filtered.drop(['t_dat', 'price', 'sales_channel_id'], axis=1, inplace=True)

In [10]:
df_filtered.head()

Unnamed: 0,customer_id,article_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004


In [11]:
len(df_filtered)

4411262

### Merge DF (reduced_article_df + df_filtered) on article_id column

In [12]:
merged_df = pd.merge(reduced_article_df, df_filtered, how='right', on = 'article_id')

In [13]:
len(merged_df)

4411262

In [None]:
merged_df.head()

### Retrieved columns which only needed for content-based recommendation

In [14]:
contents_df = merged_df[['product_code', 'prod_name', 'product_group_name', 'detail_desc']]

In [15]:
len(contents_df)

4411262

### Drop duplicated rows

In [16]:
unique_product_df = contents_df.drop_duplicates()

### Drop Nan rows

In [17]:
unique_product_df = unique_product_df.dropna()

In [18]:
len(unique_product_df)

22237

### Merged all content columns into one ('detail_desc')

In [19]:
unique_product_df['detail_desc'] = unique_product_df['detail_desc']+unique_product_df['prod_name']+unique_product_df['product_group_name']

In [20]:
unique_product_df.head()

Unnamed: 0,product_code,prod_name,product_group_name,detail_desc
0,663713,Atlanta Push Body Harlow,Underwear,"Lace push-up body with underwired, moulded, pa..."
1,541518,Rae Push (Melbourne) 2p,Underwear,"Lace push-up bras with underwired, moulded, pa..."
2,505221,Inca Jumper,Garment Upper body,Jumper in rib-knit cotton with hard-worn detai...
3,685687,W YODA KNIT OL OFFER,Garment Upper body,V-neck knitted jumper with long sleeves and ri...
6,505221,Inca jumper,Garment Upper body,Jumper in rib-knit cotton with hard-worn detai...


### Drop uncessary columns

In [21]:
unique_product_df.drop(['prod_name', 'product_group_name'], axis=1, inplace=True)

### Merge row data with same product_code

In [22]:
unique_product_df= unique_product_df.assign(detail_desc=unique_product_df.groupby('product_code')['detail_desc'].ffill()).drop_duplicates('product_code', keep='last')

In [None]:
len(unique_product_df)

In [None]:
unique_product_df['detail_desc']

### Load Universal Sentence Encoder Model (v4)

In [23]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

2022-02-21 09:59:15.427688: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-21 09:59:15.548060: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-21 09:59:15.548899: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-21 09:59:15.550584: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

### Create Embedding

In [26]:
embeddings = embed(unique_product_df['detail_desc'].tolist())

print(embeddings)

tf.Tensor(
[[-0.01065608  0.01826968 -0.04593364 ...  0.05730452 -0.06756479
   0.04370686]
 [ 0.00455232 -0.03420905 -0.04517129 ...  0.01441733 -0.06635971
   0.00912433]
 [ 0.00279254  0.02062127 -0.04614714 ...  0.02384813 -0.04939635
   0.02162774]
 ...
 [-0.01820225  0.06077004  0.00524198 ... -0.03439009 -0.04590836
   0.00299223]
 [-0.05664276 -0.02204696 -0.06020563 ...  0.02891629 -0.04716483
   0.02219471]
 [-0.05621359  0.01494409 -0.05081891 ...  0.05227846 -0.06189832
  -0.0416698 ]], shape=(21249, 512), dtype=float32)


In [27]:
embeddings.shape

TensorShape([21249, 512])

### Retrieved unique product_code

In [28]:
product_code = set(unique_product_df['product_code'])

In [29]:
len(product_code)

21249

### Create Cosine Similarity Matrix using sklearn library

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(embeddings)

### Create DataFrame by setting product_code as DF index

In [31]:
sim = pd.DataFrame(data=sim,columns=list(product_code),index=list(product_code))
sim.head()

Unnamed: 0,655366,524302,524305,524307,524313,655403,655410,655434,524363,655437,...,655267,655268,655271,655272,655287,655291,655292,655331,655347,655351
655366,1.0,0.851624,0.538012,0.553357,0.614788,0.530881,0.519516,0.643793,0.611574,0.433494,...,0.364786,0.13304,0.471892,0.611268,0.588819,0.570915,0.512542,0.27179,0.518533,0.561671
524302,0.851624,1.0,0.533368,0.487942,0.515292,0.480066,0.506876,0.596636,0.550087,0.371653,...,0.332437,0.192776,0.436685,0.533683,0.449791,0.501745,0.497236,0.311,0.431884,0.503306
524305,0.538012,0.533368,1.0,0.668629,0.667619,0.671663,0.706352,0.556987,0.669205,0.452664,...,0.333452,0.136424,0.535852,0.774392,0.529257,0.633041,0.451008,0.337748,0.576588,0.665883
524307,0.553357,0.487942,0.668629,1.0,0.560401,0.638976,0.606605,0.591301,0.606068,0.498862,...,0.365388,0.147239,0.511885,0.699806,0.554632,0.579177,0.521845,0.342155,0.553013,0.588546
524313,0.614788,0.515292,0.667619,0.560401,1.0,0.670114,0.657604,0.583227,0.785059,0.563721,...,0.322796,0.169245,0.47038,0.784744,0.621126,0.662104,0.47876,0.284926,0.686417,0.680695


In [None]:
sim.index

In [None]:
sim.loc[655366]

### Dump Similarity DF as pickle file for further use

In [None]:
import pickle
modelfile=sim.to_numpy()
with open('modelfile.pickle', 'wb') as f:
    pickle.dump(modelfile, f)

### Load Model pickle file

In [None]:
openpkl = pickle.load(open('modelfile.pickle', 'rb'))

In [None]:
openpkl

### User-defined function to find top 5 similar products

In [32]:
def recommended_products(product_id):
   
    show_cos_sim = sim.loc[product_id]
    rank = np.argsort(show_cos_sim.values)[::-1]
    arr= rank[1:13]
    arr=arr.flatten()
    return ' '.join(map(str, arr.tolist()))

### Sample search with product code `655366`

In [None]:
recommended_products_dict = recommended_products(655366) 
recommended_products_dict

### For each user find related recommend items

In [33]:
recommended_list = {}
for item in product_code:
    recommended_list[item] = recommended_products(item)

### Data Cleaning to merged_df

In [34]:
merged_df.drop_duplicates(inplace=True)

In [35]:
merged_df.dropna(inplace=True)

In [36]:
len(merged_df)

3829403

In [None]:
merged_df.head()

### Retrieved Entries which product code contained in training data (product_code)

In [37]:
article_ids =merged_df[merged_df['product_code'].isin(product_code)][['article_id', 'product_code']]

In [38]:
article_ids.drop_duplicates(inplace=True)

In [39]:
article_ids.dropna(inplace=True)

In [40]:
len(article_ids)

41797

In [None]:
article_ids.head()

In [41]:
article_ids.rename(columns={'product_code': 'prediction'}, inplace=True)

### ReMap pandas data values in DF column

In [42]:
article_ids.replace({"prediction": recommended_list}, inplace=True)

In [None]:
article_ids.head()

In [None]:
len(article_ids)

In [43]:
merged_article_df = pd.merge(article_ids, merged_df, how='left', on = 'article_id')[['customer_id', 'prediction']]

In [44]:
merged_article_df.head()

Unnamed: 0,customer_id,prediction
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,18370 5373 10457 16553 17000 19709 14541 3175 ...
1,3681748607f3287d2c3a65e00bb5fb153de30e9becf158...,18370 5373 10457 16553 17000 19709 14541 3175 ...
2,4ef5967ff17bf474bffebe5b16bd54878e1d4105f7b4ed...,18370 5373 10457 16553 17000 19709 14541 3175 ...
3,6b7b10d2d47516c82a6f97332478dab748070f09693f09...,18370 5373 10457 16553 17000 19709 14541 3175 ...
4,8ac137752bbe914aa4ae6ad007a9a0c5b67a1ab2b2d474...,18370 5373 10457 16553 17000 19709 14541 3175 ...


In [45]:
len(merged_article_df)

3829403

### Check duplicate values

In [None]:
merged_article_df['customer_id'].duplicated()

In [46]:
merged_article_df.drop_duplicates(subset = ['customer_id'], keep = 'first', inplace = True) 

## EDA for customers.csv

In [47]:
customer_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv")
customer_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [None]:
len(customer_df['customer_id'].unique())

In [51]:
customer_df.drop(['FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age', 'postal_code'], axis=1, inplace=True)

In [52]:
customer_df.head()

Unnamed: 0,customer_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...


In [53]:
merged_article_df = pd.merge(merged_article_df, customer_df, how='right', on = 'customer_id')

In [56]:
merged_article_df.isna()

Unnamed: 0,customer_id,prediction
0,False,False
1,False,False
2,False,False
3,False,True
4,False,False
...,...,...
1371975,False,True
1371976,False,False
1371977,False,False
1371978,False,True


In [59]:
filled_data = merged_article_df['prediction'][0]

In [67]:
merged_article_df['prediction'].fillna(filled_data, inplace=True)

In [68]:
merged_article_df.isna()

Unnamed: 0,customer_id,prediction
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
1371975,False,False
1371976,False,False
1371977,False,False
1371978,False,False


In [70]:
len(merged_article_df)

1371980

In [71]:
merged_article_df.to_csv("submission.csv", index=False)

## EDA for sample_submission.csv

In [None]:
sample_submission_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
sample_submission_df.head()

In [None]:
test=sample_submission_df['prediction'][0]

In [None]:
type(test)

In [None]:
arr = test.split()

In [None]:
len(arr)