In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## EDA for articles.csv

In [None]:
articles_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv")

In [None]:
articles_df.head()

In [None]:
articles_df.describe()

In [None]:
len(articles_df.columns)

In [None]:
# iterating the columns
for col in articles_df.columns:
    print(col)

### Describe for each column

In [None]:
len(articles_df['article_id'].unique())

### Retrived only needed columns

In [None]:
reduced_article_df = articles_df[['article_id', 'product_code', 'prod_name', 'product_type_no', 'product_group_name', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id', 'index_code', 'index_group_no', 'section_no', 'garment_group_no', 'detail_desc']]

In [None]:
reduced_article_df.head()

## EDA for transactions_train.csv

In [None]:
transactions_train_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
transactions_train_df.head()

In [None]:
transactions_train_df['t_dat'].unique()

### Convert to datetime column

In [None]:
transactions_train_df['t_dat'] = pd.to_datetime(transactions_train_df['t_dat'])

### Retrived year 2018 data

In [None]:
df_filtered = transactions_train_df[transactions_train_df['t_dat'].dt.strftime('%Y') == '2018']

### Save to file for local train

In [None]:
df_filtered.to_csv("2020_data.csv")

In [None]:
len(df_filtered)

### Deal with SettingWithCopyWarning in Pandas

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'

### Drop uncessary columns

In [None]:
df_filtered.drop(['t_dat', 'price', 'sales_channel_id'], axis=1, inplace=True)

In [None]:
df_filtered.head()

### Merge DF (reduced_article_df + df_filtered) on article_id column

In [None]:
merged_df = pd.merge(reduced_article_df, df_filtered, how='right', on = 'article_id')

In [None]:
len(merged_df)

In [None]:
merged_df.head()

### Retrieved columns which only needed for content-based recommendation

In [None]:
contents_df = merged_df[['product_code', 'prod_name', 'product_group_name', 'detail_desc']]

In [None]:
len(contents_df)

### Drop duplicated rows

In [None]:
unique_product_df = contents_df.drop_duplicates()

### Drop Nan rows

In [None]:
unique_product_df = unique_product_df.dropna()

In [None]:
len(unique_product_df)

### Merged all content columns into one ('detail_desc')

In [None]:
unique_product_df['detail_desc'] = unique_product_df['detail_desc']+unique_product_df['prod_name']+unique_product_df['product_group_name']

In [None]:
unique_product_df.head()

### Drop uncessary columns

In [None]:
unique_product_df.drop(['prod_name', 'product_group_name'], axis=1, inplace=True)

### Merge row data with same product_code

In [None]:
unique_product_df= unique_product_df.assign(detail_desc=unique_product_df.groupby('product_code')['detail_desc'].ffill()).drop_duplicates('product_code', keep='last')

In [None]:
unique_product_df['detail_desc']

### Load Universal Sentence Encoder Model (v4)

In [None]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

### Create Embedding

In [None]:
embeddings = embed(unique_product_df['detail_desc'].tolist())

print(embeddings)

In [None]:
embeddings.shape

### Retrieved unique product_code

In [None]:
product_code = set(unique_product_df['product_code'])

In [None]:
len(product_code)

### Create Cosine Similarity Matrix using sklearn library

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(embeddings)

### Create DataFrame by setting product_code as DF index

In [None]:
sim = pd.DataFrame(data=sim,columns=list(product_code),index=list(product_code))
sim.head()

In [None]:
sim.index

In [None]:
sim.loc[655366]

### Dump Similarity DF as pickle file for further use

In [None]:
import pickle
modelfile=sim.to_numpy()
with open('modelfile.pickle', 'wb') as f:
    pickle.dump(modelfile, f)

### Load Model pickle file

In [None]:
openpkl = pickle.load(open('modelfile.pickle', 'rb'))

In [None]:
openpkl

### User-defined function to find top 5 similar products

In [None]:
def recommended_products(product_id):
   
    show_cos_sim = sim.loc[product_id]
    rank = np.argsort(show_cos_sim.values)[::-1]
    arr= rank[1:13]
    arr=arr.flatten()
    return ' '.join(map(str, arr.tolist()))

### Sample search with product code `655366`

In [None]:
recommended_products_dict = recommended_products(655366) 
recommended_products_dict

### For each user find related recommend items

In [None]:
recommended_list = {}
for item in product_code:
    recommended_list[item] = recommended_products(item)

### Data Cleaning to merged_df

In [None]:
merged_df.drop_duplicates(inplace=True)

In [None]:
merged_df.dropna(inplace=True)

In [None]:
len(merged_df)

In [None]:
merged_df.head()

### Retrieved Entries which product code contained in training data (product_code)

In [None]:
article_ids =merged_df[merged_df['product_code'].isin(product_code)][['article_id', 'product_code']]

In [None]:
article_ids.drop_duplicates(inplace=True)

In [None]:
article_ids.dropna(inplace=True)

In [None]:
len(article_ids)

In [None]:
article_ids.head()

In [None]:
article_ids.rename(columns={'product_code': 'prediction'}, inplace=True)

### ReMap pandas data values in DF column

In [None]:
article_ids.replace({"prediction": recommended_list}, inplace=True)

In [None]:
article_ids.head()

In [None]:
len(article_ids)

In [None]:
merged_article_df = pd.merge(article_ids, merged_df, how='left', on = 'article_id')[['customer_id', 'prediction']]

In [245]:
merged_article_df.head()

Unnamed: 0,customer_id,prediction
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,18370 5373 10457 16553 17000 19709 14541 3175 ...
1,3681748607f3287d2c3a65e00bb5fb153de30e9becf158...,18370 5373 10457 16553 17000 19709 14541 3175 ...
2,4ef5967ff17bf474bffebe5b16bd54878e1d4105f7b4ed...,18370 5373 10457 16553 17000 19709 14541 3175 ...
3,6b7b10d2d47516c82a6f97332478dab748070f09693f09...,18370 5373 10457 16553 17000 19709 14541 3175 ...
4,8ac137752bbe914aa4ae6ad007a9a0c5b67a1ab2b2d474...,18370 5373 10457 16553 17000 19709 14541 3175 ...


In [253]:
len(merged_article_df)

580471

### Check duplicate values

In [252]:
merged_article_df['customer_id'].duplicated()

0          False
1          False
2          False
3          False
4          False
           ...  
3828999    False
3829090    False
3829114    False
3829115    False
3829380    False
Name: customer_id, Length: 580471, dtype: bool

In [251]:
merged_article_df.drop_duplicates(subset = ['customer_id'], keep = 'first', inplace = True) 

In [254]:
merged_article_df.to_csv("submission.csv", index=False)

## EDA for customers.csv

In [None]:
customer_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv")
customer_df.head()

In [None]:
customer_df.head()

## EDA for sample_submission.csv

In [None]:
sample_submission_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
sample_submission_df.head()

In [None]:
test=sample_submission_df['prediction'][0]

In [None]:
type(test)

In [None]:
arr = test.split()

In [None]:
len(arr)