In [None]:
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
from matplotlib import gridspec
import pandas as pd
import seaborn as sns
import numpy as np

from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import math
import time
import re
import os

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout

plotly.offline.init_notebook_mode(connected = True)

import itertools
import pickle

# Data Importing and Initial analysis

In [None]:
# Load the raw product catalogue from a JSON file located relative to the notebook.
data = pd.read_json("../data/tops_fashion.json")

In [None]:
# (rows, columns) of the frame as loaded.
data.shape

In [None]:
# Column names available in the loaded frame.
data.columns

In [None]:
# Keep only the seven columns used by the rest of the notebook.
data = data[['asin','brand','color','medium_image_url','product_type_name','title','formatted_price']]

In [None]:
# Preview the first rows of the reduced frame.
data.head()

# Data Cleaning and understanding

In [None]:
def data_understanding(column):
    """Print a quick profile of one DataFrame column.

    Three sections are written to stdout: the output of ``column.describe()``,
    the array returned by ``column.unique()``, and the ten most frequent
    values with their counts.

    Parameters
    ----------
    column : pandas.Series
        The column to profile.
    """
    print("**********High level description**********\n")
    print(column.describe())

    print("\n **********Unique values********** \n")
    # A stray line-continuation backslash used to follow this call; it only
    # parsed because the next line happened to be blank.
    print(column.unique())

    print('\n **********Top ten values********** \n')
    print(Counter(list(column)).most_common(10))

In [None]:
## Profile the product_type_name column
data_understanding(data['product_type_name'])

In [None]:
## Profile the brand column
data_understanding(data['brand'])

In [None]:
## Profile the color column
data_understanding(data['color'])

In [None]:
## Profile the formatted_price column
data_understanding(data['formatted_price'])

In [None]:
## Profile the title column
data_understanding(data['title'])

In [None]:
## Keep only the rows that have both a price and a color
has_price = data['formatted_price'].notnull()
has_color = data['color'].notnull()
data = data.loc[has_price & has_color]

In [None]:
# (rows, columns) left after dropping rows without a price or a color.
data.shape

In [None]:
## Number of rows whose title repeats the title of an earlier row
print(data.duplicated('title').sum())

## Drop products whose title has four words or fewer
has_enough_words = data['title'].apply(lambda title: len(title.split()) > 4)
data_sorted = data[has_enough_words]
print("After removal of products with short description:", data_sorted.shape)

In [None]:
## Sort by title in reverse alphabetical order; the stage-1 de-duplication
## below compares each title with the ones that follow it in this order.
# Re-binding instead of inplace=True: data_sorted was produced by boolean
# filtering, and an in-place sort on such a frame triggers pandas'
# SettingWithCopy machinery and makes the cell harder to re-run safely.
data_sorted = data_sorted.sort_values('title', ascending=False)
data_sorted.head()

In [None]:
# Row labels of data_sorted in their current (sorted) order.
# Taken straight from the index: iterrows() would build a Series for every
# row just to read its label.
indices = list(data_sorted.index)

In [None]:
# Stage 1 de-duplication: walk the title-sorted products and keep one ASIN per
# run of neighbouring titles that differ from the run's first title in at most
# two word positions.
stage1_dedup_asins = []
num_data_points = data_sorted.shape[0]

# Split every title once up front; position k corresponds to row k of
# data_sorted, so no per-comparison label lookup is needed.
sorted_title_words = [title.split() for title in data_sorted['title']]
sorted_asins = list(data_sorted['asin'])

i = 0
while i < num_data_points:
    # a: words of the current representative title
    a = sorted_title_words[i]

    # advance j to the first later title that differs from a in more than two positions
    j = i + 1
    while j < num_data_points:
        b = sorted_title_words[j]

        # longer of the two titles, in words
        length = max(len(a), len(b))

        # number of positions at which both titles hold the same word;
        # zip_longest pads the shorter title with None, which never matches a word
        count = sum(1 for first, second in itertools.zip_longest(a, b) if first == second)

        if (length - count) > 2:
            break
        j += 1

    # Keep the representative of this run. Appending here (rather than only
    # when a differing title is found) also keeps the representative of the
    # final run, which the previous loop dropped when the scan reached the end.
    stage1_dedup_asins.append(sorted_asins[i])
    i = j

In [None]:
# Keep only the products whose ASIN survived stage 1.
data = data.loc[data['asin'].isin(stage1_dedup_asins)]

In [None]:
# Stage 2 de-duplication: compare every remaining title with every other one
# (not only with sorted neighbours) and keep one product per group of titles
# that differ in fewer than three word positions.
indices = list(data.index)

# Split each title once and look ASINs up by row label.
title_words = {label: title.split() for label, title in zip(data.index, data['title'])}
asin_by_label = dict(zip(data.index, data['asin']))

stage2_dedupe_asins = []
while len(indices) != 0:
    # take the last remaining product as the representative of its group
    i = indices.pop()
    stage2_dedupe_asins.append(asin_by_label[i])
    a = title_words[i]

    # Collect the products that are NOT near-duplicates of the representative.
    # Building a new list replaces removing from `indices` while iterating over
    # it, which skipped the element following every removal.
    survivors = []
    for j in indices:
        b = title_words[j]

        length = max(len(a), len(b))

        # number of positions at which both titles hold the same word
        count = sum(1 for first, second in itertools.zip_longest(a, b) if first == second)

        # fewer than three differing positions means "same apparel": drop j
        if (length - count) >= 3:
            survivors.append(j)
    indices = survivors

In [None]:
# Keep only the products whose ASIN survived stage 2.
data = data.loc[data['asin'].isin(stage2_dedupe_asins)]

In [None]:
# (rows, columns) left after both de-duplication stages.
data.shape

In [None]:
# Persist the de-duplicated frame to disk.
# NOTE(review): this runs before the title clean-up further down, so the file
# holds the uncleaned titles despite the "preprocessed" name — confirm intended.
data.to_pickle('../code/16k_apperal_data_preprocessed')

# Text Pre-Processing

In [None]:
## Function to remove stop words and other pre-processing task

def text_preprocessing(text, index, col):
    """Clean one text value and write it back into the global ``data`` frame.

    Every whitespace-separated token is stripped of non-alphanumeric
    characters and lower-cased. Tokens that end up empty or that appear in the
    global ``stop_words`` collection are dropped; the rest are joined with
    single spaces and stored at ``data.loc[index, col]``.

    Parameters
    ----------
    text : object
        The raw value. Anything that is not a ``str`` is left untouched.
    index : hashable
        Row label in ``data`` to update.
    col : str
        Column name in ``data`` to update.
    """
    if not isinstance(text, str):
        return

    kept = []
    for raw_token in text.split():
        # remove special characters, then convert to lower case
        word = "".join(ch for ch in raw_token if ch.isalnum()).lower()
        # Test the cleaned token: the raw one still carries punctuation and
        # capitals, so "The" or "and," would slip past the stop-word list.
        if word and word not in stop_words:
            kept.append(word)

    # Single .loc assignment; the chained form data[col][index] may write to a
    # temporary copy and leave `data` unchanged.
    data.loc[index, col] = " ".join(kept)


In [None]:
# text_preprocessing reads `stop_words` as a global, but no cell defined it, so
# a fresh kernel failed here with NameError. Build it from NLTK's English list
# (needs the 'stopwords' corpus, e.g. nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))

# Clean every title in place, one row at a time.
for index, row in data.iterrows():
    text_preprocessing(row['title'], index, 'title')

# Text Based Similarity models

In [None]:
# function to display an image
def display_img(url, ax, fig):
    """Download the image at ``url`` and draw it on ``ax``.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    ax : matplotlib axes or None
        Axes to draw on. When None, the current pyplot axes are used.
    fig : object
        Unused; kept so existing callers do not need to change.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Bound the wait so one unreachable image cannot hang the whole cell.
    response = requests.get(url, timeout=30)
    # Fail with the HTTP error itself instead of a confusing image-decoding error.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))

    # Draw on the axes the caller passed in rather than on whatever happens to
    # be the current pyplot axes.
    target_ax = ax if ax is not None else plt.gca()
    target_ax.imshow(img)

In [None]:
## Plotting heatmaps for algo decisions
def plot_heatmap(keys, values, labels, url, text):
    """Draw a one-row heatmap of word values next to the product image.

    Parameters
    ----------
    keys : list
        Tick labels for the heatmap columns.
    values : list
        Numbers that colour the heatmap cells (one per key).
    labels : list
        Annotations written inside the cells (one per key).
    url : str
        Image address handed to ``display_img``.
    text : str
        Title shown above the heatmap.
    """
    # 2x2 grid; only the two top cells are used (wide heatmap, narrow image)
    layout = gridspec.GridSpec(2, 2, width_ratios=[4, 1], height_ratios=[4, 1])
    figure = plt.figure(figsize=(25, 3))

    # top-left cell: single-row heatmap, annotated with `labels`
    heat_ax = plt.subplot(layout[0])
    heat_ax = sns.heatmap(np.array([values]), annot=np.array([labels]))
    heat_ax.set_xticklabels(keys)
    heat_ax.set_title(text)

    # top-right cell: the apparel image, without grid or ticks
    image_ax = plt.subplot(layout[1])
    image_ax.grid(False)
    image_ax.set_xticks([])
    image_ax.set_yticks([])

    display_img(url, image_ax, figure)

    plt.show()

In [None]:
def plot_heatmap_image(doc_id, vec1, vec2, url, text, model):
    """Prepare heatmap data for one recommended title and plot it.

    Parameters
    ----------
    doc_id : int
        Row position of the recommended title in the feature matrix used for
        the 'tfidf' / 'idf' labels.
    vec1 : mapping
        word -> count for the query title.
    vec2 : mapping
        word -> count for the recommended title. It is not modified.
    url : str
        Image address of the recommended product.
    text : str
        Title shown above the heatmap.
    model : str
        One of 'bag_of_words', 'tfidf' or 'idf'; selects the cell annotations.

    Raises
    ------
    ValueError
        If ``model`` is not one of the three supported names.
    """
    if model not in ('bag_of_words', 'tfidf', 'idf'):
        # Previously an unknown name surfaced later as an UnboundLocalError on `labels`.
        raise ValueError("model must be 'bag_of_words', 'tfidf' or 'idf', got %r" % (model,))

    # words that occur in both titles
    shared_words = set(vec1.keys()) & set(vec2.keys())

    # keys: every word of the recommended title, in vec2's order
    keys = list(vec2.keys())
    # values: the word's count in the recommended title if the query title
    # also contains it, otherwise 0 (computed without touching the caller's vec2)
    values = [vec2[word] if word in shared_words else 0 for word in keys]

    if model == 'bag_of_words':
        # annotate with the counts themselves
        labels = values
    elif model == 'tfidf':
        # annotate with the entry of the tf-idf matrix for (doc_id, word);
        # reads the module-level tfidf_title_vectorizer / tfidf_title_features,
        # which must exist before this branch is used
        labels = [
            tfidf_title_features[doc_id, tfidf_title_vectorizer.vocabulary_[word]]
            if word in tfidf_title_vectorizer.vocabulary_ else 0
            for word in keys
        ]
    else:
        # 'idf': annotate with the entry of the idf matrix for (doc_id, word)
        labels = [
            idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[word]]
            if word in idf_title_vectorizer.vocabulary_ else 0
            for word in keys
        ]

    plot_heatmap(keys, values, labels, url, text)


In [None]:
# function to get list of words along with frequency
def text_to_vector(text):
    """Return a Counter mapping each word token of ``text`` to its frequency.

    Tokens are maximal runs of word characters, so punctuation and whitespace
    act as separators.
    """
    return Counter(re.findall(r'\w+', text))

In [None]:
def get_result(doc_id, content_a, content_b, url, model):
    """Plot the word-overlap heatmap between a query title and a recommended title.

    ``content_a`` is the query title and ``content_b`` the recommended one;
    both are turned into word-count vectors and handed to
    ``plot_heatmap_image`` together with ``doc_id``, ``url`` and ``model``.
    The recommended title doubles as the plot title.
    """
    query_counts, candidate_counts = text_to_vector(content_a), text_to_vector(content_b)
    plot_heatmap_image(doc_id, query_counts, candidate_counts, url, content_b, model)

## 1. Bag Of Words (BOW)

In [None]:
# Bag-of-words matrix: one row per title, one column per vocabulary word
vectorizer = CountVectorizer()
title_features = vectorizer.fit_transform(data['title'])
title_features.shape

In [None]:
def bow_model(doc_id, num_results):
    """Show the titles closest to title ``doc_id`` in bag-of-words space.

    Parameters
    ----------
    doc_id : int
        Row position of the query title in ``title_features``.
    num_results : int
        How many nearest titles to show.

    For every result a heatmap is plotted via ``get_result`` and the ASIN,
    brand, title and distance to the query are printed.
    """
    # distance between the query row and every row of the bag-of-words matrix
    distances = pairwise_distances(title_features, title_features[doc_id]).flatten()

    # row positions of the smallest distances, and those distances
    # (read through the same ordering instead of sorting a second time)
    indices = np.argsort(distances)[0:num_results]
    pdists = distances[indices]

    # translate row positions into data-frame index labels
    df_indices = list(data.index[indices])

    for i in range(0, len(indices)):
        get_result(indices[i], data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'bag_of_words')

        print('ASIN:', data['asin'].loc[df_indices[i]])
        print('Brand:', data['brand'].loc[df_indices[i]])
        print('Title:', data['title'].loc[df_indices[i]])
        # the value is a distance (smaller = closer), not a similarity
        print('Euclidean distance from the query title :', pdists[i])
        print('*'*60)
        

In [None]:
# Query with the title at row position 12920 of title_features; show 20 results.
bow_model(12920, 20)

## 2. TF-IDF

In [None]:
# TF-IDF matrix: one row per title, one column per vocabulary word.
# min_df is left at its default of 1. An integer 0 selects the same terms
# (every vocabulary word occurs in at least one title) but is rejected by
# scikit-learn's parameter validation in current releases.
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_feature = tf_idf_vectorizer.fit_transform(data['title'])


In [None]:
# plot_heatmap_image's 'tfidf' branch looks up the fitted vectorizer and the
# tf-idf matrix under these module-level names, so expose the objects built in
# the previous cell under them as well.
tfidf_title_vectorizer = tf_idf_vectorizer
tfidf_title_features = tf_idf_feature

def tfidf_model(doc_id, num_results):
    """Show the titles closest to title ``doc_id`` in TF-IDF space.

    Parameters
    ----------
    doc_id : int
        Row position of the query title in ``tf_idf_feature``.
    num_results : int
        How many nearest titles to show.

    For every result a heatmap annotated with tf-idf weights is plotted via
    ``get_result`` and the ASIN, brand, title and distance to the query are
    printed.
    """
    # distance between the query row and every row of the tf-idf matrix
    distances = pairwise_distances(tf_idf_feature, tf_idf_feature[doc_id]).flatten()

    # row positions of the smallest distances, and those distances
    indices = np.argsort(distances)[0:num_results]
    pdists = distances[indices]

    # translate row positions into data-frame index labels
    df_indices = list(data.index[indices])

    for i in range(0, len(indices)):
        # 'tfidf' (not 'bag_of_words') so the cells are annotated with tf-idf weights
        get_result(indices[i], data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'tfidf')

        print('ASIN:', data['asin'].loc[df_indices[i]])
        print('Brand:', data['brand'].loc[df_indices[i]])
        print('Title:', data['title'].loc[df_indices[i]])
        # the value is a distance (smaller = closer), not a similarity
        print('Euclidean distance from the query title :', pdists[i])
        print('*'*60)


In [None]:
# Query with the title at row position 12920 of tf_idf_feature; show 20 results.
tfidf_model(12920,20)

## 3. Average Word2Vec (text-semantic similarity)

In [None]:
# Start from a plain count matrix of the titles; a later cell overwrites its
# non-zero entries with IDF values.
idf_title_vectorizer = CountVectorizer()
idf_title_features = idf_title_vectorizer.fit_transform(data['title'])

def n_containing(word):
    """Return how many titles in ``data['title']`` contain ``word``.

    A title counts when ``word`` equals one of its whitespace-separated tokens.
    """
    matches = 0
    for title in data['title']:
        if word in title.split():
            matches += 1
    return matches

def idf(word):
    """Return the inverse document frequency of ``word``.

    Computed as ``log(number of rows in data / number of titles containing word)``
    using the natural logarithm.
    """
    total_docs = data.shape[0]
    docs_with_word = n_containing(word)
    return math.log(total_docs / docs_with_word)




In [None]:
# Convert the counts to floating point so IDF values can be stored.
# np.float was only an alias of the builtin float and has been removed from
# NumPy; np.float64 is the same dtype.
idf_title_features = idf_title_features.astype(np.float64)

# For every vocabulary word, overwrite its non-zero counts with the word's IDF.
for word, column in idf_title_vectorizer.vocabulary_.items():
    idf_val = idf(word)

    # rows (titles) in which this word has a non-zero entry
    for row in idf_title_features[:, column].nonzero()[0]:
        # idf_title_features[title row, word column] = idf value of the word
        idf_title_features[row, column] = idf_val
        

In [None]:
def get_word_vec(sentence, doc_id, m_name, num_features=300):
    """Return one vector per word of ``sentence`` as a 2-D array.

    Parameters
    ----------
    sentence : str
        The title; it is split on whitespace.
    doc_id : int
        Row position of the title in ``idf_title_features`` (used only when
        ``m_name`` is 'weighted').
    m_name : str
        'avg' uses the word's vector from ``model`` as is; 'weighted'
        multiplies it by the title's IDF entry for that word.
    num_features : int, optional
        Length of the zero vector used for words without a vector
        (default 300).

    Returns
    -------
    numpy.ndarray
        One row per word, in sentence order. Words missing from ``vocab``, or
        (in 'weighted' mode) missing from the IDF vocabulary, get a zero row
        so the rows always line up with ``sentence.split()``.

    Raises
    ------
    ValueError
        If ``m_name`` is neither 'avg' nor 'weighted'.
    """
    if m_name not in ('avg', 'weighted'):
        raise ValueError("m_name must be 'avg' or 'weighted', got %r" % (m_name,))

    vec = []
    for word in sentence.split():
        if word in vocab and m_name == 'avg':
            vec.append(model[word])
        elif word in vocab and word in idf_title_vectorizer.vocabulary_:
            weight = idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[word]]
            vec.append(weight * model[word])
        else:
            # np.zeros (the original had the typo `np,zeros`, which raised as
            # soon as an out-of-vocabulary word was met)
            vec.append(np.zeros(shape=(num_features,)))
    return np.array(vec)

In [None]:
def get_distance(vec1, vec2):
    """Return the matrix of Euclidean distances between two sets of vectors.

    Entry ``[r, c]`` of the result is the norm of ``vec1[r] - vec2[c]``.
    """
    rows = [
        np.array([np.linalg.norm(left - right) for right in vec2])
        for left in vec1
    ]
    return np.array(rows)
        

In [None]:
def heat_map_w2v(sentence1, sentence2, url, doc_id1, doc_id2, model):
    """Plot word-to-word distances between two titles next to the product image.

    Parameters
    ----------
    sentence1 : str
        Title of the input (query) apparel; its words label the y axis.
    sentence2 : str
        Title of the recommended apparel; its words label the x axis and the
        full title is used as the plot title.
    url : str
        Image address of the recommended apparel.
    doc_id1, doc_id2 : int
        Document ids of the input and the recommended apparel, forwarded to
        ``get_word_vec``.
    model : str
        'avg' or 'weighted'; forwarded to ``get_word_vec``.
    """
    # one vector per word of each title
    query_vecs = get_word_vec(sentence1, doc_id1, model)
    candidate_vecs = get_word_vec(sentence2, doc_id2, model)

    # word_dist[r, c] = Euclidean distance between word r of title 1 and word c of title 2
    word_dist = get_distance(query_vecs, candidate_vecs)

    # 2x2 grid; only the two top cells are used (wide heatmap, narrow image)
    layout = gridspec.GridSpec(2, 2, width_ratios=[4,1],height_ratios=[2,1])
    figure = plt.figure(figsize=(15,15))

    # top-left cell: annotated heatmap of the rounded pairwise distances
    heat_ax = plt.subplot(layout[0])
    heat_ax = sns.heatmap(np.round(word_dist,4), annot=True)
    heat_ax.set_xticklabels(sentence2.split())
    heat_ax.set_yticklabels(sentence1.split())
    heat_ax.set_title(sentence2)

    # top-right cell: the apparel image, without grid or ticks
    image_ax = plt.subplot(layout[1])
    image_ax.grid(False)
    image_ax.set_xticks([])
    image_ax.set_yticks([])
    display_img(url, image_ax, figure)

    plt.show()

In [None]:
# Load the word-vector lookup from a local pickle file (presumably a dict of
# word -> vector: it is indexed by word and `.keys()` is taken below).
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only use a 'word2vec_model' file from a trusted source.
with open('word2vec_model', 'rb') as handle:
    model = pickle.load(handle)

In [None]:
# Words that have a vector in `model`; get_word_vec and build_avg_vec test membership against it.
vocab = model.keys()

# Adds up the vectors of the words of a sentence and returns their average
def build_avg_vec(sentence, num_features, doc_id, m_name):
    """Return a single vector of length ``num_features`` for ``sentence``.

    Word vectors are summed and the sum is divided by the total number of
    words in the sentence (words without a vector still count towards that
    total).

    Parameters
    ----------
    sentence : str
        The title; it is split on whitespace.
    num_features : int
        Length of the vector.
    doc_id : int
        Row position of the title in ``idf_title_features`` (read only when
        ``m_name`` is 'weighted').
    m_name : str
        'avg' adds each word's vector from ``model``; 'weighted' adds the
        vector multiplied by the title's IDF entry for that word, and only for
        words present in the IDF vocabulary.
    """
    tokens = sentence.split()

    # running sum, starting from an all-zero float32 vector
    total = np.zeros((num_features,), dtype="float32")

    for token in tokens:
        if token not in vocab:
            continue
        if m_name == 'weighted' and token in idf_title_vectorizer.vocabulary_:
            total = np.add(total, idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[token]] * model[token])
        elif m_name == 'avg':
            total = np.add(total, model[token])

    if len(tokens) > 0:
        total = np.divide(total, len(tokens))
    # 1-D vector of length num_features
    return total

In [None]:
# Average word2vec representation of every title.
# w2v_title has one row per title (in the row order of `data`) and 300 columns.
w2v_title = np.array([
    build_avg_vec(title, 300, row_position, 'avg')
    for row_position, title in enumerate(data['title'])
])


In [None]:
def avg_w2v_model(doc_id, num_results):
    """Show the titles closest to title ``doc_id`` in average-word2vec space.

    ``doc_id`` is a row position in ``w2v_title``; ``num_results`` is the
    number of nearest titles to show. For every result a word-distance heatmap
    is plotted via ``heat_map_w2v`` and the ASIN, brand and distance to the
    query are printed.
    """
    query_vec = w2v_title[doc_id].reshape(1,-1)
    flat_dist = pairwise_distances(w2v_title, query_vec).flatten()

    # row positions of the num_results smallest distances, and those distances
    indices = np.argsort(flat_dist)[0:num_results]
    pdists = np.sort(flat_dist)[0:num_results]

    # data-frame index labels of those rows
    df_indices = list(data.index[indices])

    for rank, label in enumerate(df_indices):
        heat_map_w2v(data['title'].loc[df_indices[0]], data['title'].loc[label], data['medium_image_url'].loc[label], indices[0], indices[rank], 'avg')
        print('ASIN :', data['asin'].loc[label])
        print('BRAND :', data['brand'].loc[label])
        print('euclidean distance from given input image :', pdists[rank])
        print('*'*125)

        
# Query with the title at row position 12920 of w2v_title; show 20 results.
avg_w2v_model(12920, 20)


## 4. IDF weighted Word2Vec

In [None]:
# IDF-weighted word2vec representation of every title.
# w2v_title_weight has one row per title (in the row order of `data`) and 300 columns.
w2v_title_weight = np.array([
    build_avg_vec(title, 300, row_position, 'weighted')
    for row_position, title in enumerate(data['title'])
])

In [None]:
def weighted_w2v_model(doc_id, num_results):
    """Show the titles closest to title ``doc_id`` in IDF-weighted word2vec space.

    ``doc_id`` is a row position in ``w2v_title_weight``; ``num_results`` is
    the number of nearest titles to show. For every result a word-distance
    heatmap is plotted via ``heat_map_w2v`` and the ASIN, brand and distance
    to the query are printed.
    """
    query_vec = w2v_title_weight[doc_id].reshape(1,-1)
    flat_dist = pairwise_distances(w2v_title_weight, query_vec).flatten()

    # row positions of the num_results smallest distances, and those distances
    indices = np.argsort(flat_dist)[0:num_results]
    pdists = np.sort(flat_dist)[0:num_results]

    # data-frame index labels of those rows
    df_indices = list(data.index[indices])

    for rank, label in enumerate(df_indices):
        heat_map_w2v(data['title'].loc[df_indices[0]], data['title'].loc[label], data['medium_image_url'].loc[label], indices[0], indices[rank], 'weighted')
        print('ASIN :', data['asin'].loc[label])
        print('Brand :', data['brand'].loc[label])
        print('euclidean distance from input :', pdists[rank])
        print('='*125)

# Query with the title at row position 12920 of w2v_title_weight; show 20 results.
weighted_w2v_model(12920, 20)