In [48]:
import pandas as pd
import re
import nltk
import io
import os
from nltk.stem import WordNetLemmatizer
from nltk import stem
from nltk import sent_tokenize, word_tokenize
from stop_words import get_stop_words
from google.cloud import vision
from google.cloud.vision import types
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity


In [19]:
df=pd.read_csv('./zara_dresses.csv')
df_trousers=pd.read_csv('./zara_trousers.csv')
df_tshirts=pd.read_csv('./zara_tshirts.csv')
df_shirts=pd.read_csv('./zara_shirts.csv')

#### Creamos un solo df que contenga toda la info obtenida tras escrapear la web de Zara

In [20]:
df=df.append(df_trousers).append(df_tshirts).append(df_shirts)

In [21]:
df.drop_duplicates(inplace=True)

In [22]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,POLKA DOT TULLE DRESS Details,Semi-sheer collared dress with long sleeves an...,Ecru,4437/069,39.99,[],static.zara.net/photos///2019/V/0/1/p/4437/069...
1,LACE DRESS Details,Short sleeve dress with a shirt collar. Featur...,White,1639/056,39.99,"['XS', 'S', 'M']",static.zara.net/photos///2019/V/0/1/p/1639/056...
2,LONG POLKA DOT DRESS Details,Collared dress with long sleeves. Featuring an...,Off-white,3440/055,29.99,"['XS', 'S', 'M', 'L', 'XL', 'XXL']",static.zara.net/photos///2019/V/0/1/p/3440/055...
3,DRESS WITH MATCHING EMBROIDERY Details,V-neck dress with sleeves falling below the el...,White,1821/021,25.99,"['XS', 'S', 'M', 'L']",static.zara.net/photos///2019/V/0/1/p/1821/021...
4,EMBROIDERED LACE DRESS Details,Round neck dress with sleeves reaching below t...,Ecru,5770/022,29.99,"['XS', 'S', 'M', 'L']",static.zara.net/photos///2019/V/0/1/p/5770/022...


In [23]:
df.shape

(1954, 7)

In [24]:
df.isnull().sum()

0    1
1    1
2    1
3    1
4    1
5    1
6    1
dtype: int64

In [25]:
df.dropna(inplace=True)

In [30]:
#remove "details" word from column 0. Do not give us any information
df["0"]=df["0"].str.replace("Details","")

In [31]:
# create a function to remove words 

def remove_words(word,sentence):
    return sentence.split(word)[0]

In [32]:
# remove the heigh of the model from the description. Do not give us any information

df["1"]=df["1"].apply(lambda x:remove_words('MODEL HEIGHT',x))
df["1"]=df["1"].apply(lambda x:remove_words('HEIGHT OF MODEL',x))


In [33]:
#rename columns
df.rename(columns={'0':'item','1':'description','2':'color','3':'reference','4':'price','5':'available_sizes','6':'image'},inplace=True)

In [34]:
df.head()

Unnamed: 0,item,description,color,reference,price,available_sizes,image
0,POLKA DOT TULLE DRESS,Semi-sheer collared dress with long sleeves an...,Ecru,4437/069,39.99,[],static.zara.net/photos///2019/V/0/1/p/4437/069...
1,LACE DRESS,Short sleeve dress with a shirt collar. Featur...,White,1639/056,39.99,"['XS', 'S', 'M']",static.zara.net/photos///2019/V/0/1/p/1639/056...
2,LONG POLKA DOT DRESS,Collared dress with long sleeves. Featuring an...,Off-white,3440/055,29.99,"['XS', 'S', 'M', 'L', 'XL', 'XXL']",static.zara.net/photos///2019/V/0/1/p/3440/055...
3,DRESS WITH MATCHING EMBROIDERY,V-neck dress with sleeves falling below the el...,White,1821/021,25.99,"['XS', 'S', 'M', 'L']",static.zara.net/photos///2019/V/0/1/p/1821/021...
4,EMBROIDERED LACE DRESS,Round neck dress with sleeves reaching below t...,Ecru,5770/022,29.99,"['XS', 'S', 'M', 'L']",static.zara.net/photos///2019/V/0/1/p/5770/022...


In [35]:
df.head()

Unnamed: 0,item,description,color,reference,price,available_sizes,image
0,POLKA DOT TULLE DRESS,Semi-sheer collared dress with long sleeves an...,Ecru,4437/069,39.99,[],static.zara.net/photos///2019/V/0/1/p/4437/069...
1,LACE DRESS,Short sleeve dress with a shirt collar. Featur...,White,1639/056,39.99,"['XS', 'S', 'M']",static.zara.net/photos///2019/V/0/1/p/1639/056...
2,LONG POLKA DOT DRESS,Collared dress with long sleeves. Featuring an...,Off-white,3440/055,29.99,"['XS', 'S', 'M', 'L', 'XL', 'XXL']",static.zara.net/photos///2019/V/0/1/p/3440/055...
3,DRESS WITH MATCHING EMBROIDERY,V-neck dress with sleeves falling below the el...,White,1821/021,25.99,"['XS', 'S', 'M', 'L']",static.zara.net/photos///2019/V/0/1/p/1821/021...
4,EMBROIDERED LACE DRESS,Round neck dress with sleeves reaching below t...,Ecru,5770/022,29.99,"['XS', 'S', 'M', 'L']",static.zara.net/photos///2019/V/0/1/p/5770/022...


In [36]:
#create column with all words from descritption, item and color

df['text_processed']=df.item+df.description+df.color

In [37]:
#NLP

nltk.download('wordnet')
nltk.download('stem')

def clean_up(s):
    lines = s.lower().split(" ")
    lines = [re.sub(r'\d', ' ', line) for line in lines]
    lines = [re.sub(r'\W', ' ', line) for line in lines]
    
    return " ".join(lines)

def tokenize(s):
    return word_tokenize(s)

def stem_and_lemmatize(l):
    lemmatizer = WordNetLemmatizer()
    stemmer = stem.PorterStemmer()
    stem_phrase=[stemmer.stem(x) for x in l]
    stem_and_lemmatize_phrase = [lemmatizer.lemmatize(x)  for x in stem_phrase] 
    return stem_and_lemmatize_phrase

def remove_stopwords(w):
    stop_words = get_stop_words('en')
    [w.remove(i) for i in w if i in stop_words]
    return w

def clean_f(x): 
    functions = [clean_up, tokenize, stem_and_lemmatize, remove_stopwords]
    for f in functions: 
        x = f(x)
    return x

df['text_processed']=df['text_processed'].apply(clean_f)


[nltk_data] Downloading package wordnet to /Users/almu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Error loading stem: Package 'stem' not found in index


In [38]:
df['text_processed']=df['text_processed'].apply(', '.join)

In [39]:
df.head()

Unnamed: 0,item,description,color,reference,price,available_sizes,image,text_processed
0,POLKA DOT TULLE DRESS,Semi-sheer collared dress with long sleeves an...,Ecru,4437/069,39.99,[],static.zara.net/photos///2019/V/0/1/p/4437/069...,"polka, dot, tull, dress, semi, sheer, collar, ..."
1,LACE DRESS,Short sleeve dress with a shirt collar. Featur...,White,1639/056,39.99,"['XS', 'S', 'M']",static.zara.net/photos///2019/V/0/1/p/1639/056...,"lace, dress, short, sleev, dress, a, shirt, co..."
2,LONG POLKA DOT DRESS,Collared dress with long sleeves. Featuring an...,Off-white,3440/055,29.99,"['XS', 'S', 'M', 'L', 'XL', 'XXL']",static.zara.net/photos///2019/V/0/1/p/3440/055...,"long, polka, dot, dress, collar, dress, long, ..."
3,DRESS WITH MATCHING EMBROIDERY,V-neck dress with sleeves falling below the el...,White,1821/021,25.99,"['XS', 'S', 'M', 'L']",static.zara.net/photos///2019/V/0/1/p/1821/021...,"dress, match, embroideri, v, neck, dress, slee..."
4,EMBROIDERED LACE DRESS,Round neck dress with sleeves reaching below t...,Ecru,5770/022,29.99,"['XS', 'S', 'M', 'L']",static.zara.net/photos///2019/V/0/1/p/5770/022...,"embroid, lace, dress, round, neck, dress, slee..."


In [42]:
# escrapeamos de la api de Vision Cloud
# Probamos con la siguiente imagen:

url='https://cdn-images.farfetch-contents.com/13/21/69/02/13216902_16131498_480.jpg'

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="./healthy-feat-234911-0b9642033688.json"
tags=[]
client = vision.ImageAnnotatorClient()
image = vision.types.Image()
image.source.image_uri = url
# Call client.label_detection and web_detection with as (image=image) argument.
response = client.label_detection(image=image)
response2 = client.web_detection(image=image)
# Store the response in a variable response
labels = response.label_annotations
webs = response2.web_detection
        
for web in webs.web_entities:
    tags.append(web.description)     
            
for label in labels:
    tags.append(label.description)

print(tags)       

['T-shirt', 'Dress', 'Sequin', 'Cocktail dress', 'Halterneck', 'Shirt', 'Sleeveless shirt', 'Cami Mini Dress', 'Mesh Mini Dress', 'Miniskirt', 'Clothing', 'Dress', 'Day dress', 'Violet', 'Turquoise', 'Purple', 'Sleeveless shirt', 'Cocktail dress', 'Aqua', 'Pink']


In [43]:
new_descr=' '.join(clean_f(' '.join(tags)))

In [44]:
# la info de la nueva imagen la incluimos en el df de Zara

combine=df.append({'reference':'item_Farfetch','price':'-','available_sizes':'-','image':'-','item':'-','description':'-','color':'-','text_processed':new_descr}, ignore_index=True)

In [46]:
combine.tail()

Unnamed: 0,item,description,color,reference,price,available_sizes,image,text_processed
1949,SHIRT WITH CUTWORK EMBROIDERY,Short sleeve shirt with a crossover V-neckline...,Brick,7200/006,25.99,"['XS', 'S', 'M', 'L', 'XL', 'XXL']",static.zara.net/photos///2019/V/0/1/p/7200/006...,"shirt, cutwork, embroideri, short, sleev, shir..."
1950,STRIPED TOP WITH STRAPS,Top with a straight-cut neckline and thin stra...,Sky blue,2391/023,25.99,"['XS', 'S', 'M', 'L', 'XL', 'XXL']",static.zara.net/photos///2019/V/0/1/p/2391/023...,"stripe, top, strap, top, straight, cut, neckli..."
1951,OVERSIZED SHIRT,Oversized shirt with a regular collar and long...,Pistachio,6929/041,25.99,"['XS', 'S', 'M', 'L', 'XL']",static.zara.net/photos///2019/V/0/1/p/6929/041...,"shirt, shirt, regular, collar, long, sleev, fe..."
1952,CROPPED SHIRT WITH KNOT,Loose-fitting cropped shirt with a regular col...,Black,6929/042,25.99,"['XS', 'S', 'M', 'L', 'XL']",static.zara.net/photos///2019/V/0/1/p/6929/042...,"crop, shirt, knot, loo, fit, crop, shirt, regu..."
1953,-,-,-,item_Farfetch,-,-,-,t shirt dress sequin cocktail dress halterneck...


In [47]:
#Recomendador

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

tfidf_matrix = tf.fit_transform(combine['text_processed'])
cosine_similarities = cosine_similarity(tfidf_matrix,tfidf_matrix)

results = {} # dictionary created to store the result in a dictionary format (ID : (Score,item_id))

for idx, row in combine.iterrows(): #iterates through all the rows
    # the below code 'similar_indice' stores similar ids based on cosine similarity. sorts them in ascending order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and 1 means perfect similarity
    similar_indices = cosine_similarities[idx].argsort()[:-5:-1] #stores 5 most similar books, you can change it as per your needs
    similar_items = [(cosine_similarities[idx][i], combine['reference'][i]) for i in similar_indices]
    results[row['reference']] = similar_items[1:]
    
#below code 'function item(id)' returns a row matching the id along with Book Title. Initially it is a dataframe, then we convert it to a list
def price(id):
    return combine.loc[combine['reference'] == id]['price'].tolist()[0]
def image(id):
    return combine.loc[combine['reference'] == id]['image'].tolist()[0]
def ref(id):
    return combine.loc[combine['reference'] == id]['reference'].tolist()[0]
def recommend(id, num):
    if (num == 0):
        print("Hello! Unable to recommend any item as you have not chosen the number of dresses to be recommended")
    elif (num==1):
        print("Hello! Recommending " + str(num) + " item ")
        
    else :
        print("Hello! Recommending " + str(num) + " items")
        
    print("----------------------------------------------------------")
    recs = results[id][:num]
    for rec in recs:
        print("You may like this item: " + image(rec[1]) + " " + str(price(rec[1])) + "$ " + ref(rec[1]))
    print ("We hope you love them!")
    print ("You can buy them online at www.zara.com or at any of our stores")

#the first argument in the below function to be passed is the id of the book, second argument is the number of books you want to be recommended
recommend('item_Farfetch',3)

Hello! Recommending 3 items
----------------------------------------------------------
You may like this item: static.zara.net/photos///2019/V/0/1/p/5507/005/808/2/w/560/5507005808_1_1_1.jpg?ts=1552911390646 29.99$ 5507/005
You may like this item: static.zara.net/photos///2019/V/0/1/p/6771/008/808/2/w/560/6771008808_1_1_1.jpg?ts=1550082476189 29.99$ 6771/008
You may like this item: static.zara.net/photos///2019/V/0/1/p/6771/008/808/2/w/560/6771008808_1_1_1.jpg?ts=1550082476189 29.99$ 6771/008
We hope you love them!
You can buy them online at www.zara.com or at any of our stores


In [49]:
#Probamos con Euclidean distances pero los resultados son peores

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

tfidf_matrix = tf.fit_transform(combine['text_processed'])
euclidean_distance = euclidean_distances(tfidf_matrix,tfidf_matrix)

results = {} # dictionary created to store the result in a dictionary format (ID : (Score,item_id))

for idx, row in combine.iterrows(): #iterates through all the rows
    # the below code 'similar_indice' stores similar ids based on euclidean_distance. sorts them in ascending order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and 1 means perfect similarity
    similar_indices = euclidean_distance[idx].argsort()[:-5:-1] #stores 5 most similar books, you can change it as per your needs
    similar_items = [(euclidean_distance[idx][i], combine['reference'][i]) for i in similar_indices]
    results[row['reference']] = similar_items[1:]
    
#below code 'function item(id)' returns a row matching the id along with Book Title. Initially it is a dataframe, then we convert it to a list
def price(id):
    return combine.loc[combine['reference'] == id]['price'].tolist()[0]
def image(id):
    return combine.loc[combine['reference'] == id]['image'].tolist()[0]
def recommend(id, num):
    if (num == 0):
        print("Unable to recommend any item as you have not chosen the number of dresses to be recommended")
    elif (num==1):
        print("Recommending " + str(num) + " item ")
        
    else :
        print("Recommending " + str(num) + " items")
        
    print("----------------------------------------------------------")
    recs = results[id][:num]
    for rec in recs:
        print("You may like this item: " + image(rec[1]) + " " + str(price(rec[1])) + "$ (score:" + str(rec[0]) + ")")
        print ("We hope you love them!")
        print ("You can buy them online at www.zara.com or at any of our stores")

#the first argument in the below function to be passed is the id of the book, second argument is the number of books you want to be recommended
recommend('item_Farfetch',3)



Recommending 3 items
----------------------------------------------------------
You may like this item: static.zara.net/photos///2019/V/0/1/p/8372/037/510/5/w/560/8372037510_1_1_1.jpg?ts=1549271002525 25.99$ (score:1.414213562373096)
We hope you love them!
You can buy them online at www.zara.com or at any of our stores
You may like this item: static.zara.net/photos///2019/V/0/1/p/8372/037/510/5/w/560/8372037510_1_1_1.jpg?ts=1549271002525 25.99$ (score:1.414213562373096)
We hope you love them!
You can buy them online at www.zara.com or at any of our stores
You may like this item: static.zara.net/photos///2019/V/0/1/p/8372/037/510/5/w/560/8372037510_1_1_1.jpg?ts=1549271002525 25.99$ (score:1.414213562373096)
We hope you love them!
You can buy them online at www.zara.com or at any of our stores
