### Imports

In [1]:
import re
import functions
import boto3
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

### Data download

In [2]:
# Disabled cell: downloads the scraping snapshot from the "cheapper" S3 bucket
# and stores it locally as data.csv (the file read in the next section).
# Uncomment to refresh the local copy.
# s3 = boto3.client("s3")
# s3.download_file(
#     Bucket="cheapper", Key="webscraping_results_17-01-2023.csv", Filename="data.csv"
# )

### Read data

In [3]:
# Load the scraped snapshot and discard its 'Unnamed: 0' column.
df = pd.read_csv('data.csv').drop(columns='Unnamed: 0')

In [4]:
# Keep only the yerba products (case-insensitive match on the product name).
# na=False makes rows with a missing name count as "no match"; without it a NaN
# in 'name' produces a NaN in the mask and the boolean indexing raises.
df = df[df['name'].str.contains('yerba', case=False, na=False)].reset_index(drop=True)

In [5]:
# Spot-check: the same product (Canarias 1 kg) as it is listed by five different stores.
df.loc[[69,72,104,176,246]]

Unnamed: 0,date,ecomm_name,name,src,price_1,price_2
69,17-01-2023,elclon,YERBA CANARIAS 1 KG,https://f.fcdn.app/imgs/95fec6/www.elclon.com....,185,
72,17-01-2023,eldorado,YERBA CANARIAS 1KG,https://eldoradouy.vtexassets.com/arquivos/ids...,18800,
104,17-01-2023,devoto,Yerba CANARIAS 1 kg,https://geant.vteximg.com.br/arquivos/ids/2921...,188,
176,17-01-2023,disco,Yerba CANARIAS 1 kg,https://geant.vteximg.com.br/arquivos/ids/2921...,188,
246,17-01-2023,tiendainglesa,Yerba CANARIAS 1 Kg,https://images-ti-vm1.tiendainglesa.com.uy/lar...,190,


### Unsupervised classification

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [7]:
def containsNumber(value):
    """Tell whether ``value`` holds at least one digit character.

    Args:
        value: String (or any iterable of one-character strings) to inspect.

    Returns:
        True as soon as one character satisfies ``str.isdigit``; False
        otherwise, including for an empty string.
    """
    return any(symbol.isdigit() for symbol in value)

In [8]:
def remove_stopwords(texts):
    """Drop stop words from every text in ``texts``.

    The stop list is NLTK's Spanish stop words plus a handful of unit
    abbreviations, punctuation symbols and the word 'clasica'. Words are
    obtained by splitting on whitespace and are compared in lowercase against
    the stop list; the words that are kept retain their original casing.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one cleaned string per input text, words joined by a
        single space.
    """
    extra_tokens = {'gr', 'g', 'grs.', 'grs', 'kg', 'kg.', '&', '+', '.', '%', 'clasica'}
    ignored = set(stopwords.words('spanish')) | extra_tokens

    return [
        ' '.join(token for token in text.split() if token.lower() not in ignored)
        for text in texts
    ]

In [9]:
def remove_stopwords_together(texts):
    """Strip the unit glued to a number, e.g. '500g' -> '500', in every text.

    Each text is split on whitespace. A word that contains a run of digits is
    replaced by its FIRST run of digits (so '1kg' becomes '1', and '1.5kg' also
    becomes '1'); every other word is kept unchanged.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one cleaned string per input text, words joined by a
        single space.
    """
    cleaned_names = []
    for name in texts:
        tokens = []
        for token in name.split():
            # Decide with the same regex that extracts the number. Testing with
            # str.isdigit() first is not equivalent: it accepts characters such
            # as a superscript two that r'\d' does not match, which made the
            # extraction fail with an IndexError.
            digits = re.search(r'\d+', token)
            tokens.append(digits.group(0) if digits else token)
        cleaned_names.append(' '.join(tokens))
    return cleaned_names

In [10]:
def remove_yerba_mate(texts):
    """Drop the word 'mate' from a description that mentions 'yerba mate'.

    When both 'yerba' and 'mate' appear as whole words, every whole-word
    'mate' is removed and the remaining words are re-joined with single
    spaces. Otherwise the text is returned untouched.

    Working on whole words (instead of replacing the substring 'mate ') also
    removes a 'mate' that is the last word of the text and leaves words that
    merely end in 'mate', such as 'tomate', intact.

    Args:
        texts: A single description string.

    Returns:
        The description without the word 'mate', or the original string when
        the two words are not both present.
    """
    tokens = texts.split()
    if 'yerba' in tokens and 'mate' in tokens:
        return ' '.join(token for token in tokens if token != 'mate')
    return texts

In [11]:
# Normalise the product names so that the same product, written differently by
# each store, ends up as the same string.
#
# Every substitution below is a literal one, so regex=False is passed
# explicitly: the default of Series.str.replace differs between pandas
# versions, and '(' / ')' are not valid patterns when read as regular
# expressions.
LITERAL_REPLACEMENTS = [
    ('c/', ''),   # get rid of the "c/"
    ('(', ''),
    (')', ''),
    ('á', 'a'),   # strip accents from vowels
    ('é', 'e'),
    ('í', 'i'),
    ('ó', 'o'),
    ('ú', 'u'),
    ('/', ' '),   # any remaining slash becomes a word separator
]

names = df['name'].astype(str).str.lower()  # everything to lowercase strings
for old, new in LITERAL_REPLACEMENTS:
    names = names.str.replace(old, new, regex=False)

names = remove_stopwords(names.values)      # stop words and unit abbreviations
names = remove_stopwords_together(names)    # clean "500g"-like descriptions
df['name'] = [remove_yerba_mate(name) for name in names]  # "yerba mate" -> "yerba"

  df['name'] = df['name'].str.replace('(','')
  df['name'] = df['name'].str.replace(')','')


In [12]:
# Same five rows as in the earlier spot-check, now after cleaning.
df.loc[[69,72,104,176,246]]

Unnamed: 0,date,ecomm_name,name,src,price_1,price_2
69,17-01-2023,elclon,yerba canarias 1,https://f.fcdn.app/imgs/95fec6/www.elclon.com....,185,
72,17-01-2023,eldorado,yerba canarias 1,https://eldoradouy.vtexassets.com/arquivos/ids...,18800,
104,17-01-2023,devoto,yerba canarias 1,https://geant.vteximg.com.br/arquivos/ids/2921...,188,
176,17-01-2023,disco,yerba canarias 1,https://geant.vteximg.com.br/arquivos/ids/2921...,188,
246,17-01-2023,tiendainglesa,yerba canarias 1,https://images-ti-vm1.tiendainglesa.com.uy/lar...,190,


### Text classification

In [13]:
def tokenize_text(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``.

    Args:
        text: String to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``.
    """
    return word_tokenize(text)

def create_bow(texts):
    """Fit a default ``CountVectorizer`` on ``texts`` and vectorise them.

    Args:
        texts: Iterable of raw text documents.

    Returns:
        A ``(bow, vectorizer)`` tuple: the result of ``fit_transform`` and the
        fitted vectorizer that produced it.
    """
    fitted = CountVectorizer()
    return fitted.fit_transform(texts), fitted

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# A custom whitespace tokenizer is required here: CountVectorizer's default
# token pattern only keeps tokens of two or more characters, which would drop
# the single-digit sizes (e.g. the "1" of "1 kg") from the vocabulary.
cv = CountVectorizer(tokenizer=str.split)
bow = cv.fit_transform(df['name'])

# Dense count matrix with one column per vocabulary term.
count_array = bow.toarray()
df_features = pd.DataFrame(count_array, columns=cv.get_feature_names_out())



In [15]:
# Show one cleaned name followed by the vocabulary terms whose count is
# exactly 1 in its bag-of-words row.
print('Item:', df.loc[69, 'name'], end='\n\n')
print('Unos en el vector:')
df_features.loc[69].loc[lambda counts: counts == 1]

Item: yerba canarias 1

Unos en el vector:


1           1
canarias    1
yerba       1
Name: 69, dtype: int64

In [16]:
# This is the final dataset with the names and vector representation:
# the cleaned product names become the row index of the bag-of-words table.
# NOTE(review): names are not unique (several stores list the same product), so
# this index contains duplicates and a label lookup can return several rows.
# NOTE(review): the cell rebinds df_features, so it is not idempotent.
df_features = df_features.set_index(df['name']) 

In [17]:
def get_similar_items(text, max_distance=1):
    """Return the rows of ``df`` whose bag-of-words vector is close to ``text``.

    The query is cleaned with ``functions.clean_description``, vectorised with
    the fitted ``cv`` and compared against every row of ``df_features`` using
    the Euclidean distance.

    The distance is computed once per row. Looking rows up by name instead
    breaks when a name is duplicated in the index: the lookup returns several
    rows at once and the norm of that whole block is no longer the distance
    to a single product.

    Args:
        text: Free-text product description.
        max_distance: Largest Euclidean distance (inclusive) for a product to
            count as similar. Defaults to 1.

    Returns:
        The subset of ``df`` whose 'name' belongs to a matching row.
    """
    # NOTE(review): cv.transform expects an iterable of documents, so
    # clean_description presumably returns a list-like rather than a bare
    # string; confirm against the functions module.
    cleaned = functions.clean_description(text)
    query = cv.transform(cleaned).toarray()[0]

    # One distance per product row (axis=1), regardless of duplicated names.
    distances = np.linalg.norm(df_features.to_numpy() - query, axis=1)
    matching_names = df_features.index[distances <= max_distance]

    return df[df['name'].isin(matching_names)]

### Text similarity

In [18]:
# Example query: look up the listings similar to a free-text product description.
text = 'yerba canarias serena 1kg'

similar_items = get_similar_items(text)

In [19]:
similar_items

Unnamed: 0,date,ecomm_name,name,src,price_1,price_2
57,17-01-2023,elclon,yerba canarias 1 serena,https://f.fcdn.app/imgs/90d00b/www.elclon.com....,205,
73,17-01-2023,eldorado,yerba canarias serena 1,https://eldoradouy.vtexassets.com/arquivos/ids...,19990,
105,17-01-2023,devoto,yerba canarias serena 1,https://geant.vteximg.com.br/arquivos/ids/2068...,208,
175,17-01-2023,disco,yerba canarias serena 1,https://geant.vteximg.com.br/arquivos/ids/2068...,202,
244,17-01-2023,tiendainglesa,yerba serena canarias 1,https://images-ti-vm1.tiendainglesa.com.uy/lar...,213,


### Image similarity

In [39]:
# Model loading
from detecto.core import Model
from detecto import utils

# Class names passed to the detector together with the saved weights.
# NOTE(review): presumably these must match the labels used when the weights
# were trained; confirm before editing the list.
labels = [
    'yerba canarias 1 kg',
    'yerba armiño 1 kg',
    'yerba canarias serena 1 kg',
    'yerba armiño suave 1 kg',
    'yerba armiño compuesta 1 kg',
]

model = Model.load('model_weights.pth', labels)



In [84]:
def get_image_prediction(image):
    """Return the label the detector scores highest for an image.

    The image is read with ``utils.read_image`` and passed to
    ``model.predict_top``; the resulting labels are ranked by score and the
    label of the best-scoring entry is returned.

    Args:
        image: Path of the image file to classify.

    Returns:
        The label with the highest score.
    """
    picture = utils.read_image(image)

    # The bounding boxes are part of the prediction but are not needed here.
    labels, boxes, scores = model.predict_top(picture)

    ranking = (
        pd.DataFrame(scores, index=[labels], columns=['Probability'])
        .reset_index()
        .sort_values('Probability')
        .reset_index()
    )

    # Ascending sort: the last row is the best one; position 1 holds its label.
    return ranking.values[-1][1]

In [85]:
# Run the detector on a sample photo and keep the predicted label.
item = get_image_prediction('test 2.jpg')

In [86]:
item

'yerba canarias serena 1 kg'

In [88]:
## Pass it through the text similarity
# Use `item`, the label returned by get_image_prediction in the cells above.
# This cell used to read `df_temp`, which only exists inside
# get_image_prediction and is therefore undefined here on a fresh kernel.
similar_items = get_similar_items(item)

In [89]:
similar_items

Unnamed: 0,date,ecomm_name,name,src,price_1,price_2
57,17-01-2023,elclon,yerba canarias 1 serena,https://f.fcdn.app/imgs/90d00b/www.elclon.com....,205,
73,17-01-2023,eldorado,yerba canarias serena 1,https://eldoradouy.vtexassets.com/arquivos/ids...,19990,
105,17-01-2023,devoto,yerba canarias serena 1,https://geant.vteximg.com.br/arquivos/ids/2068...,208,
175,17-01-2023,disco,yerba canarias serena 1,https://geant.vteximg.com.br/arquivos/ids/2068...,202,
244,17-01-2023,tiendainglesa,yerba serena canarias 1,https://images-ti-vm1.tiendainglesa.com.uy/lar...,213,
