### Imports

In [36]:
import re
import boto3
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

### Data download

In [13]:
# Fetch the web-scraping results CSV from S3 and save it locally as data.csv.
s3 = boto3.client("s3")
s3.download_file(
    Bucket="cheapper",
    Key="webscraping_results_17-01-2023.csv",
    Filename="data.csv",
)

### Read data

In [2]:
# Load the downloaded CSV and discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- TODO confirm).
df = pd.read_csv('data.csv').drop(columns=['Unnamed: 0'])

### What do we want to do?

The items below are all the same product, but their names differ slightly depending on the source we got them from.

The idea is to create a model that matches products we believe are the same.

In [3]:
# Example 1 of the same product: the listing scraped from 'eldorado'.
df.loc[261]

date                                                 17-01-2023
ecomm_name                                             eldorado
name                                        YERBA CANARIAS 1KG 
src           https://eldoradouy.vtexassets.com/arquivos/ids...
price_1                                                  188,00
price_2                                                     NaN
Name: 261, dtype: object

In [4]:
# Example 2 of the same product: the listing scraped from 'elclon'.
df.loc[69]

date                                                 17-01-2023
ecomm_name                                               elclon
name                                        YERBA CANARIAS 1 KG
src           https://f.fcdn.app/imgs/95fec6/www.elclon.com....
price_1                                                     185
price_2                                                     NaN
Name: 69, dtype: object

In [5]:
# Example 3 of the same product: the listing scraped from 'devoto'.
df.loc[423]

date                                                 17-01-2023
ecomm_name                                               devoto
name                                        Yerba CANARIAS 1 kg
src           https://geant.vteximg.com.br/arquivos/ids/2921...
price_1                                                     188
price_2                                                     NaN
Name: 423, dtype: object

### Unsupervised classification

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the NLTK stop-word corpus is available locally; remove_stopwords
# reads the Spanish word list from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emanu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
def containsNumber(value):
    """Return True if at least one character of ``value`` is a digit.

    An empty ``value`` yields False.
    """
    return any(symbol.isdigit() for symbol in value)

In [8]:
def remove_stopwords(texts):
    """Drop Spanish stop words and weight-unit tokens from every text.

    Each text is split on whitespace; a token is removed when its lowercase
    form is in NLTK's Spanish stop-word list or is one of the unit
    abbreviations listed below. The surviving tokens are re-joined with
    single spaces.

    Parameters
    ----------
    texts : iterable of str
        Texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input text, in the same order.
    """
    unit_tokens = {'gr', 'g', 'grs.', 'grs', 'kg', 'kg.'}
    stop_words = set(stopwords.words('spanish')) | unit_tokens

    return [
        ' '.join(token for token in text.split() if token.lower() not in stop_words)
        for text in texts
    ]

In [9]:
def remove_stopwords_together(texts):
    """Reduce tokens that contain a number to that number (``"500g"`` -> ``"500"``).

    Each text is split on whitespace. A token containing a run of decimal
    digits is replaced by the first such run; every other token is kept
    unchanged. Tokens are then re-joined with single spaces.

    Parameters
    ----------
    texts : iterable of str
        Texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input text, in the same order.
    """
    digit_run = re.compile(r'\d+')

    new_names = []
    for names in texts:
        new_name = []
        for name in names.split():
            # Decide with the same regex that extracts the value. Checking with
            # str.isdigit() first is not equivalent: it accepts characters such
            # as '²' that \d does not match, which used to raise IndexError on
            # tokens like "m²".
            match = digit_run.search(name)
            new_name.append(match.group() if match else name)
        new_names.append(' '.join(new_name))
    return new_names

In [10]:
def tokenize_text(text):
    """Return the tokens NLTK's ``word_tokenize`` produces for ``text``."""
    return word_tokenize(text)

In [11]:
def create_bow(texts):
    """Build a bag-of-words matrix for ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize.

    Returns
    -------
    tuple
        ``(bow, vectorizer)``: the document-term count matrix returned by
        ``CountVectorizer.fit_transform`` and the fitted vectorizer.
    """
    # Imported here because the notebook only imports CountVectorizer in a
    # later cell; without this, calling the helper before that cell runs
    # raises NameError.
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer()
    bow = vectorizer.fit_transform(texts)
    return bow, vectorizer

In [12]:
# Normalise the raw product names in one pass:
# cast to str, lowercase everything, and get rid of the "c/".
df['name'] = (
    df['name']
    .astype(str)
    .str.lower()
    .str.replace('c/', '')
)

# Drop stop words / unit tokens, then clean "500g" like descriptions.
df['name'] = remove_stopwords(df['name'].values)
df['name'] = remove_stopwords_together(df['name'])

In [47]:
# method 1
from sklearn.feature_extraction.text import CountVectorizer

# Use the create_bow helper rather than repeating its two lines here;
# `cv` is the fitted vectorizer and `bow` the document-term count matrix.
bow, cv = create_bow(df['name'])

print(bow[0].toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [46]:
# method 2

# TODO: sort the results so the most relevant matches are shown first.
def classify_texts(texts, n_clusters, random_state=0):
    """Cluster texts with k-means on their TF-IDF vectors.

    Parameters
    ----------
    texts : iterable of str
        Documents to cluster.
    n_clusters : int
        Number of clusters to form.
    random_state : int or None, default 0
        Seed passed to ``KMeans`` so that cluster labels are reproducible
        across runs. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        The fitted model's ``labels_``: one cluster label per input text.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)
    # n_init is pinned because the library default changes between
    # scikit-learn versions; 10 is the long-standing default.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    kmeans.fit(X)
    return kmeans.labels_

In [38]:
# Group the cleaned product names into 500 clusters.
clusters = classify_texts(texts=df['name'], n_clusters=500)

In [39]:
# Store each row's cluster label alongside its product name.
df['cluster'] = clusters

In [40]:
# Re-inspect the 'devoto' example row to see which cluster it was assigned to.
df.loc[423]

date                                                 17-01-2023
ecomm_name                                               devoto
name                                           yerba canarias 1
src           https://geant.vteximg.com.br/arquivos/ids/2921...
price_1                                                     188
price_2                                                     NaN
cluster                                                      77
Name: 423, dtype: object

In [41]:
# Show every product that landed in the same cluster as the example row 423.
# The label is looked up rather than hard-coded because k-means label numbers
# are arbitrary and can differ from one run to the next.
df[df['cluster'] == df.loc[423, 'cluster']]

Unnamed: 0,date,ecomm_name,name,src,price_1,price_2,cluster
69,17-01-2023,elclon,yerba canarias 1,https://f.fcdn.app/imgs/95fec6/www.elclon.com....,185,,77
261,17-01-2023,eldorado,yerba canarias 1,https://eldoradouy.vtexassets.com/arquivos/ids...,18800,,77
423,17-01-2023,devoto,yerba canarias 1,https://geant.vteximg.com.br/arquivos/ids/2921...,188,,77
452,17-01-2023,devoto,yerba canarias 500,https://geant.vteximg.com.br/arquivos/ids/2921...,101,,77
465,17-01-2023,devoto,pack x 2 yerba canarias 70,https://geant.vteximg.com.br/arquivos/ids/3278...,25,,77
476,17-01-2023,devoto,yerba canarias 250,https://geant.vteximg.com.br/arquivos/ids/2921...,55,,77
484,17-01-2023,devoto,yerba canarias hierbas 1,https://geant.vteximg.com.br/arquivos/ids/2689...,195,,77
485,17-01-2023,devoto,yerba canarias 5,https://geant.vteximg.com.br/arquivos/ids/2921...,925,,77
843,17-01-2023,disco,yerba canarias 1,https://geant.vteximg.com.br/arquivos/ids/2921...,188,,77
858,17-01-2023,disco,yerba canarias 500,https://geant.vteximg.com.br/arquivos/ids/2921...,101,,77
