In [1]:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

#### Ler tabela do Google BigQuery

In [2]:
def create_connection(key_path="../usp-mba-dsa-tcc-4277103d9155.json"):
    """Build an authenticated BigQuery client from a service-account key file.

    Args:
        key_path: Path to the service-account JSON key file. The default is
            the path this notebook originally hard-coded, so a bare
            ``create_connection()`` call behaves exactly as before.

    Returns:
        A ``bigquery.Client`` created with those credentials and with the
        credentials' ``project_id`` as its project.
    """
    # Request only the BigQuery scope for these credentials.
    credentials = service_account.Credentials.from_service_account_file(
        key_path,
        scopes=["https://www.googleapis.com/auth/bigquery"],
    )

    return bigquery.Client(
        credentials=credentials,
        project=credentials.project_id,
    )

# Open the BigQuery connection once; the query cell below reuses `client`.
client = create_connection()

In [3]:
# SQL text: distinct (offer_id, lower-cased Descricao) pairs from the offers view.
QUERY = """
    SELECT DISTINCT offer_id, LOWER(Descricao) Descricao
    FROM `usp-mba-dsa-tcc.ecommerce_offers.vw_dim_offers`
"""

# Submit the query (this is the API request) ...
query_job = client.query(QUERY)
# ... and block until it has finished.
rows = query_job.result()

# Materialise the result rows as a pandas DataFrame.
offers = rows.to_dataframe()

In [4]:
# Preview the first two rows of the query result.
offers.head(2)

Unnamed: 0,offer_id,Descricao
0,b'\xf3\x91\xf2\xcb\xa2RP\xe5\x81\xf0\xd2\xd7\x...,ração seca premier pet ambientes internos salm...
1,b'\xce\xcd<\x90.\xdf\r\xb3lc\x1a\xaf\xb2\x7f)\...,ração royal canin exigent para gatos adultos c...


In [5]:
from nltk.tokenize import word_tokenize

In [6]:
# Tokenise every description. word_tokenize defaults to the English Punkt
# model, so the Portuguese one is requested explicitly for this text.
# Note: the "Sabores" column holds the full token list of each description.
offers["Sabores"] = offers["Descricao"].apply(word_tokenize, language="portuguese")

In [7]:
# Spot-check the tokeniser on the description at row label 1, using the
# Portuguese Punkt model (word_tokenize defaults to English).
word_tokenize(offers["Descricao"][1], language="portuguese")

['ração',
 'royal',
 'canin',
 'exigent',
 'para',
 'gatos',
 'adultos',
 'com',
 'paladar',
 'exigente']

In [8]:
import spacy
# Load the spaCy pipeline "pt_core_news_lg"; later cells call it through `nlp`.
nlp = spacy.load("pt_core_news_lg")

#### Similarity

In [9]:
# Score each token of the offer at row label 0 against the reference word
# "carne". The reference Doc does not change between iterations, so it is
# parsed once here instead of once per token.
carne_doc = nlp("carne")
for word in offers["Sabores"][0]:
    sim_score = nlp(word).similarity(carne_doc)
    print(word, sim_score)

ração 0.5075243337344716
royal -0.02736204134551323
canin 0.0
exigent 0.0
para 0.10003670491914987
gatos 0.27396940681317006
adultos 0.15703508530882343
com 0.03491110062039592
paladar 0.3497222246583097
exigente 0.21890106474824528


  sim_score = nlp(word).similarity(nlp("carne"))


In [None]:
# Concatenate all descriptions into one string and tokenise it with the
# Portuguese Punkt model (word_tokenize defaults to English).
descricoes = ' '.join(offers['Descricao'])
# Distinct list of tokens. sorted() rather than list(): iteration order of a
# set of strings can differ between interpreter runs, which would reshuffle
# `words` and everything derived from it.
words = sorted(set(word_tokenize(descricoes, language="portuguese")))
print(len(words))

#### All words

In [None]:
import nltk
from nltk.corpus import stopwords
# NLTK's Portuguese stop-word list, stored as a set; the filter function
# below tests membership against it.
stop_words = set(stopwords.words('portuguese'))

In [None]:
import re

# A plain number: digits with an optional fractional part. Both "." and the
# Portuguese decimal comma are accepted as the separator ("1.5" and "1,5").
_NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)?')


def remove_stopwords_and_numbers(words, stop_word_set=None):
    """Return the tokens of ``words`` that are neither stop words nor numbers.

    Args:
        words: Iterable of string tokens.
        stop_word_set: Optional collection of lower-case stop words. When
            omitted, the notebook-level ``stop_words`` set is used, which is
            what the original one-argument call relied on.

    Returns:
        A new list with the surviving tokens, in their original order and
        casing. Stop-word matching is done on the lower-cased token; a token
        counts as a number only if the whole token matches ``_NUMBER_RE``.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    return [
        token
        for token in words
        if token.lower() not in stop_word_set
        and not _NUMBER_RE.fullmatch(token)
    ]

# Filter the vocabulary, print how many tokens remain, then display them.
# NOTE(review): the last line renders the whole list as cell output; consider
# slicing it if the vocabulary is large.
tokens_sem_stopwords_e_numeros = remove_stopwords_and_numbers(words)
print(len(tokens_sem_stopwords_e_numeros))
tokens_sem_stopwords_e_numeros

In [None]:
# Wrap the filtered tokens in a DataFrame and write them to 'xaxaxa.csv'
# without the index column.
df = pd.DataFrame(tokens_sem_stopwords_e_numeros)
df.to_csv('xaxaxa.csv', index=False)

In [None]:
# Display the token list stored for the offer at row label 1.
offers["Sabores"][1]

#### Teste

In [22]:
# Run the spaCy pipeline on the description at row label 0.
doc = nlp(offers["Descricao"][0])

# Keep the text of every named entity whose label is exactly "sabor".
# NOTE(review): presumably the stock pt_core_news_lg pipeline never emits a
# "sabor" label, so this stays empty unless a custom entity component is
# added — confirm against the model's label set.
sabores = [entidade.text for entidade in doc.ents if entidade.label_ == "sabor"]

In [19]:
# Display the entity texts collected into `sabores`.
sabores

[]

In [24]:
# Display every named entity the pipeline found in `doc`.
doc.ents

()