### Importando as bibliotecas

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

### Carregando a base de dados

In [2]:
try:
    df = pd.read_csv('./amazon-fashion-800k+-user-reviews-dataset.csv', encoding='utf-8')
except:
    try:
        df = pd.read_csv('./amazon-fashion-800k+-user-reviews-dataset.csv', encoding='latin1')
    except:
        df = pd.read_csv('./amazon-fashion-800k+-user-reviews-dataset.csv', encoding='iso-8859-1')

df.head()


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchases,target
0,1.0,It say 5 pair when purchasing but only get 2 r...,I was looking for 5 pair and only received 2 p...,[],B07QFTMTLP,B07QFTMTLP,AHASEZ65RESN57BMGRV6QBM5DTIA,1565088068852,0,True,-1
1,1.0,DonÃ¢ÂÂt do it!,Just donÃ¢ÂÂt. These things fell apart after...,[],B0764KKDN1,B0764KKDN1,AE3AMA3QSOHFKV46JJAHTHMMIR6A,1622416429592,0,True,-1
2,1.0,Small,Retuned is too small for me,[],B07J1WHVCP,B07J1WHVCP,AH4CFWQE2HTC5BSWIEF3LVLUFK6A,1565284666220,0,True,-1
3,1.0,Pre-Used When Received,This product came with the sleeves turned insi...,[],B0773JWP64,B0773JWP64,AFEKQFJWST6MVTKEJBQKUUBTWK7A,1581963636172,0,False,-1
4,1.0,Worn once and several places at seams have com...,Worn once and several places at seams have com...,[],B099NST9RX,B08JGNS1NK,AGU2FPKN6ARXUSSGBT6WTVLZKJSQ,1640895438476,0,True,-1


### Analise inicial

In [3]:
print("Dimensão:", df.shape)
df.info()
df.describe(include="all")
df.sample(5)


Dimensão: (867310, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867310 entries, 0 to 867309
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   rating              867310 non-null  float64
 1   title               867107 non-null  object 
 2   text                867012 non-null  object 
 3   images              867310 non-null  object 
 4   asin                867310 non-null  object 
 5   parent_asin         867310 non-null  object 
 6   user_id             867310 non-null  object 
 7   timestamp           867310 non-null  int64  
 8   helpful_vote        867310 non-null  int64  
 9   verified_purchases  867310 non-null  bool   
 10  target              867310 non-null  int64  
dtypes: bool(1), float64(1), int64(3), object(6)
memory usage: 67.0+ MB


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchases,target
138745,1.0,Not the quality I expected for $15.,Product is nice but thereÃ¢ÂÂs a lot of thre...,[{'small_image_url': 'https://images-na.ssl-im...,B075DHTXV2,B081FC9HXJ,AG5ZATVSJSGP24YNMYPWGLGYZP5A,1612371857231,1,True,-1
505554,3.0,Cheap Fabric,The dress is pretty but it looks and feels lik...,[],B085DHQYZX,B085DHQYZX,AGOLB5YET2G7NTBYJR4EOLOQF76A,1600107281165,0,False,0
174786,2.0,HUGE for size (and no size label or tag),Ordered XS wondering if it might be a bit too ...,[],B09G993TQM,B09G993TQM,AH7NDBQ5V3UGSAANVV7YSO5TETDQ,1662256713731,0,True,-1
397421,3.0,Not a Ã¢ÂÂgirlÃ¢ÂÂsÃ¢ÂÂ swim suit,Really cute suit HOWEVER not for Ã¢ÂÂgirls.Ã...,[],B095SN5L68,B095SN5L68,AHE24P2RRJIIEOPIQJD5VQEF47IA,1625266260846,2,True,0
86949,1.0,"Cheap price , still not worth it","Fabric looks nothing like the photo, so thin...",[],B0967G1YRW,B0967GNHY9,AHL73JWE6DYUJJMOX6E56F3YW2PA,1625694168125,0,True,-1


### Selecionar apenas colunas úteis


In [4]:
df = df[['rating', 'title', 'text']]


### Análise depois de selecionar as colunas

In [5]:
print("Dimensão:", df.shape)
df.info()
df.describe(include="all")
df.sample(5)

Dimensão: (867310, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867310 entries, 0 to 867309
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   rating  867310 non-null  float64
 1   title   867107 non-null  object 
 2   text    867012 non-null  object 
dtypes: float64(1), object(2)
memory usage: 19.9+ MB


Unnamed: 0,rating,title,text
363237,3.0,It fit but I think maybe just large would have...,I asked for a medium and got an extra large. I...
272540,2.0,The medium is Way to large,The medium is Way to large
520122,3.0,too big,too big and hard to reshape
131562,1.0,Shaped weird,The back strap is shorter then the shoe and th...
610395,4.0,Have done 3 hikes with it and it is comfortabl...,Quality pack for the price. Have done 3 hikes...


### Análise Exploratória de Dados

- Quantidades de valores nulos ordenados

- Quantas linhas ou textos estão duplicados.

- Se existem ratings fora do padrão (ex.: 6, 0, negativo).

- Se há textos vazios, muito curtos ou ruidosos.

- Se os textos possuem emojis, caracteres especiais, URLs ou ruído digital.

- Distribuição do rating (útil para criação posterior da coluna de sentimento).




In [6]:
# Quantidade de valores nulos ordenados do maior para o menor
df.isna().sum().sort_values(ascending=False)

text      298
title     203
rating      0
dtype: int64

- Verificar duplicidades

In [7]:
# Conta quantas linhas são duplicadas considerando todas as colunas
duplicatas = df.duplicated().sum()
print("Total de linhas duplicadas:", duplicatas)

Total de linhas duplicadas: 26881


In [8]:
# Conta duplicidades considerando somente o texto do review
duplicatas_texto = df['text'].duplicated().sum()
print("Total de textos duplicados:", duplicatas_texto)

Total de textos duplicados: 56608


- Verificar valores inconsistentes

In [9]:
# Verifica valores únicos de rating
print("Valores únicos de rating:")
print(df['rating'].unique())

Valores únicos de rating:
[1. 2. 3. 4. 5.]


In [10]:
# Verifica estatísticas básicas do rating
print(df['rating'].describe())

count    867310.000000
mean          3.000000
std           1.414214
min           1.000000
25%           2.000000
50%           3.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


In [11]:
# Procura textos extremamente curtos (possível ruído)
df['text_length'] = df['text'].astype(str).apply(len)
print("\nDistribuição do tamanho dos textos:")
print(df['text_length'].describe())


Distribuição do tamanho dos textos:
count    867310.000000
mean        158.297295
std         196.318998
min           1.000000
25%          47.000000
50%         102.000000
75%         199.000000
max       14382.000000
Name: text_length, dtype: float64


In [12]:
# Exibe 5 exemplos de textos com menos de 5 caracteres
curtos = df[df['text_length'] < 5][['text', 'rating']]
print("\nTextos muito curtos (possível ruído):")
print(curtos.head())


Textos muito curtos (possível ruído):
      text  rating
39     Ugh     1.0
309   Junk     1.0
688   Huge     1.0
1167    No     1.0
1291  Ruim     1.0


- Distribuição da variável rating

In [13]:
print("\nDistribuição do rating:")
print(df['rating'].value_counts().sort_index())


Distribuição do rating:
rating
1.0    173462
2.0    173462
3.0    173462
4.0    173462
5.0    173462
Name: count, dtype: int64


- Verificar presença de emojis ou caracteres não ascii

In [14]:
def contem_caracteres_especiais(texto):
    return bool(re.search(r'[^\x00-\x7F]', str(texto)))

df['caracteres_especiais'] = df['text'].apply(contem_caracteres_especiais)

print("\nQuantidade de textos com caracteres especiais ou emojis:")
print(df['caracteres_especiais'].sum())



Quantidade de textos com caracteres especiais ou emojis:
108443


-  Detectar presença de URLs dentro dos textos

In [15]:
def contem_url(texto):
    return bool(re.search(r'http\S+|www\.\S+', str(texto)))

df['tem_url'] = df['text'].apply(contem_url)

print("\nQuantidade de textos contendo URLs:")
print(df['tem_url'].sum())


Quantidade de textos contendo URLs:
186


### Limpeza
- Remover duplicidades

- Remover textos vazios ou curtos demais

- Remover URLs

- Normalizar texto (lowercase, espaços, caracteres especiais)

- Remover emojis e símbolos opcionais

- Criar um dataframe limpo para usar no modelo depois

### Remover duplicidades

In [16]:
# Remove duplicidades considerando todas as colunas
df = df.drop_duplicates()

# Remove duplicidades considerando só o texto (review)
df = df.drop_duplicates(subset=['text'])

- Remover textos vazios ou muito curtos

In [17]:
# Preenche valores nulos no texto com string vazia
df['text'] = df['text'].fillna("")

# Calcula tamanho dos textos
df['text_length'] = df['text'].astype(str).apply(len)

# Mantém apenas textos com mais de 5 caracteres (ajustável)
df = df[df['text_length'] > 5]

- Função para remover URLs dos textos

In [18]:
def remove_urls(text):
    return re.sub(r'http\S+|www\.\S+', '', text)

df['text'] = df['text'].apply(remove_urls)

- Normalizar texto (lowercase + remover múltiplos espaços)


In [19]:
def normalizar(text):
    # transforma em minúsculas
    text = text.lower()
    # remove múltiplos espaços
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text'] = df['text'].apply(normalizar)

- Remover emojis e caracteres especiais (opcional)

In [20]:
def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # símbolos e pictogramas
        "\U0001F680-\U0001F6FF"  # transportes e símbolos
        "\U0001F1E0-\U0001F1FF"  # bandeiras
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df['text'] = df['text'].apply(remove_emoji)

- Criar dataframe final de texto limpo

In [21]:
df = df[['rating', 'title', 'text']].reset_index(drop=True)

- Salvar versão limpa

In [23]:
df.to_csv('./tabela_limpa_para_analise.csv', index=False)

print("Limpeza concluída. Primeiras linhas da nova base:")
df.head()

Limpeza concluída. Primeiras linhas da nova base:


Unnamed: 0,rating,title,text
0,1.0,It say 5 pair when purchasing but only get 2 r...,i was looking for 5 pair and only received 2 p...
1,1.0,DonÃ¢ÂÂt do it!,just donã¢âât. these things fell apart after...
2,1.0,Small,retuned is too small for me
3,1.0,Pre-Used When Received,this product came with the sleeves turned insi...
4,1.0,Worn once and several places at seams have com...,worn once and several places at seams have com...


In [24]:
print("Dimensão:", df.shape)
df.info()
df.describe(include="all")
df.sample(5)


Dimensão: (809845, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 809845 entries, 0 to 809844
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   rating  809845 non-null  float64
 1   title   809733 non-null  object 
 2   text    809845 non-null  object 
dtypes: float64(1), object(2)
memory usage: 18.5+ MB


Unnamed: 0,rating,title,text
426590,3.0,Hope you don't have to return,"the dress was beautiful but, i literally paid ..."
694864,5.0,Good buy,awesome pair of glasses. comfortable and sturd...
670509,5.0,Will work a buy for these steel toe shoes,these shoes for work are awesome cause you don...
6996,1.0,Too narrow,too narrow for my wide foot. looked nice though!
351390,3.0,Blouse not the same as product description.......,i was so looking forward to this very attracti...


### Selecionando o texto e criando um rótulo de sentimentos baseado nas avaliações(Coluna Rating)

Converte valores de rating (1 a 5) para categorias de sentimento:

- 4 e 5 estrelas -> Positivo
- 3 estrelas -> Neutro
- 1 e 2 estrelas -> Negativo

In [25]:
# Criar função para rotular sentimentos

def gerar_sentimento(rating):
  if rating >= 4:
    return "Positivo"
  elif rating ==3:
    return "Neutro"
  else:
    return "Negativo"

# Aplicar aos dados
df['sentimento'] = df['rating'].apply(gerar_sentimento)

# Visualizar distribuição
df['sentimento'].value_counts()

sentimento
Negativo    329675
Positivo    315138
Neutro      165032
Name: count, dtype: int64

In [26]:
print("Dimensão:", df.shape)
df.info()
df.describe(include="all")
df.sample(5)

Dimensão: (809845, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 809845 entries, 0 to 809844
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   rating      809845 non-null  float64
 1   title       809733 non-null  object 
 2   text        809845 non-null  object 
 3   sentimento  809845 non-null  object 
dtypes: float64(1), object(3)
memory usage: 24.7+ MB


Unnamed: 0,rating,title,text,sentimento
400399,3.0,Sizing inconsistent,did not fit right. sizing was inconsistent wit...,Neutro
139175,1.0,One Star,terrible the strap on the watch is made very p...,Negativo
11296,1.0,Not as pictured,it is nothing like the picture shown.,Negativo
623856,4.0,Four Stars,i like it buy it tangles alot,Positivo
504279,4.0,Good quality sock,a store in the neighborhood recommended these ...,Positivo


### Preparação dos Dados + Separação Treino/Teste


- Criar a coluna full_text para combinar title + text aumenta a qualidade de modelo, pois o título geralmente contém informação relevante.

In [27]:
# Criando uma coluna combinada título de texto
df['full_text'] = df['title'].astype(str) + " " + df['text'].astype(str)

In [28]:
print("Dimensão:", df.shape)
df.info()
df.describe(include="all")
df.sample(5)

Dimensão: (809845, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 809845 entries, 0 to 809844
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   rating      809845 non-null  float64
 1   title       809733 non-null  object 
 2   text        809845 non-null  object 
 3   sentimento  809845 non-null  object 
 4   full_text   809845 non-null  object 
dtypes: float64(1), object(4)
memory usage: 30.9+ MB


Unnamed: 0,rating,title,text,sentimento,full_text
670004,5.0,Face masks,these are a bit heavy. in w<br />the winter th...,Positivo,Face masks these are a bit heavy. in w<br />th...
713726,5.0,Perfect!!,"perfect size,! i never wear dangly earrings bu...",Positivo,"Perfect!! perfect size,! i never wear dangly e..."
554346,4.0,these little guys work great. Terrific value,i have a hard time with metal backs on earring...,Positivo,these little guys work great. Terrific value i...
115967,1.0,Really poor quality,really disappointed with the quality of this p...,Negativo,Really poor quality really disappointed with t...
191014,2.0,Not as pictured,i bought the amethyst. they were smaller than ...,Negativo,Not as pictured i bought the amethyst. they we...


## Definir Features (X) e Rótulos (y)
- Usando full_text como entrada, e sentimento como saída.

In [29]:
# Features e labels
X = df['full_text']
y = df['sentimento']

### Dividir em Treino e Teste (com estratificação)
- Mantendo o equilibrio das classes

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.2,
     stratify=y,
     random_state=42
 )
# Verificando se a divisão ficou correta
print("\nTamanhos dos conjuntos:")
print("Treino:", X_train.shape)
print("Teste:", X_test.shape)


Tamanhos dos conjuntos:
Treino: (647876,)
Teste: (161969,)
