In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 120)
pd.set_option('display.max_colwidth', 300)

print("pandas versão:", pd.__version__)

pandas versão: 2.2.3


In [2]:
# Célula 2: tentar ler o CSV principal (modo seguro)
path = "../data/raw/reviews.csv"

# Tentativa com encoding padrão; se falhar, tenta latin-1
try:
    df = pd.read_csv(path, low_memory=False)
    print("CSV lido com sucesso (utf-8). Linhas:", len(df))
except Exception as e:
    print("Erro lendo com utf-8:", e)
    print("Tentando latin-1...")
    df = pd.read_csv(path, encoding='latin-1', low_memory=False)
    print("CSV lido com sucesso (latin-1). Linhas:", len(df))


CSV lido com sucesso (utf-8). Linhas: 5000


In [3]:
# Célula 3: ver as primeiras 5 linhas
df.head(1)


Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,manufacturer,manufacturerNumber,reviews.date,reviews.dateAdded,reviews.dateSeen,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation, 2016)",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electronics,iPad & Tablets,Kindle E-readers,iPad Accessories,Used:Tablets,E-Readers,E-Readers & Accessories,Computers/Tablets & Networking,Used:Computers Accessories,iPads Tablets,All Tablets,Tablets & E-readers,Computers & Tablets,Amazon,Tablets & eBook Re...",Electronics,"https://pisces.bbystatic.com/image2/BestBuy_US/images/products/5442/5442403_sd.jpg,https://c1.neweggimages.com/NeweggImage/ProductImage/A3FA_1_201801081360871160.jpg,https://i.ebayimg.com/thumbs/images/g/N4IAAOSwoA9Zgkso/s-l96.jpg,http://i.ebayimg.com/thumbs/images/g/dpkAAOSwfpVZFKHy/s-l200.jpg,...","allnewkindleereaderblack6glarefreetouchscreendisplaywifiincludesspecialoffers/b00zv9pxp2,allnewkindleereaderblack6glarefreetouchscreendisplaywifiincludesspecialoffers/9siafvd7fk6707,0848719083774,848719083774,allnewkindleereaderblack6glarefreetouchscreendisplaywifiincludesspecialoffers/322538285...",Amazon,B00ZV9PXP2,2017-09-03T00:00:00.000Z,,"2018-05-27T00:00:00Z,2017-09-18T00:00:00Z,2017-09-06T00:00:00Z,2017-09-12T00:00:00Z",False,,0,3,"http://reviews.bestbuy.com/3545/5442403/reviews.htm%25252525253Fformat%25252525253Dembedded,https://reviews.bestbuy.com/3545/5442403/reviews.htm%2525252525253Fformat%2525252525253Dembedded%25252525252526page%2525252525253D20,https://reviews.bestbuy.com/3545/5442403/reviews.htm%25252525253Fformat...",I thought it would be as big as small paper but turn out to be just like my palm. I think it is too small to read on it... not very comfortable as regular Kindle. Would definitely recommend a paperwhite instead.,Too small,llyyue,"https://www.newegg.com/Product/Product.aspx%25253FItem%25253D9SIAFVD7FK6707,https://reviews.bestbuy.com/3545/5442403/reviews.htm%2525252525253Fformat%2525252525253Dembedded%25252525252526page%2525252525253D39,https://reviews.bestbuy.com/3545/5442403/reviews.htm%2525252525253Fformat%2525252525253..."


In [4]:
# Célula 4: tipos de colunas e nulos
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   5000 non-null   object 
 1   dateAdded            5000 non-null   object 
 2   dateUpdated          5000 non-null   object 
 3   name                 5000 non-null   object 
 4   asins                5000 non-null   object 
 5   brand                5000 non-null   object 
 6   categories           5000 non-null   object 
 7   primaryCategories    5000 non-null   object 
 8   imageURLs            5000 non-null   object 
 9   keys                 5000 non-null   object 
 10  manufacturer         5000 non-null   object 
 11  manufacturerNumber   5000 non-null   object 
 12  reviews.date         5000 non-null   object 
 13  reviews.dateAdded    1052 non-null   object 
 14  reviews.dateSeen     5000 non-null   object 
 15  reviews.doRecommend  5000 non-null   b

In [5]:
# Célula 5: contar nulos (missing values)
missing = df.isna().sum().sort_values(ascending=False)
missing.head(20)


reviews.id             4971
reviews.dateAdded      3948
reviews.title            13
reviews.username          1
id                        0
dateAdded                 0
dateUpdated               0
name                      0
primaryCategories         0
categories                0
brand                     0
asins                     0
manufacturerNumber        0
imageURLs                 0
manufacturer              0
keys                      0
reviews.doRecommend       0
reviews.dateSeen          0
reviews.date              0
reviews.numHelpful        0
dtype: int64

In [6]:
# Célula 6: listar colunas para ver nomes exatos (útil para mapear)
cols = df.columns.tolist()
print("Total de colunas:", len(cols))
cols


Total de colunas: 24


['id',
 'dateAdded',
 'dateUpdated',
 'name',
 'asins',
 'brand',
 'categories',
 'primaryCategories',
 'imageURLs',
 'keys',
 'manufacturer',
 'manufacturerNumber',
 'reviews.date',
 'reviews.dateAdded',
 'reviews.dateSeen',
 'reviews.doRecommend',
 'reviews.id',
 'reviews.numHelpful',
 'reviews.rating',
 'reviews.sourceURLs',
 'reviews.text',
 'reviews.title',
 'reviews.username',
 'sourceURLs']

In [7]:
# Célula 7: selecionar colunas que provavelmente vamos usar
candidates = ['reviews.id','reviews_id','id','asins','asins','product_id',
              'reviews.text','reviews_text','review_text',
              'reviews.rating','rating','reviews_date','reviews.date','review_date']

found = [c for c in candidates if c in df.columns]
print("Colunas encontradas dentre as candidatas:", found)

# criar subset com as colunas encontradas (separar em novo df)
subset = df[found].copy()
subset.head(5)


Colunas encontradas dentre as candidatas: ['reviews.id', 'id', 'asins', 'asins', 'reviews.text', 'reviews.rating', 'reviews.date']


Unnamed: 0,reviews.id,id,asins,asins.1,reviews.text,reviews.rating,reviews.date
0,,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,B00ZV9PXP2,I thought it would be as big as small paper but turn out to be just like my palm. I think it is too small to read on it... not very comfortable as regular Kindle. Would definitely recommend a paperwhite instead.,3,2017-09-03T00:00:00.000Z
1,,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,B00ZV9PXP2,This kindle is light and easy to use especially at the beach!!!,5,2017-06-06T00:00:00.000Z
2,,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,B00ZV9PXP2,"Didnt know how much i'd use a kindle so went for the lower end. im happy with it, even if its a little dark",4,2018-04-20T00:00:00.000Z
3,177283626.0,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,B00ZV9PXP2,"I am 100 happy with my purchase. I caught it on sale at a really good price. I am normally a real book person, but I have a 1 year old who loves ripping up pages. The Kindle prevents that, it's extremely portable (it fits better in my purse than a giant book), and I have it loaded with lots of b...",5,2017-11-02T17:33:31.000Z
4,,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,B00ZV9PXP2,Solid entry level Kindle. Great for kids. Gifted for a kid of my friend and they love to use it to read more than their iPads. battery is good but higher model is a bit better.,5,2018-04-24T00:00:00.000Z


In [8]:
# Célula 8: normalizar nomes para snake_case (mais simples de programar)
def normalize(name):
    name = name.strip().lower()
    name = name.replace(' ', '_').replace('.', '_').replace('-', '_')
    return name

df.columns = [normalize(c) for c in df.columns]
print("Colunas normalizadas:")
df.columns.tolist()


Colunas normalizadas:


['id',
 'dateadded',
 'dateupdated',
 'name',
 'asins',
 'brand',
 'categories',
 'primarycategories',
 'imageurls',
 'keys',
 'manufacturer',
 'manufacturernumber',
 'reviews_date',
 'reviews_dateadded',
 'reviews_dateseen',
 'reviews_dorecommend',
 'reviews_id',
 'reviews_numhelpful',
 'reviews_rating',
 'reviews_sourceurls',
 'reviews_text',
 'reviews_title',
 'reviews_username',
 'sourceurls']

In [9]:
# Célula 9: mapear colunas encontradas para nomes padronizados
mapping = {
    'reviews_id': 'review_id',
    'id': 'review_id',
    'asins': 'product_id',
    'reviews_text': 'review_text',
    'reviews_text': 'review_text',
    'reviews_rating': 'rating',
    'rating': 'rating',
    'reviews_date': 'review_date',
    'reviews_dateadded': 'review_date'
}

# só aplicar mapeamento se a coluna existir
mapping = {k:v for k,v in mapping.items() if k in df.columns}
df = df.rename(columns=mapping)

# mostrar as colunas finais que iremos usar
for c in ['review_id','product_id','review_text','rating','review_date']:
    print(c, "in df?", c in df.columns)


review_id in df? True
product_id in df? True
review_text in df? True
rating in df? True
review_date in df? True


In [10]:
# Célula 9.1 — Resolver colunas duplicadas (coalesce automaticamente)
from collections import Counter
import pandas as pd

# 1) detectar nomes de coluna duplicados
cols = df.columns.tolist()
dupe_names = [name for name, count in Counter(cols).items() if count > 1]
print("Nomes duplicados detectados:", dupe_names)

# 2) para cada nome duplicado: combinar (coalesce) preferindo valores não-nulos
for dup in dupe_names:
    # máscara booleana para selecionar todas as colunas que têm exatamente esse nome
    mask = [c == dup for c in df.columns]
    df_dup = df.loc[:, mask]   # DataFrame com as colunas duplicadas (pode ter 2+, mesmo nome)
    n_cols = df_dup.shape[1]
    # mostrar contagem de valores não-nulos por cada coluna duplicada (ajuda no diagnóstico)
    non_null_counts = df_dup.notna().sum(axis=0).tolist()
    print(f"\nProcessando '{dup}' com {n_cols} colunas; non-null counts:", non_null_counts)
    
    # coalesce: por linha, pega o primeiro valor não-nulo da esquerda para a direita
    # bfill(axis=1) preenche para a direita; .iloc[:,0] pega a primeira coluna preenchida
    combined = df_dup.bfill(axis=1).iloc[:, 0]
    
    # remover TODAS as colunas duplicadas do df original
    df = df.loc[:, ~pd.Index(df.columns).isin([dup])]
    
    # inserir a coluna coalescida (no final das colunas; reordenamos depois se quiser)
    df[dup] = combined

    print(f"'{dup}' resolvido — {n_cols} → 1 coluna. Valores não-nulos na nova coluna:", int(df[dup].notna().sum()))

# 3) verificação final
print("\nVerificação final de duplicatas de nomes:", df.columns.duplicated().any())
if df.columns.duplicated().any():
    print("Colunas duplicadas (restantes):", [c for i,c in enumerate(df.columns) if df.columns.duplicated()[i]])
else:
    print("OK — nenhum nome de coluna duplicado.")

# 4) mostrar resumo do DataFrame (opcional, útil para confirmar)
print("\nResumo após resolução:")
display(df.info())


Nomes duplicados detectados: ['review_id', 'review_date']

Processando 'review_id' com 2 colunas; non-null counts: [5000, 29]
'review_id' resolvido — 2 → 1 coluna. Valores não-nulos na nova coluna: 5000

Processando 'review_date' com 2 colunas; non-null counts: [5000, 1052]
'review_date' resolvido — 2 → 1 coluna. Valores não-nulos na nova coluna: 5000

Verificação final de duplicatas de nomes: False
OK — nenhum nome de coluna duplicado.

Resumo após resolução:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateadded            5000 non-null   object
 1   dateupdated          5000 non-null   object
 2   name                 5000 non-null   object
 3   product_id           5000 non-null   object
 4   brand                5000 non-null   object
 5   categories           5000 non-null   object
 6   primarycategories    5000 non-n

None

In [11]:
# Célula 9.2 — Selecionar colunas relevantes (manter só o que vamos usar agora)
# Explicação: definimos uma lista "preferida" de colunas que tipicamente usamos no pipeline.
# A seguir, escolhemos apenas as que realmente existem no df e criamos `subset`.

preferred = [
    'review_id',        # id único da review (chave)
    'product_id',       # asin / id do produto
    'review_text',      # texto da review -> NLP
    'rating',           # nota (1-5)
    'review_date',      # data da review
    'reviews_username', # nome do usuário (opcional)
    'reviews_title',    # título da review (opcional)
    'reviews_numhelpful',# votos de útil
    'reviews_dorecommend', # booleano de recomendação
    'brand',            # marca do produto (opcional)
    'name',             # nome do produto (opcional)
    'categories',       # categorias (opcional, útil para análise)
    'primarycategories' # categoria principal (opcional)
]

# filtra apenas os preferidos que existem no DataFrame atual
found = [c for c in preferred if c in df.columns]
print("Colunas preferidas encontradas no df:", found)

# criar subset com cópia (mais seguro)
subset = df[found].copy()

# opcional: sobrescrever df com o subset para economizar memória e evitar confusão
# se quiser manter o df original, comente a próxima linha
df = subset

# checagens rápidas
print("\nShape do subset:", subset.shape)
print("\nPrimeiras linhas do subset:")
display(subset.head(5))

print("\nResumo do subset:")
display(subset.info())


Colunas preferidas encontradas no df: ['review_id', 'product_id', 'review_text', 'rating', 'review_date', 'reviews_username', 'reviews_title', 'reviews_numhelpful', 'reviews_dorecommend', 'brand', 'name', 'categories', 'primarycategories']

Shape do subset: (5000, 13)

Primeiras linhas do subset:


Unnamed: 0,review_id,product_id,review_text,rating,review_date,reviews_username,reviews_title,reviews_numhelpful,reviews_dorecommend,brand,name,categories,primarycategories
0,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,I thought it would be as big as small paper but turn out to be just like my palm. I think it is too small to read on it... not very comfortable as regular Kindle. Would definitely recommend a paperwhite instead.,3,2017-09-03T00:00:00.000Z,llyyue,Too small,0,False,Amazon,"Amazon Kindle E-Reader 6"" Wifi (8th Generation, 2016)","Computers,Electronics Features,Tablets,Electronics,iPad & Tablets,Kindle E-readers,iPad Accessories,Used:Tablets,E-Readers,E-Readers & Accessories,Computers/Tablets & Networking,Used:Computers Accessories,iPads Tablets,All Tablets,Tablets & E-readers,Computers & Tablets,Amazon,Tablets & eBook Re...",Electronics
1,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,This kindle is light and easy to use especially at the beach!!!,5,2017-06-06T00:00:00.000Z,Charmi,Great light reader. Easy to use at the beach,0,True,Amazon,"Amazon Kindle E-Reader 6"" Wifi (8th Generation, 2016)","Computers,Electronics Features,Tablets,Electronics,iPad & Tablets,Kindle E-readers,iPad Accessories,Used:Tablets,E-Readers,E-Readers & Accessories,Computers/Tablets & Networking,Used:Computers Accessories,iPads Tablets,All Tablets,Tablets & E-readers,Computers & Tablets,Amazon,Tablets & eBook Re...",Electronics
2,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,"Didnt know how much i'd use a kindle so went for the lower end. im happy with it, even if its a little dark",4,2018-04-20T00:00:00.000Z,johnnyjojojo,Great for the price,0,True,Amazon,"Amazon Kindle E-Reader 6"" Wifi (8th Generation, 2016)","Computers,Electronics Features,Tablets,Electronics,iPad & Tablets,Kindle E-readers,iPad Accessories,Used:Tablets,E-Readers,E-Readers & Accessories,Computers/Tablets & Networking,Used:Computers Accessories,iPads Tablets,All Tablets,Tablets & E-readers,Computers & Tablets,Amazon,Tablets & eBook Re...",Electronics
3,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,"I am 100 happy with my purchase. I caught it on sale at a really good price. I am normally a real book person, but I have a 1 year old who loves ripping up pages. The Kindle prevents that, it's extremely portable (it fits better in my purse than a giant book), and I have it loaded with lots of b...",5,2017-11-02T17:33:31.000Z,Kdperry,A Great Buy,3,True,Amazon,"Amazon Kindle E-Reader 6"" Wifi (8th Generation, 2016)","Computers,Electronics Features,Tablets,Electronics,iPad & Tablets,Kindle E-readers,iPad Accessories,Used:Tablets,E-Readers,E-Readers & Accessories,Computers/Tablets & Networking,Used:Computers Accessories,iPads Tablets,All Tablets,Tablets & E-readers,Computers & Tablets,Amazon,Tablets & eBook Re...",Electronics
4,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,Solid entry level Kindle. Great for kids. Gifted for a kid of my friend and they love to use it to read more than their iPads. battery is good but higher model is a bit better.,5,2018-04-24T00:00:00.000Z,Johnnyblack,Solid entry-level Kindle. Great for kids,0,True,Amazon,"Amazon Kindle E-Reader 6"" Wifi (8th Generation, 2016)","Computers,Electronics Features,Tablets,Electronics,iPad & Tablets,Kindle E-readers,iPad Accessories,Used:Tablets,E-Readers,E-Readers & Accessories,Computers/Tablets & Networking,Used:Computers Accessories,iPads Tablets,All Tablets,Tablets & E-readers,Computers & Tablets,Amazon,Tablets & eBook Re...",Electronics



Resumo do subset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   review_id            5000 non-null   object
 1   product_id           5000 non-null   object
 2   review_text          5000 non-null   object
 3   rating               5000 non-null   int64 
 4   review_date          5000 non-null   object
 5   reviews_username     4999 non-null   object
 6   reviews_title        4987 non-null   object
 7   reviews_numhelpful   5000 non-null   int64 
 8   reviews_dorecommend  5000 non-null   bool  
 9   brand                5000 non-null   object
 10  name                 5000 non-null   object
 11  categories           5000 non-null   object
 12  primarycategories    5000 non-null   object
dtypes: bool(1), int64(2), object(10)
memory usage: 473.8+ KB


None

In [12]:
# Célula 10 — Extrair/limpar product_id (ASIN)
import ast
import pandas as pd

def extract_first_asin(val):
    """
    Recebe um valor que pode ser:
      - uma string simples "B0123"
      - uma string que representa lista "['B0123', 'B0456']"
      - uma string com separadores "B0123, B0456"
    Retorna o primeiro ASIN limpo ou None.
    """
    if pd.isna(val):
        return None                   # se faltante, retorna None
    s = str(val).strip()
    # tentar interpretar listas com ast.literal_eval (ex: "['B01','B02']")
    try:
        parsed = ast.literal_eval(s)
        if isinstance(parsed, (list, tuple)) and parsed:
            return str(parsed[0]).strip()
    except Exception:
        pass
    # fallback: cortar no primeiro separador comum
    for sep in [',','|',';',' ']:
        if sep in s:
            cand = s.split(sep)[0]
            return cand.strip(" []'\"")
    # se nada disso, retorna a string limpa
    return s.strip(" []'\"")

# Aplicar somente se a coluna existir
if 'product_id' in df.columns:
    df['product_id'] = df['product_id'].apply(extract_first_asin)
    print("product_id - exemplos limpos:")
    display(df['product_id'].head(8))
else:
    print("Coluna 'product_id' não encontrada no DataFrame.")


product_id - exemplos limpos:


0    B00ZV9PXP2
1    B00ZV9PXP2
2    B00ZV9PXP2
3    B00ZV9PXP2
4    B00ZV9PXP2
5    B00ZV9PXP2
6    B00ZV9PXP2
7    B00ZV9PXP2
Name: product_id, dtype: object

In [13]:
# Célula 11 — Converter rating para numérico e inspecionar
if 'rating' in df.columns:
    # converte para número; strings inválidas viram NaN
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    print("Tipo atual da coluna 'rating':", df['rating'].dtype)
    print("\nDistribuição de valores (rating):")
    display(df['rating'].value_counts(dropna=False).sort_index())
else:
    print("Coluna 'rating' não encontrada.")


Tipo atual da coluna 'rating': int64

Distribuição de valores (rating):


rating
1      63
2      54
3     197
4    1208
5    3478
Name: count, dtype: int64

In [14]:
# Célula 12 — Converter review_date para datetime
if 'review_date' in df.columns:
    # tenta converter; valores inválidos viram NaT (data faltante)
    df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce', utc=True)
    print("Tipo da coluna 'review_date':", df['review_date'].dtype)
    print("dtype:", df['review_date'].dtype)
    display(df['review_date'].head(8))
    print("Quantas datas inválidas/NaT:", int(df['review_date'].isna().sum()))
else:
    print("Coluna 'review_date' não encontrada.")


Tipo da coluna 'review_date': datetime64[ns, UTC]
dtype: datetime64[ns, UTC]


0   2017-09-03 00:00:00+00:00
1   2017-06-06 00:00:00+00:00
2   2018-04-20 00:00:00+00:00
3   2017-11-02 17:33:31+00:00
4   2018-04-24 00:00:00+00:00
5   2016-12-14 00:00:00+00:00
6   2017-12-20 17:38:23+00:00
7   2017-07-14 00:00:00+00:00
Name: review_date, dtype: datetime64[ns, UTC]

Quantas datas inválidas/NaT: 0


In [15]:
# Célula 13 — Features simples do texto: comprimento e contagem de palavras
if 'review_text' in df.columns:
    # garantir string para evitar erros
    df['review_text'] = df['review_text'].astype(str)
    # número de caracteres
    df['review_len'] = df['review_text'].str.len()
    # número de palavras (split por espaço)
    df['review_word_count'] = df['review_text'].str.split().str.len()
    print("Exemplos de métricas de texto (preview):")
    display(df[['review_text','review_len','review_word_count']].head(6))
    print("\nResumo estatístico de review_len:")
    display(df['review_len'].describe())
else:
    print("Coluna 'review_text' não encontrada.")


Exemplos de métricas de texto (preview):


Unnamed: 0,review_text,review_len,review_word_count
0,I thought it would be as big as small paper but turn out to be just like my palm. I think it is too small to read on it... not very comfortable as regular Kindle. Would definitely recommend a paperwhite instead.,211,41
1,This kindle is light and easy to use especially at the beach!!!,63,12
2,"Didnt know how much i'd use a kindle so went for the lower end. im happy with it, even if its a little dark",107,24
3,"I am 100 happy with my purchase. I caught it on sale at a really good price. I am normally a real book person, but I have a 1 year old who loves ripping up pages. The Kindle prevents that, it's extremely portable (it fits better in my purse than a giant book), and I have it loaded with lots of b...",757,148
4,Solid entry level Kindle. Great for kids. Gifted for a kid of my friend and they love to use it to read more than their iPads. battery is good but higher model is a bit better.,176,36
5,This make an excellent ebook reader. Don't expect much from this device except to read basic ebooks. The good thing is it's cheap and good to read in the sun.,158,30



Resumo estatístico de review_len:


count    5000.000000
mean      161.348400
std       242.597383
min        45.000000
25%        71.000000
50%       105.500000
75%       182.000000
max      8351.000000
Name: review_len, dtype: float64

In [16]:
# Célula 14 — Remover linhas sem texto útil e duplicatas por review_id
before = len(df)
print("Linhas antes da limpeza:", before)

# remover linhas cujo texto é vazio ou 'nan' string
if 'review_text' in df.columns:
    mask_valid_text = df['review_text'].notna() & (df['review_text'].str.strip() != '') & (df['review_text'].str.lower() != 'nan')
    df = df[mask_valid_text].copy()
    print("Linhas após remover sem texto (se aplicável):", len(df))
else:
    print("Sem coluna 'review_text' para filtrar.")

# remover duplicatas:
if 'review_id' in df.columns:
    df = df.drop_duplicates(subset=['review_id'], keep='first')
    print("Após drop_duplicates por review_id:", len(df))
else:
    df = df.drop_duplicates()
    print("Após drop_duplicates global:", len(df))

after = len(df)
print(f"Total de linhas removidas: {before - after}")


Linhas antes da limpeza: 5000
Linhas após remover sem texto (se aplicável): 5000
Após drop_duplicates por review_id: 24
Total de linhas removidas: 4976


In [17]:
# Célula 15 — Salvar processed CSV e sample para desenvolvimento
import os
import pandas as pd

processed_path = "../data/processed/reviews_clean.csv"   # ../ porque notebook está em notebooks/
sample_path = "../data/raw/reviews_sample.csv"

# salvar processed (sem índice)
df.to_csv(processed_path, index=False)
print("Processed CSV salvo em:", processed_path)

# salvar sample de até 1000 linhas (útil para desenvolvimento)
n = min(1000, len(df))
df.sample(n=n, random_state=42).to_csv(sample_path, index=False)
print(f"Sample salvo em: {sample_path}  (linhas: {n})")

# checagem rápida: ler e mostrar shape
print("\nChecagem rápida dos arquivos salvos:")
print("exists processed?", os.path.exists(processed_path))
print("exists sample?", os.path.exists(sample_path))
print("Processed shape:", pd.read_csv(processed_path).shape)
print("Sample shape:", pd.read_csv(sample_path).shape)

# mostrar primeiras 3 linhas do processed como preview
print("\nPreview do processed (3 primeiras linhas):")
display(pd.read_csv(processed_path).head(3))


Processed CSV salvo em: ../data/processed/reviews_clean.csv
Sample salvo em: ../data/raw/reviews_sample.csv  (linhas: 24)

Checagem rápida dos arquivos salvos:
exists processed? True
exists sample? True
Processed shape: (24, 15)
Sample shape: (24, 15)

Preview do processed (3 primeiras linhas):


Unnamed: 0,review_id,product_id,review_text,rating,review_date,reviews_username,reviews_title,reviews_numhelpful,reviews_dorecommend,brand,name,categories,primarycategories,review_len,review_word_count
0,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,I thought it would be as big as small paper but turn out to be just like my palm. I think it is too small to read on it... not very comfortable as regular Kindle. Would definitely recommend a paperwhite instead.,3,2017-09-03 00:00:00+00:00,llyyue,Too small,0,False,Amazon,"Amazon Kindle E-Reader 6"" Wifi (8th Generation, 2016)","Computers,Electronics Features,Tablets,Electronics,iPad & Tablets,Kindle E-readers,iPad Accessories,Used:Tablets,E-Readers,E-Readers & Accessories,Computers/Tablets & Networking,Used:Computers Accessories,iPads Tablets,All Tablets,Tablets & E-readers,Computers & Tablets,Amazon,Tablets & eBook Re...",Electronics,211,41
1,AWFUWc8THh53nbDRF6YO,B010CEHQTG,Great Gift for anyone. Very easy to setup. Coexist with all IOT Devices. Alexa is AWESOME!,5,2017-12-16 00:00:00+00:00,doyson,Amazon Echo Show - Greatest Gift EVER,0,True,Amazon,"Amazon Echo Show Alexa-enabled Bluetooth Speaker with 7"" Screen","Computers,Amazon Echo,Virtual Assistant Speakers,Audio & Video Components,Electronics Features,Computer Accessories,Home & Tools,See more Amazon Echo Show Smart Assistant - White,Smart Home Automation,Electronics,TVs Entertainment,Speakers,Smart Hub & Kits,Digital Device 3,Consumer Electronics,W...","Electronics,Hardware",90,16
2,AWK8z0pOIwln0LfXlSxH,B01J24C0TI,"I love having the screen available to use for certain tasks like streaming doorbell video, security camera footage, Amazon Prime music lyrics and videos, but I am disappointed that Amazon‚Äôs implementation of YouTube on the Echo Show violated Google's terms of service so they got the app pulled...",4,2017-10-27 00:00:00+00:00,rainypages,Not Nearly as Smart as Google Home,0,True,Amazon,"Amazon Echo Show Alexa-enabled Bluetooth Speaker with 7"" Screen","Amazon Echo,Virtual Assistant Speakers,Electronics Features,Home & Tools,Smart Home Automation,TVs Entertainment,Speakers,Smart Hub & Kits,Digital Device 3,Wireless Speakers,Smart Home,Home Improvement,Voice Assistants,Amazon Home,Amazon","Electronics,Hardware",743,129


In [18]:
pd.read_csv("../data/processed/reviews_clean_sample.csv").head()

Unnamed: 0,review_id,product_id,review_text,rating,review_date,reviews_username,reviews_title,reviews_numhelpful,reviews_dorecommend,brand,name,categories,primarycategories,review_len,review_word_count
0,AVpgdkC8ilAPnD_xsvyi,B018Y22BI4,I've had kindles for years and this latest one is also right up at the top,5,2016-11-25 00:00:00+00:00,craftie,good value,0,True,Amazon,"Fire Tablet, 7 Display, Wi-Fi, 16 GB - Includes Special Offers, Black","Fire Tablets,Computers/Tablets & Networking,Tablets,All Tablets,Amazon Tablets,Frys,Computers & Tablets,Tablets & eBook Readers",Electronics,74,16
1,AVqkIhwDv8e3D1O-lebb,B01AHB9CN2,"This was bought to replace my old Kindle with keypad, and experimental internet. Was surprised how much the Kindle had changed, and at the same price as my old one was when I bought it!Have already shown it to a couple of friends who now say they are thinking about getting one.",4,2017-03-18 00:00:00+00:00,Janet,Great picture quality,0,True,Amazon,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi, 16 GB - Includes Special Offers, Magenta","Electronics,iPad & Tablets,All Tablets,Fire Tablets,Tablets,Computers & Tablets",Electronics,278,52
2,AVqVGZNvQMlgsOJE6eUY,B00ZV9PXP2,I thought it would be as big as small paper but turn out to be just like my palm. I think it is too small to read on it... not very comfortable as regular Kindle. Would definitely recommend a paperwhite instead.,3,2017-09-03 00:00:00+00:00,llyyue,Too small,0,False,Amazon,"Amazon Kindle E-Reader 6"" Wifi (8th Generation, 2016)","Computers,Electronics Features,Tablets,Electronics,iPad & Tablets,Kindle E-readers,iPad Accessories,Used:Tablets,E-Readers,E-Readers & Accessories,Computers/Tablets & Networking,Used:Computers Accessories,iPads Tablets,All Tablets,Tablets & E-readers,Computers & Tablets,Amazon,Tablets & eBook Re...",Electronics,211,41
3,AVpftoij1cnluZ0-p5n2,B00IOYAM4I,i needed something to read in the sunlight other than fire hd,5,2015-08-12 00:00:00+00:00,Lester,love it,2,True,Amazon,Amazon - Kindle Voyage - 4GB - Wi-Fi + 3G - Black,"Computers & Tablets,E-Readers & Accessories,eBook Readers,Kindle E-readers",Electronics,61,12
4,AVqkIh9HQMlgsOJE6fu_,B01AHBDCKQ,"Bought this mostly as a backup.and to read a few books, since I have a larger cell phone and a back up. Used it occasionally may 40 or 50 hours of use till this week when I sent my laptop in for repairs. so I probably used it a tot this week. I was watching my 3rd Primetime video this week and i...",1,2017-02-11 00:00:00+00:00,PatF,"less than 120 days, about 100 use. It is toast!",0,False,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi, 32 GB - Includes Special Offers, Blue","Fire Tablets,Tablets,All Tablets,Amazon Tablets,Computers & Tablets",Electronics,696,146
