# 📊 SentiBR - Análise Exploratória de Dados (EDA)

Este notebook realiza uma análise exploratória completa do dataset de reviews.

## Objetivos:
1. Entender a distribuição dos dados
2. Analisar padrões de sentimento
3. Identificar características textuais
4. Avaliar balanceamento das classes
5. Definir estrat√©gia de preprocessamento

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import re
import emoji
from pathlib import Path

# Plotting and display configuration for the rest of the notebook
plt.style.use('seaborn-v0_8-darkgrid')  # matplotlib style sheet used by later figures
sns.set_palette("husl")  # default seaborn colour palette
pd.set_option('display.max_columns', None)  # None lifts the limit on displayed columns
pd.set_option('display.max_colwidth', 100)  # cap displayed cell text at 100 characters

%matplotlib inline

## 1. Carregar Dados

In [None]:
# Load the processed reviews dataset from disk
dataset_path = '../../data/processed/processed_reviews.csv'
df = pd.read_csv(dataset_path)

# Report the size and column names, then preview the first rows
n_reviews = len(df)
column_names = df.columns.tolist()
print(f"üìä Dataset carregado: {n_reviews} reviews")
print(f"üìù Colunas: {column_names}")
df.head()

In [None]:
# General information: print pandas' concise summary of the DataFrame
df.info()

## 2. Análise de Distribuição de Sentimentos

In [None]:
# Count the reviews in each sentiment class
sentiment_counts = df['sentiment'].value_counts()

# One fixed colour per class label
sentiment_palette = {
    'positivo': '#2ecc71',
    'neutro': '#f39c12',
    'negativo': '#e74c3c',
}
axis_labels = {'x': 'Sentimento', 'y': 'N√∫mero de Reviews'}

fig = px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    color=sentiment_counts.index,
    color_discrete_map=sentiment_palette,
    labels=axis_labels,
    title='Distribui√ß√£o de Sentimentos',
)
# Hide the legend and fix the figure height
fig.update_layout(showlegend=False, height=400)
fig.show()

# Absolute counts followed by each class's share of all rows (in percent)
print("\nüìä Estat√≠sticas:")
print(sentiment_counts)
print("\nüìà Percentuais:")
percentages = sentiment_counts / len(df) * 100
print(percentages)

In [None]:
# Rating distribution, drawn only when the dataset has a rating column
has_ratings = 'overall_rating' in df.columns
if has_ratings:
    hist_options = dict(
        x='overall_rating',
        title='Distribui√ß√£o de Ratings (1-5 estrelas)',
        labels={'overall_rating': 'Rating'},
        nbins=5,
        color='sentiment',
    )
    fig = px.histogram(df, **hist_options)
    fig.show()

    # Central tendency of the ratings
    ratings = df['overall_rating']
    print(f"‚≠ê Rating m√©dio: {ratings.mean():.2f}")
    print(f"‚≠ê Rating mediano: {ratings.median():.2f}")

## 3. Análise Textual

In [None]:
# Derive review length in characters and in whitespace-separated words
reviews = df['review_text']
df['text_length'] = reviews.str.len()
df['word_count'] = reviews.str.split().str.len()

# Descriptive statistics of both measures, grouped by sentiment
by_sentiment = df.groupby('sentiment')

print("üìè Comprimento m√©dio das reviews (caracteres):")
print(by_sentiment['text_length'].describe())

print("\nüìù N√∫mero m√©dio de palavras:")
print(by_sentiment['word_count'].describe())

In [None]:
# Box plot of the word count for each sentiment class
box_labels = {'word_count': 'N√∫mero de Palavras', 'sentiment': 'Sentimento'}
fig = px.box(
    df,
    x='sentiment',
    y='word_count',
    color='sentiment',
    labels=box_labels,
    title='Distribui√ß√£o do N√∫mero de Palavras por Sentimento',
)
fig.show()

## 4. WordClouds por Sentimento

In [None]:
# Word clouds, one panel per sentiment class
from wordcloud import STOPWORDS

# Extend the library's default stopword set with frequent Portuguese filler
# words and domain terms so they are filtered out of every cloud.
stopwords_pt = set(STOPWORDS)
stopwords_pt.update([
    'produto', 'comprei', 'compra', 'chegou', 'veio', 'muito', 'bem',
    'pra', 'tá', 'ta', 'é', 'e', 'o', 'a', 'de', 'da', 'do', 'que',
    'em', 'um', 'uma', 'os', 'as', 'para', 'com', 'por', 'mais'
])

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
sentiments = ['positivo', 'neutro', 'negativo']
colors = ['Greens', 'Oranges', 'Reds']

for ax, sentiment, color in zip(axes, sentiments, colors):
    # Drop missing texts first so the literal word "nan" never enters the cloud
    class_texts = df.loc[df['sentiment'] == sentiment, 'review_text'].dropna().astype(str)
    text = ' '.join(class_texts)

    try:
        wordcloud = WordCloud(
            width=600,
            height=400,
            background_color='white',
            stopwords=stopwords_pt,
            colormap=color,
            max_words=100
        ).generate(text)
    except ValueError:
        # generate() raises ValueError when there are no words to plot
        # (class absent from the data, or nothing left after stopword
        # filtering); show a placeholder instead of aborting the figure.
        ax.text(0.5, 0.5, 'Sem dados', ha='center', va='center',
                fontsize=14, transform=ax.transAxes)
    else:
        ax.imshow(wordcloud, interpolation='bilinear')

    ax.set_title(f'Reviews {sentiment.upper()}', fontsize=16, fontweight='bold')
    ax.axis('off')

plt.tight_layout()
plt.show()

## 5. Análise de Aspectos

In [None]:
# Aspect coverage: how many reviews mention each aspect.
# Aspect columns are recognised by one of these name prefixes.
ASPECT_PREFIXES = ('has_', 'aspect_')


def _aspect_label(column):
    """Return *column* without its leading aspect prefix, if it has one.

    Only a prefix at the very start of the name is removed; the same
    substring occurring later in the name is left intact.
    """
    for prefix in ASPECT_PREFIXES:
        if column.startswith(prefix):
            return column[len(prefix):]
    return column


# NOTE(review): a later cell adds df['has_emoji']; re-running this cell after
# that one will pick it up as an aspect column as well.
aspect_cols = [col for col in df.columns if col.startswith(ASPECT_PREFIXES)]

if aspect_cols:
    print("üìã Aspectos identificados nas reviews:")

    aspect_stats = {}
    for col in aspect_cols:
        if pd.api.types.is_bool_dtype(df[col]):
            # Boolean flags (NumPy bool or pandas nullable boolean):
            # a mention is a True value.
            count = int(df[col].sum())
        else:
            # Any other dtype: a mention is a non-missing value.
            count = int(df[col].notna().sum())

        aspect_stats[_aspect_label(col)] = count

    # Bar chart of mentions per aspect
    fig = px.bar(
        x=list(aspect_stats.keys()),
        y=list(aspect_stats.values()),
        title='Frequ√™ncia de Men√ß√£o de Aspectos',
        labels={'x': 'Aspecto', 'y': 'N√∫mero de Men√ß√µes'}
    )
    fig.show()

    # Same numbers as a table, with each aspect's share of all reviews
    aspect_df = pd.DataFrame({
        'Aspecto': list(aspect_stats.keys()),
        'Men√ß√µes': list(aspect_stats.values()),
        'Percentual': [v/len(df)*100 for v in aspect_stats.values()]
    })
    print(aspect_df)

## 6. Análise de Emojis e Caracteres Especiais

In [None]:
# Fun√ß√£o para extrair emojis
def extract_emojis(text):
    """Return the emojis found in *text*, in order of appearance.

    Each emoji is returned as one string, so sequences made of several
    code points (skin-tone modifiers, ZWJ sequences, flags, keycaps) count
    as a single emoji instead of being split or missed by a
    character-by-character scan.

    Args:
        text: The string to scan. Any non-string value yields an empty list.

    Returns:
        A list of emoji strings; empty when *text* contains none.
    """
    if not isinstance(text, str):
        return []
    return [match['emoji'] for match in emoji.emoji_list(text)]

# Count emojis once per review, then derive the boolean flag from that count
# (avoids running the emoji extraction twice over every review).
emoji_counts = df['review_text'].apply(lambda x: len(extract_emojis(str(x))))
df['has_emoji'] = emoji_counts > 0
df['emoji_count'] = emoji_counts

reviews_with_emoji = df['has_emoji'].sum()
print(f"üòä Reviews com emojis: {reviews_with_emoji} ({reviews_with_emoji/len(df)*100:.1f}%)")
print(f"üòä Total de emojis: {df['emoji_count'].sum()}")

# Total number of emojis per sentiment class
emoji_by_sentiment = df.groupby('sentiment')['emoji_count'].sum()
print("\nüòä Emojis por sentimento:")
print(emoji_by_sentiment)

## 7. Exemplos de Reviews

In [None]:
# Show a few randomly sampled reviews for each sentiment
print("\n" + "="*80)
print("üìù EXEMPLOS DE REVIEWS POR SENTIMENTO")
print("="*80)

MAX_SAMPLES = 3      # reviews shown per sentiment
PREVIEW_CHARS = 200  # characters of review text shown per review

for sentiment in ['positivo', 'neutro', 'negativo']:
    print(f"\nüéØ SENTIMENTO: {sentiment.upper()}")
    print("-" * 80)

    # Filter once; the sample size is capped at the class size because
    # sample() cannot draw more rows than exist without replacement.
    subset = df[df['sentiment'] == sentiment]
    samples = subset.sample(min(MAX_SAMPLES, len(subset)))

    for idx, row in samples.iterrows():
        print(f"\nüìå Review {idx+1}:")
        if 'review_title' in row and pd.notna(row['review_title']):
            print(f"   T√≠tulo: {row['review_title']}")
        if 'overall_rating' in row and pd.notna(row['overall_rating']):
            print(f"   Rating: {'‚≠ê' * int(row['overall_rating'])} ({row['overall_rating']})")
        # A missing text is read as a float NaN, which cannot be sliced;
        # fall back to an empty preview instead of raising TypeError.
        raw_text = row['review_text']
        preview = raw_text[:PREVIEW_CHARS] if isinstance(raw_text, str) else ''
        print(f"   Texto: {preview}...")
        print()

## 8. Balanceamento de Classes

In [None]:
# Class-balance analysis
class_distribution = df['sentiment'].value_counts()
max_count = class_distribution.max()
min_count = class_distribution.min()
# Size of the largest class relative to the smallest
imbalance_ratio = max_count / min_count

print("‚öñÔ∏è An√°lise de Balanceamento de Classes:")
majority_label = class_distribution.idxmax()
print(f"\n   Classe majorit√°ria: {majority_label} ({max_count} reviews)")
minority_label = class_distribution.idxmin()
print(f"   Classe minorit√°ria: {minority_label} ({min_count} reviews)")
print(f"   Raz√£o de desbalanceamento: {imbalance_ratio:.2f}x")

# Choose the verdict lines for the observed ratio, then print them in order
if imbalance_ratio > 3:
    verdict_lines = [
        "\n   ‚ö†Ô∏è ALERTA: Dataset significativamente desbalanceado!",
        "   üìå Recomenda√ß√µes:",
        "      - Considerar t√©cnicas de balanceamento (SMOTE, undersampling, etc)",
        "      - Usar class_weights no treinamento",
        "      - Avaliar com m√©tricas al√©m de accuracy (F1-score, precision, recall)",
    ]
elif imbalance_ratio > 1.5:
    verdict_lines = [
        "\n   ‚ö†Ô∏è Dataset moderadamente desbalanceado",
        "   üìå Considerar usar class_weights no treinamento",
    ]
else:
    verdict_lines = ["\n   ‚úÖ Dataset razoavelmente balanceado"]

for verdict_line in verdict_lines:
    print(verdict_line)

## 9. Conclusões e Próximos Passos

In [None]:
# Final summary of the exploratory analysis
banner = "=" * 80
total_reviews = len(df)

print("\n" + banner)
print("üìä RESUMO DA AN√ÅLISE EXPLORAT√ìRIA")
print(banner)

# 1) Overall numbers
print("\n1Ô∏è‚É£ DADOS GERAIS:")
print(f"   - Total de reviews: {total_reviews:,}")
avg_words = df['word_count'].mean()
print(f"   - Comprimento m√©dio: {avg_words:.0f} palavras")
emoji_share = df['has_emoji'].sum() / total_reviews * 100
print(f"   - Reviews com emojis: {emoji_share:.1f}%")

# 2) Count and share of every sentiment class
print("\n2Ô∏è‚É£ DISTRIBUI√á√ÉO DE SENTIMENTOS:")
for sentiment, count in class_distribution.items():
    share = count / total_reviews * 100
    print(f"   - {sentiment.capitalize()}: {count:,} ({share:.1f}%)")

# 3) Data-quality indicators
print("\n3Ô∏è‚É£ QUALIDADE DOS DADOS:")
missing_total = df.isnull().sum().sum()
print(f"   - Missing values: {missing_total}")
duplicate_rows = df.duplicated().sum()
print(f"   - Duplicatas: {duplicate_rows}")

# 4) Planned follow-up work
print("\n4Ô∏è‚É£ PR√ìXIMOS PASSOS:")
next_steps = [
    "   ‚úÖ Preprocessamento de texto (limpeza, normaliza√ß√£o)",
    "   ‚úÖ Tokeniza√ß√£o com BERT tokenizer",
    "   ‚úÖ Split train/val/test",
    "   ‚úÖ Balanceamento de classes (se necess√°rio)",
    "   ‚úÖ Fine-tuning do modelo BERT",
]
for step in next_steps:
    print(step)

print("\n" + banner)

In [None]:
# Persist the headline statistics as a JSON file
import json

n_rows = len(df)
word_counts = df['word_count']
stats = {
    'total_reviews': n_rows,
    'sentiment_distribution': class_distribution.to_dict(),
    'avg_length': word_counts.mean(),
    'median_length': word_counts.median(),
    'has_emoji_pct': df['has_emoji'].sum() / n_rows * 100,
    'imbalance_ratio': imbalance_ratio,
}

# ensure_ascii=False keeps non-ASCII characters readable in the output file
stats_path = Path('../../data/processed/dataset_statistics.json')
with stats_path.open('w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2, ensure_ascii=False)

print("üìÅ Estat√≠sticas salvas em: data/processed/dataset_statistics.json")