# Amazon Reviews Data Exploration and NLTK Analysis

This notebook provides comprehensive exploratory data analysis (EDA) of the Amazon product reviews dataset and demonstrates NLTK-based sentiment analysis techniques.

## Objectives:
1. Understand dataset structure and characteristics
2. Explore rating distributions and patterns
3. Analyze text characteristics and quality
4. Implement NLTK-based sentiment analysis
5. Identify potential data quality issues
6. Generate insights for model training


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# NLTK imports
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from collections import Counter

# Download NLTK data.
# 'punkt_tab' is requested alongside 'punkt': recent NLTK releases load the
# tokenizer tables from 'punkt_tab', and word_tokenize() raises LookupError
# without it. Older releases still use 'punkt', so both are fetched.
for nltk_resource in ('vader_lexicon', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Add src to path (only once, so re-running this cell does not keep growing
# sys.path with duplicate entries)
import sys
if '../src' not in sys.path:
    sys.path.append('../src')

# Project-local helpers; resolvable only after '../src' is on sys.path
from data_processor import ReviewDataProcessor, get_data_stats

print("Libraries imported successfully!")

## 1. Data Loading and Basic Information

In [None]:
# Read the raw review export into the working DataFrame
df = pd.read_csv('../data/raw_reviews.csv')

# Report the frame's dimensions and column names, then let the preview of the
# first rows render as the cell's rich output
print(
    f"Dataset shape: {df.shape}",
    f"\nColumn names: {list(df.columns)}",
    f"\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Basic dataset information
print("Dataset Info:")
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() appends a stray "None" line.
df.info()

# Per-column count of missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Summary statistics of the numeric columns
print("\nBasic Statistics:")
print(df.describe())

In [None]:
# Get comprehensive dataset statistics
# get_data_stats comes from the project's data_processor module. Its result is
# iterated with .items() below, so it is presumably a dict-like mapping of
# statistic name -> value (TODO confirm against data_processor).
stats = get_data_stats(df)
print("Comprehensive Dataset Statistics:")
# One "name: value" line per statistic returned by the helper
for key, value in stats.items():
    print(f"{key}: {value}")

## 2. Rating Distribution Analysis

In [None]:
# Rating distribution: a 2x2 overview of review counts, rating shares,
# helpful votes and verified-purchase rate, each broken down by star rating
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Rating count distribution
rating_counts = df['rating'].value_counts().sort_index()
ax1.bar(rating_counts.index, rating_counts.values, color='skyblue', alpha=0.8)
ax1.set_title('Distribution of Star Ratings', fontsize=14, fontweight='bold')
ax1.set_xlabel('Star Rating')
ax1.set_ylabel('Number of Reviews')
ax1.grid(True, alpha=0.3)

# Rating percentage pie chart. autopct derives the percentages from the raw
# counts, so the counts computed above are reused instead of recomputed.
rating_pct = rating_counts
ax2.pie(rating_pct.values, labels=[f'{i} Star' for i in rating_pct.index],
        autopct='%1.1f%%', startangle=90)
ax2.set_title('Rating Distribution (Percentage)', fontsize=14, fontweight='bold')

# Helpful votes by rating
df.boxplot(column='helpful_votes', by='rating', ax=ax3)
ax3.set_title('Helpful Votes by Rating', fontsize=14, fontweight='bold')
ax3.set_xlabel('Star Rating')
ax3.set_ylabel('Helpful Votes')

# Verified purchase by rating (mean of the flag = share of verified reviews)
verified_by_rating = df.groupby('rating')['verified_purchase'].mean()
ax4.bar(verified_by_rating.index, verified_by_rating.values, color='lightgreen', alpha=0.8)
ax4.set_title('Verified Purchase Rate by Rating', fontsize=14, fontweight='bold')
ax4.set_xlabel('Star Rating')
ax4.set_ylabel('Verified Purchase Rate')
ax4.set_ylim(0, 1)
ax4.grid(True, alpha=0.3)

# DataFrame.boxplot(by=...) stamps a figure-level "Boxplot grouped by rating"
# suptitle on the whole figure; it describes only one of the four panels and
# collides with the top-row titles, so it is cleared.
fig.suptitle('')

plt.tight_layout()
plt.show()

## 3. Product and User Analysis

In [None]:
# Product analysis: review volume (left) and mean star rating (right) per product
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: how many reviews each product received.
# value_counts() orders products from most to least reviewed.
product_counts = df['product_name'].value_counts()
ax1.barh(range(len(product_counts)), product_counts.values)
ax1.set_yticks(range(len(product_counts)))
# Put every word of a product name on its own line so long names fit the axis
ax1.set_yticklabels(
    ['\n'.join(name.split(' ')) for name in product_counts.index],
    fontsize=10,
)
ax1.set_title('Number of Reviews per Product', fontsize=14, fontweight='bold')
ax1.set(xlabel='Number of Reviews')
ax1.grid(True, alpha=0.3)

# Right panel: mean rating per product, sorted from lowest to highest
avg_rating_by_product = (
    df.groupby('product_name')['rating']
    .mean()
    .sort_values(ascending=True)
)
ax2.barh(
    range(len(avg_rating_by_product)),
    avg_rating_by_product.values,
    color='orange',
    alpha=0.8,
)
ax2.set_yticks(range(len(avg_rating_by_product)))
ax2.set_yticklabels(
    ['\n'.join(name.split(' ')) for name in avg_rating_by_product.index],
    fontsize=10,
)
ax2.set_title('Average Rating per Product', fontsize=14, fontweight='bold')
ax2.set(xlabel='Average Rating', xlim=(1, 5))
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Headline figures. Because of the sort orders above, the first entry of
# product_counts is the most reviewed product, and the last / first entries of
# avg_rating_by_product are the highest / lowest rated ones.
print(
    "Product Statistics:",
    f"Total unique products: {df['product_id'].nunique()}",
    f"Most reviewed product: {product_counts.index[0]} ({product_counts.iloc[0]} reviews)",
    f"Highest rated product: {avg_rating_by_product.index[-1]} ({avg_rating_by_product.iloc[-1]:.2f} stars)",
    f"Lowest rated product: {avg_rating_by_product.index[0]} ({avg_rating_by_product.iloc[0]:.2f} stars)",
    sep="\n",
)

## 4. Text Analysis and Characteristics

In [None]:
# Text length analysis: add per-review character and whitespace-token counts.
# These two columns are reused by later cells (quality checks, insights).
df['text_length'] = df['review_text'].str.len()
df['word_count'] = df['review_text'].str.split().str.len()

# Means are computed once and shared by the marker lines, legends and printout
mean_text_length = df['text_length'].mean()
mean_word_count = df['word_count'].mean()

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Text length distribution
ax1.hist(df['text_length'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
ax1.set_title('Distribution of Review Text Length', fontsize=14, fontweight='bold')
ax1.set_xlabel('Text Length (characters)')
ax1.set_ylabel('Frequency')
ax1.axvline(mean_text_length, color='red', linestyle='--', label=f'Mean: {mean_text_length:.0f}')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Word count distribution
ax2.hist(df['word_count'], bins=50, alpha=0.7, color='lightgreen', edgecolor='black')
ax2.set_title('Distribution of Word Count', fontsize=14, fontweight='bold')
ax2.set_xlabel('Word Count')
ax2.set_ylabel('Frequency')
ax2.axvline(mean_word_count, color='red', linestyle='--', label=f'Mean: {mean_word_count:.0f}')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Text length by rating
df.boxplot(column='text_length', by='rating', ax=ax3)
ax3.set_title('Text Length by Rating', fontsize=14, fontweight='bold')
ax3.set_xlabel('Star Rating')
ax3.set_ylabel('Text Length (characters)')

# Word count by rating
df.boxplot(column='word_count', by='rating', ax=ax4)
ax4.set_title('Word Count by Rating', fontsize=14, fontweight='bold')
ax4.set_xlabel('Star Rating')
ax4.set_ylabel('Word Count')

# DataFrame.boxplot(by=...) stamps a figure-level "Boxplot grouped by rating"
# suptitle on the whole figure; it is misleading for the two histogram panels
# and overlaps their titles, so it is cleared.
fig.suptitle('')

plt.tight_layout()
plt.show()

# Text statistics
print("Text Statistics:")
print(f"Average text length: {mean_text_length:.0f} characters")
print(f"Average word count: {mean_word_count:.0f} words")
print(f"Shortest review: {df['text_length'].min()} characters")
print(f"Longest review: {df['text_length'].max()} characters")

## 5. NLTK Sentiment Analysis with VADER

In [None]:
# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Apply VADER sentiment analysis
print("Applying VADER sentiment analysis...")
# polarity_scores() only handles text. A missing review is a float NaN and
# makes VADER raise, so missing values are scored as empty strings instead;
# VADER returns 0.0 for every component of an empty text, which the labelling
# below maps to 'neutral'.
sentiment_scores = (
    df['review_text']
    .fillna('')
    .astype(str)
    .apply(sia.polarity_scores)
)

# Extract sentiment components (each score is a dict with the keys
# 'compound', 'pos', 'neg' and 'neu')
df['vader_compound'] = sentiment_scores.apply(lambda x: x['compound'])
df['vader_positive'] = sentiment_scores.apply(lambda x: x['pos'])
df['vader_negative'] = sentiment_scores.apply(lambda x: x['neg'])
df['vader_neutral'] = sentiment_scores.apply(lambda x: x['neu'])

# Create sentiment labels: compound >= 0.05 is positive, <= -0.05 is negative,
# anything in between is neutral
df['vader_sentiment'] = df['vader_compound'].apply(
    lambda x: 'positive' if x >= 0.05 else ('negative' if x <= -0.05 else 'neutral')
)

print("VADER sentiment analysis completed!")
print(f"\nVADER Sentiment Distribution:")
print(df['vader_sentiment'].value_counts())
print(f"\nPercentages:")
print(df['vader_sentiment'].value_counts(normalize=True) * 100)

In [None]:
# Visualize VADER sentiment analysis results on a 2x2 grid
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Top-left: histogram of compound scores, with the neutral point and the two
# classification thresholds marked as dashed vertical lines
ax1.hist(df['vader_compound'], bins=50, alpha=0.7, color='purple', edgecolor='black')
ax1.set_title('Distribution of VADER Compound Scores', fontsize=14, fontweight='bold')
ax1.set(xlabel='Compound Score', ylabel='Frequency')
for position, line_color, line_label in (
    (0, 'red', 'Neutral (0)'),
    (0.05, 'green', 'Positive threshold'),
    (-0.05, 'orange', 'Negative threshold'),
):
    ax1.axvline(position, color=line_color, linestyle='--', label=line_label)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Top-right: share of each VADER label within every star rating
# (rows are normalised per rating, then scaled to percent)
sentiment_rating_crosstab = pd.crosstab(
    df['rating'], df['vader_sentiment'], normalize='index'
).mul(100)
sentiment_rating_crosstab.plot.bar(ax=ax2, stacked=True)
ax2.set_title('VADER Sentiment Distribution by Star Rating', fontsize=14, fontweight='bold')
ax2.set(xlabel='Star Rating', ylabel='Percentage')
ax2.legend(title='VADER Sentiment')
ax2.tick_params(axis='x', rotation=0)
ax2.grid(True, alpha=0.3)

# Bottom-left: mean compound score for every star rating, with a zero baseline
avg_compound_by_rating = df.groupby('rating')['vader_compound'].mean()
ax3.bar(
    avg_compound_by_rating.index,
    avg_compound_by_rating.values,
    color='teal',
    alpha=0.8,
)
ax3.axhline(0, color='red', linestyle='--', alpha=0.5)
ax3.set_title('Average VADER Compound Score by Rating', fontsize=14, fontweight='bold')
ax3.set(xlabel='Star Rating', ylabel='Average Compound Score')
ax3.grid(True, alpha=0.3)

# Bottom-right: pairwise correlation of the positive / negative / neutral scores
component_columns = ['vader_positive', 'vader_negative', 'vader_neutral']
sentiment_components = df[component_columns].corr()
sns.heatmap(sentiment_components, annot=True, cmap='coolwarm', center=0, ax=ax4)
ax4.set_title('VADER Sentiment Components Correlation', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 6. Rating vs VADER Sentiment Correlation Analysis

In [None]:
# Correlation between star ratings and VADER sentiment
correlation = df['rating'].corr(df['vader_compound'])
print(f"Correlation between Star Rating and VADER Compound Score: {correlation:.3f}")

# Scatter plot of rating vs VADER compound score
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Scatter plot with trend line (degree-1 least-squares fit)
ax1.scatter(df['rating'], df['vader_compound'], alpha=0.6, s=20)
z = np.polyfit(df['rating'], df['vader_compound'], 1)
p = np.poly1d(z)
# Draw the fitted line once, between the lowest and highest rating. Plotting
# it through every row in DataFrame order makes the path double back on
# itself, which smears the dash pattern into a solid line.
rating_span = np.array([df['rating'].min(), df['rating'].max()])
ax1.plot(rating_span, p(rating_span), "r--", alpha=0.8, linewidth=2)
ax1.set_title(f'Star Rating vs VADER Compound Score\n(Correlation: {correlation:.3f})', fontsize=14, fontweight='bold')
ax1.set_xlabel('Star Rating')
ax1.set_ylabel('VADER Compound Score')
ax1.grid(True, alpha=0.3)

# Box plot of VADER compound scores by rating
df.boxplot(column='vader_compound', by='rating', ax=ax2)
ax2.set_title('VADER Compound Score Distribution by Rating', fontsize=14, fontweight='bold')
ax2.set_xlabel('Star Rating')
ax2.set_ylabel('VADER Compound Score')

# DataFrame.boxplot(by=...) stamps a figure-level "Boxplot grouped by rating"
# suptitle that also sits above the scatter panel; clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

# Identify potential mismatches
print("\nPotential Rating-Sentiment Mismatches:")

# High ratings with negative sentiment
high_rating_negative = df[(df['rating'] >= 4) & (df['vader_compound'] < -0.1)]
print(f"High ratings (4-5 stars) with negative sentiment: {len(high_rating_negative)} reviews")

# Low ratings with positive sentiment
low_rating_positive = df[(df['rating'] <= 2) & (df['vader_compound'] > 0.1)]
print(f"Low ratings (1-2 stars) with positive sentiment: {len(low_rating_positive)} reviews")

# Show up to three example reviews for each kind of mismatch (first 100
# characters of the text); a group without mismatches prints nothing.
mismatch_samples = [
    ("\nSample high-rated reviews with negative sentiment:", high_rating_negative),
    ("Sample low-rated reviews with positive sentiment:", low_rating_positive),
]
for header, mismatches in mismatch_samples:
    if len(mismatches) > 0:
        print(header)
        for i, (_, row) in enumerate(mismatches.head(3).iterrows(), start=1):
            print(f"{i}. Rating: {row['rating']}, VADER: {row['vader_compound']:.3f}")
            print(f"   Text: {row['review_text'][:100]}...\n")

## 7. Text Processing and Word Frequency Analysis

In [None]:
# Clean every review with the project's ReviewDataProcessor and keep the
# result in a new 'cleaned_text' column
processor = ReviewDataProcessor()

print("Cleaning text and analyzing word frequencies...")
df['cleaned_text'] = df['review_text'].apply(processor.clean_text)

# Pool the tokens of all cleaned reviews, keeping only purely alphabetic words
# longer than two characters that are not English stop words
stop_words = set(stopwords.words('english'))
all_words = [
    word
    for text in df['cleaned_text']
    for word in word_tokenize(text.lower())
    if word.isalpha() and word not in stop_words and len(word) > 2
]

# Frequency table over the pooled words and its 20 most frequent entries
word_freq = FreqDist(all_words)
most_common = word_freq.most_common(20)

print(
    f"\nTotal unique words: {len(word_freq)}",
    f"Total words (after filtering): {len(all_words)}",
    f"\nTop 20 most common words:",
    sep="\n",
)
for word, count in most_common:
    print(f"{word}: {count}")

In [None]:
# Visualize word frequencies
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# Most common words bar chart
words, counts = zip(*most_common)
ax1.barh(range(len(words)), counts)
ax1.set_yticks(range(len(words)))
ax1.set_yticklabels(words)
ax1.set_title('Top 20 Most Common Words', fontsize=14, fontweight='bold')
ax1.set_xlabel('Frequency')
ax1.grid(True, alpha=0.3)

# Word frequency distribution (Zipf's law): frequency against rank for up to
# the 1000 most frequent words. The ranking is computed once; calling
# most_common(1000) inside the comprehension would re-sort the whole
# vocabulary for every single rank.
top_ranked = word_freq.most_common(1000)
frequencies = [frequency for _, frequency in top_ranked]
ranks = range(1, len(frequencies) + 1)

ax2.loglog(ranks, frequencies, 'b-', alpha=0.8)
ax2.set_title('Word Frequency Distribution (Log-Log Scale)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Rank')
ax2.set_ylabel('Frequency')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Word Frequency by Sentiment

In [None]:
# Analyze word frequencies by sentiment
def get_words_by_sentiment(df, sentiment, top_n=15):
    """Return the ``top_n`` most frequent words among reviews with one VADER label.

    Args:
        df: DataFrame with a 'vader_sentiment' label column and a
            'cleaned_text' column.
        sentiment: Label to select; rows whose 'vader_sentiment' equals this
            value are used.
        top_n: Number of (word, count) pairs to return.

    Returns:
        A list of (word, count) tuples, most frequent first, as produced by
        ``FreqDist.most_common``.

    Note:
        Reads the notebook-level ``stop_words`` set, so the cell that defines
        it must have been run first.
    """
    has_sentiment = df['vader_sentiment'] == sentiment
    # Keep lower-cased, purely alphabetic tokens longer than two characters
    # that are not stop words
    kept_tokens = [
        token
        for text in df.loc[has_sentiment, 'cleaned_text']
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words and len(token) > 2
    ]
    return FreqDist(kept_tokens).most_common(top_n)

# Get word frequencies by sentiment
positive_words = get_words_by_sentiment(df, 'positive')
negative_words = get_words_by_sentiment(df, 'negative')
neutral_words = get_words_by_sentiment(df, 'neutral')

# Visualize word frequencies by sentiment: one horizontal bar chart per label
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 8))

sentiment_panels = [
    (ax1, positive_words, 'Most Common Words in Positive Reviews', 'green'),
    (ax2, negative_words, 'Most Common Words in Negative Reviews', 'red'),
    (ax3, neutral_words, 'Most Common Words in Neutral Reviews', 'gray'),
]
for panel_ax, word_counts, panel_title, bar_color in sentiment_panels:
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Frequency')
    panel_ax.grid(True, alpha=0.3)
    if not word_counts:
        # A label with no reviews yields an empty list, and zip(*[]) cannot be
        # unpacked into (words, counts); leave a note instead of crashing.
        panel_ax.text(0.5, 0.5, 'No reviews with this sentiment',
                      ha='center', va='center', transform=panel_ax.transAxes)
        continue
    panel_words, panel_counts = zip(*word_counts)
    panel_ax.barh(range(len(panel_words)), panel_counts, color=bar_color, alpha=0.7)
    panel_ax.set_yticks(range(len(panel_words)))
    panel_ax.set_yticklabels(panel_words)

plt.tight_layout()
plt.show()

# Five most frequent words per label (an empty label prints an empty list)
print("Word Analysis by Sentiment:")
print(f"\nPositive reviews - Top words: {', '.join([word for word, _ in positive_words[:5]])}")
print(f"Negative reviews - Top words: {', '.join([word for word, _ in negative_words[:5]])}")
print(f"Neutral reviews - Top words: {', '.join([word for word, _ in neutral_words[:5]])}")

## 9. Data Quality Assessment

In [None]:
# Data quality checks
print("Data Quality Assessment:")
print("=" * 50)

# Check for duplicates
duplicate_reviews = df.duplicated(subset=['review_text']).sum()
print(f"Duplicate reviews (by text): {duplicate_reviews}")

# Check for very short reviews
very_short = (df['word_count'] < 5).sum()
print(f"Very short reviews (<5 words): {very_short}")

# Check for very long reviews
very_long = (df['word_count'] > 200).sum()
print(f"Very long reviews (>200 words): {very_long}")

# Check for missing or empty text
empty_text = df['review_text'].isna().sum() + (df['review_text'] == '').sum()
print(f"Empty review texts: {empty_text}")

# Check rating distribution balance
rating_balance = df['rating'].value_counts(normalize=True)
print(f"\nRating distribution balance:")
for rating, pct in rating_balance.sort_index().items():
    print(f"  {rating} stars: {pct:.1%}")


def _uppercase_ratio(text):
    """Return the share of characters in ``text`` that are uppercase.

    Missing (non-string) or empty text yields 0.0, so the NaN reviews counted
    above do not crash the check with len() on a float.
    """
    if not isinstance(text, str) or not text:
        return 0.0
    return sum(1 for c in text if c.isupper()) / len(text)


# Check for potential spam indicators
print(f"\nPotential quality issues:")
excessive_exclamations = df['review_text'].str.count('!').gt(5).sum()
excessive_capitals = df['review_text'].apply(_uppercase_ratio).gt(0.3).sum()
print(f"Reviews with excessive exclamation marks: {excessive_exclamations}")
print(f"Reviews with excessive capital letters: {excessive_capitals}")

# Sentiment-rating alignment check. A review is aligned when 4-5 stars meet a
# 'positive' label, 1-2 stars meet 'negative', or 3 stars meet 'neutral'.
# Computed as one boolean mask instead of a Python loop over iterrows().
print(f"\nSentiment-Rating Alignment:")
is_aligned = (
    ((df['rating'] >= 4) & (df['vader_sentiment'] == 'positive'))
    | ((df['rating'] <= 2) & (df['vader_sentiment'] == 'negative'))
    | ((df['rating'] == 3) & (df['vader_sentiment'] == 'neutral'))
)
aligned = int(is_aligned.sum())
misaligned = len(df) - aligned

# Guard the percentage against an empty DataFrame (division by zero)
alignment_rate = aligned / len(df) * 100 if len(df) else 0.0
print(f"Sentiment-rating alignment rate: {alignment_rate:.1f}%")
print(f"Aligned reviews: {aligned}")
print(f"Misaligned reviews: {misaligned}")

## 10. Insights and Recommendations for Model Training

In [None]:
# Generate insights and recommendations.
# Relies on values produced by earlier cells: word_freq, alignment_rate,
# duplicate_reviews, very_short and empty_text.
print("KEY INSIGHTS AND RECOMMENDATIONS FOR MODEL TRAINING")
print("=" * 60)

# Share of 4-5 star reviews, computed from the data so the summary stays
# correct when the dataset changes (it used to be a hard-coded 60.6%).
high_rating_share = (df['rating'] >= 4).mean()
skew_note = "Skewed towards positive" if high_rating_share > 0.5 else "Not skewed towards positive"

print("\n1. DATASET CHARACTERISTICS:")
print(f"   • Total reviews: {len(df):,}")
print(f"   • Average review length: {df['word_count'].mean():.0f} words")
print(f"   • Vocabulary richness: {len(word_freq):,} unique words")
print(f"   • Rating distribution: {skew_note} ({high_rating_share:.1%} are 4-5 stars)")

print("\n2. SENTIMENT ANALYSIS INSIGHTS:")
print(f"   • VADER positive sentiment: {(df['vader_sentiment'] == 'positive').mean():.1%}")
print(f"   • VADER-rating correlation: {df['rating'].corr(df['vader_compound']):.3f}")
print(f"   • Sentiment-rating alignment: {alignment_rate:.1f}%")
print(f"   • Misaligned reviews provide valuable training signal")

print("\n3. DATA QUALITY:")
print(f"   • {duplicate_reviews} duplicate reviews - minimal impact")
print(f"   • {very_short} very short reviews - consider filtering")
print(f"   • {empty_text} empty reviews - needs handling")
print(f"   • Text preprocessing will improve model performance")

# NOTE(review): the vocabulary size and the "~95% of reviews" coverage figure
# below are fixed text, not derived from df - confirm against
# df['word_count'] before relying on them.
print("\n4. RECOMMENDATIONS FOR LSTM MODEL:")
print("   • Vocabulary size: Use top 10,000 words to balance coverage and complexity")
print("   • Sequence length: 200 tokens should capture ~95% of reviews")
print("   • Class imbalance: Consider stratified sampling and class weights")
print("   • Use bidirectional LSTM to capture context from both directions")
print("   • Apply dropout for regularization due to moderate dataset size")

print("\n5. TRAINING STRATEGY:")
print("   • Remove 3-star reviews for clearer binary classification")
print("   • Use 80-20 train-test split with stratification")
print("   • Monitor both LSTM and VADER performance for comparison")
print("   • Focus on misaligned cases for business insights")

print("\n6. EVALUATION METRICS:")
print("   • Primary: Accuracy, Precision, Recall, F1-score")
print("   • Secondary: Correlation with VADER scores")
print("   • Business: Misclassification analysis for actionable insights")

print("\n" + "=" * 60)
print("Ready for LSTM model training!")
print("Proceed to notebook 02_lstm_training.ipynb")

In [None]:
# Persist the enriched DataFrame (all columns added in this notebook, without
# the index) as a CSV for the model-training notebook
output_file = '../data/processed_reviews_with_vader.csv'
df.to_csv(path_or_buf=output_file, index=False)

# Confirm where the file went and what it contains
print(
    f"Processed dataset saved to: {output_file}",
    f"Dataset includes VADER sentiment scores and cleaned text for model training.",
    sep="\n",
)