# Step 5: Final Visualization and Model Comparison

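This notebook completes the sentiment analysis project: it reloads the saved TF-IDF vectorizer and the two trained classifiers (Logistic Regression and Naive Bayes), compares their accuracy on a held-out split, and then produces word clouds, word-frequency charts, and a ranking of the most influential Logistic Regression features.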

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from collections import Counter
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Notebook-wide configuration.
# Seed NumPy's legacy global RNG up front; this line does not interact with the
# plotting setup below, so its position is free.
np.random.seed(42)

# Keep this order: the matplotlib style is applied first and the seaborn palette
# second, so the palette is the last thing to touch the colour settings.
plt.style.use('default')
sns.set_palette("husl")

In [None]:
# Load the preprocessed reviews and the pickled vectorizer / classifiers.
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    # NOTE(review): pickle.load can run arbitrary code while deserialising, so
    # only point this at artifacts that this project itself wrote.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


df = pd.read_csv('../data/preprocessed_reviews.csv')

vectorizer = _load_pickle('../results/tfidf_vectorizer.pkl')
lr_model = _load_pickle('../results/logistic_regression_model.pkl')
nb_model = _load_pickle('../results/naive_bayes_model.pkl')

# Quick sanity read-out: frame dimensions and the number of features the
# vectorizer exposes.
vocabulary = vectorizer.get_feature_names_out()
print(f"Dataset shape: {df.shape}")
print(f"Vocabulary size: {len(vocabulary)}")

In [None]:
# Prepare data for evaluation
# Empty cleaned reviews come back from the CSV as NaN, which the vectorizer
# rejects as a document. Replace them with '' so they are scored as empty
# documents; filling (rather than dropping) keeps the row count, and therefore
# the split below, unchanged.
review_text = df['cleaned_review'].fillna('')
X = vectorizer.transform(review_text)
y = df['sentiment'].values

# NOTE(review): the classifiers were fitted elsewhere. The scores below are
# genuine held-out accuracies only if training used this exact split
# (test_size=0.2, random_state=42, stratify=y) on the same rows -- confirm
# against the training notebook, otherwise test rows may have been seen in training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Get predictions on the held-out portion only
lr_pred = lr_model.predict(X_test)
nb_pred = nb_model.predict(X_test)

# Calculate accuracies (fraction of test labels predicted exactly)
lr_acc = accuracy_score(y_test, lr_pred)
nb_acc = accuracy_score(y_test, nb_pred)

print(f"Logistic Regression Accuracy: {lr_acc:.4f}")
print(f"Naive Bayes Accuracy: {nb_acc:.4f}")
# The comparison is strict, so an exact tie is reported as Naive Bayes.
print(f"\nBest Model: {'Logistic Regression' if lr_acc > nb_acc else 'Naive Bayes'}")

In [None]:
# Model Comparison Visualization
# One bar per classifier, using the accuracies computed in the evaluation cell.
models = ['Logistic Regression', 'Naive Bayes']
accuracies = [lr_acc, nb_acc]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(models, accuracies, color=['blue', 'orange'], alpha=0.7)
ax.set(ylabel='Accuracy', title='Model Performance Comparison', ylim=(0, 1))

# Write each accuracy just above its bar, horizontally centred on the bar.
for rect, score in zip(bars, accuracies):
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.01
    ax.text(label_x, label_y, f'{score:.4f}',
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Generate Word Clouds
def _sentiment_text(label):
    """Join all non-missing cleaned reviews with sentiment ``label`` into one string."""
    reviews = df.loc[df['sentiment'] == label, 'cleaned_review']
    # Empty cleaned reviews are read back from the CSV as NaN (a float), and
    # str.join raises TypeError on non-string items, so drop them first.
    return ' '.join(reviews.dropna().astype(str))


def _render_word_cloud(ax, text, colormap, title):
    """Build a word cloud from ``text``, draw it on ``ax``, and return the WordCloud."""
    cloud = WordCloud(width=800, height=400, background_color='white',
                      colormap=colormap, max_words=100, random_state=42).generate(text)
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.axis('off')  # the image has no meaningful axes
    return cloud


# These joined strings are reused by the word-frequency cell further down.
positive_text = _sentiment_text(1)
negative_text = _sentiment_text(0)

fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Positive on the left, negative on the right
positive_wc = _render_word_cloud(axes[0], positive_text, 'Greens',
                                 'Positive Sentiment Word Cloud')
negative_wc = _render_word_cloud(axes[1], negative_text, 'Reds',
                                 'Negative Sentiment Word Cloud')

plt.tight_layout()
plt.show()

# Save word clouds
positive_wc.to_file('../results/positive_wordcloud.png')
negative_wc.to_file('../results/negative_wordcloud.png')
print("Word clouds saved to results/ directory")

In [None]:
# Most Frequent Words Analysis
# Counts are taken over whitespace-separated tokens of the joined review text
# built in the word-cloud cell.
pos_words = Counter(positive_text.split()).most_common(15)
neg_words = Counter(negative_text.split()).most_common(15)

fig, axes = plt.subplots(2, 1, figsize=(12, 10))

# One horizontal bar chart per sentiment: positive on top, negative below.
panels = (
    (axes[0], pos_words, 'green', 'Top 15 Most Frequent Words in Positive Reviews'),
    (axes[1], neg_words, 'red', 'Top 15 Most Frequent Words in Negative Reviews'),
)
for panel_ax, word_counts, bar_color, panel_title in panels:
    # Split the (word, count) pairs into parallel tuples for plotting.
    words, counts = zip(*word_counts)
    positions = range(len(words))
    panel_ax.barh(positions, counts, color=bar_color, alpha=0.7)
    panel_ax.set_yticks(positions)
    panel_ax.set_yticklabels(words)
    panel_ax.set_title(panel_title)
    # Flip the axis so the most frequent word sits at the top.
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

print("Top 5 Positive Words:", [word for word, _count in pos_words[:5]])
print("Top 5 Negative Words:", [word for word, _count in neg_words[:5]])

In [None]:
# Feature Importance from Logistic Regression
feature_names = vectorizer.get_feature_names_out()
# First (here: only used) row of the coefficient matrix.
# NOTE(review): "positive"/"negative" below refer to the sign of the coefficient;
# that lines up with sentiment 1 / 0 only if the model's classes are ordered
# [0, 1] -- confirm via lr_model.classes_.
coefficients = lr_model.coef_[0]

# Sort the coefficients once (ascending) and take both ends of the ordering.
coef_order = np.argsort(coefficients)
top_neg_idx = coef_order[:10]
top_pos_idx = coef_order[-10:]

# Largest coefficient first for the positive list; most negative first for the other.
top_pos_features = [(feature_names[i], coefficients[i]) for i in top_pos_idx[::-1]]
top_neg_features = [(feature_names[i], coefficients[i]) for i in top_neg_idx]

report_sections = (
    ("Top 10 Positive Features:", top_pos_features),
    ("\nTop 10 Negative Features:", top_neg_features),
)
for heading, ranked_features in report_sections:
    print(heading)
    for feature, coef in ranked_features:
        print(f"  {feature}: {coef:.4f}")

In [None]:
# Final Project Summary
# Plain-text recap built from values computed in earlier cells (df, lr_acc, nb_acc).
banner = "=" * 50

print(banner)
print("     SENTIMENT ANALYSIS PROJECT COMPLETED")
print(banner)

# Class sizes: number of rows whose sentiment equals 1 / 0.
n_positive = int((df['sentiment'] == 1).sum())
n_negative = int((df['sentiment'] == 0).sum())
print(f"\n📊 Dataset: {len(df)} reviews ({n_positive} positive, {n_negative} negative)")

print("\n🤖 Models Trained:")
print(f"   • Logistic Regression: {lr_acc:.4f} accuracy")
print(f"   • Naive Bayes: {nb_acc:.4f} accuracy")

# Strict comparison: an exact tie is reported as Naive Bayes.
if lr_acc > nb_acc:
    best_model = 'Logistic Regression'
else:
    best_model = 'Naive Bayes'
print(f"\n🏆 Best Model: {best_model}")

print("\n✅ Deliverables:")
deliverable_lines = (
    "   • Data preprocessing pipeline",
    "   • TF-IDF feature extraction",
    "   • Model training and evaluation",
    "   • Word clouds and visualizations",
    "   • Feature importance analysis",
)
for line in deliverable_lines:
    print(line)

print("\n🎉 PROJECT SUCCESSFULLY COMPLETED!")
print(banner)