In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')

# Download necessary NLTK resources
# Each resource is fetched independently so one failure does not skip the
# others. nltk.download() reports most failures by returning False rather than
# raising, so the return value is checked as well. Only Exception is caught:
# a bare `except:` would also swallow KeyboardInterrupt / SystemExit.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    try:
        nltk_download_ok = nltk.download(nltk_resource, quiet=True)
    except Exception as nltk_error:
        nltk_download_ok = False
        print(f"NLTK download of '{nltk_resource}' raised {nltk_error!r}")
    if not nltk_download_ok:
        # Best effort: the resource may already be installed locally.
        print(f"NLTK download failed for '{nltk_resource}', but continuing...")

# Read the laptop review CSV into a DataFrame
print("Loading the dataset...")
csv_path = 'Downloads/laptops_dataset_final_600.csv'
df = pd.read_csv(csv_path)

# Structural overview: dimensions, column labels and a peek at the first rows
print("\nDataset shape:", df.shape)
print("\nColumn names:", list(df.columns))
print("\nFirst few rows:", df.head(), sep="\n")

# Count of null cells in every column
print("\nMissing values per column:", df.isna().sum(), sep="\n")

# Handle missing values
# Rows without review text cannot be vectorised, and rows without a rating
# cannot be labelled: NaN > 3 evaluates to False, so they would silently be
# tagged 'negative'. Both kinds of rows are dropped. `.copy()` gives an
# independent frame so the column assignments below never target a derived
# view of the raw data.
df = df.dropna(subset=['review', 'rating']).copy()
df['title'] = df['title'].fillna('')  # Fill missing titles with empty string

# Create a sentiment column based on rating
# Ratings > 3 are positive, <= 3 are negative
print("\nCreating sentiment target based on ratings...")
df['sentiment'] = df['rating'].gt(3).map({True: 'positive', False: 'negative'})

# Check class distribution
print("\nSentiment class distribution:")
print(df['sentiment'].value_counts())
print(f"Percentage of positive reviews: {(df['sentiment'] == 'positive').mean() * 100:.2f}%")

# Text preprocessing function
# Lazily-built NLTK helpers shared by every preprocess_text() call, so the
# stop-word list and lemmatizer are created once instead of once per row.
_TEXT_TOOL_CACHE = {}


def _default_stop_words():
    """Return the cached set of NLTK English stop words (built on first use)."""
    if 'stop_words' not in _TEXT_TOOL_CACHE:
        _TEXT_TOOL_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_TOOL_CACHE['stop_words']


def _default_lemmatizer():
    """Return the cached WordNetLemmatizer instance (built on first use)."""
    if 'lemmatizer' not in _TEXT_TOOL_CACHE:
        _TEXT_TOOL_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOL_CACHE['lemmatizer']


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Normalise a raw review/title string into space-separated lemmas.

    Steps: lowercase, strip HTML tags, drop every character that is not an
    ASCII letter or whitespace, collapse whitespace, remove stop words and
    lemmatize the remaining tokens.

    Parameters
    ----------
    text : object
        Raw text. Anything that is not a ``str`` (e.g. NaN) yields ``''``.
    stop_words : container of str, optional
        Tokens to discard. Defaults to the NLTK English stop-word list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a shared ``WordNetLemmatizer``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = _default_stop_words()
    if lemmatizer is None:
        lemmatizer = _default_lemmatizer()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags. They are replaced with a space (not '') so that words
    # separated only by a tag, e.g. "great<br>battery", are not fused together.
    text = re.sub(r'<.*?>', ' ', text)

    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords and lemmatize
    filtered_words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return ' '.join(filtered_words)

# Clean the free-text columns (review first, then title)
print("\nPreprocessing text data...")
for raw_col, clean_col in (('review', 'processed_review'), ('title', 'processed_title')):
    df[clean_col] = df[raw_col].apply(preprocess_text)

# Title and review are merged into one document per row: "<title> <review>"
df['processed_text'] = df['processed_title'].str.cat(df['processed_review'], sep=" ")

# Integer-encode the sentiment labels for the classifiers
label_encoder = LabelEncoder().fit(df['sentiment'])
y = label_encoder.transform(df['sentiment'])

# The model input is the combined, cleaned text
X = df['processed_text']

# Show one cleaned document as a sanity check
print("\nExample of processed text:")
print(X.iloc[0])

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Create a pipeline with TF-IDF vectorization and Support Vector Machine classifier
print("\nBuilding and training SVM model with hyperparameter tuning...")
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', SVC(random_state=42))
])

# Define hyperparameter grid for SVM
# `gamma` has no effect on the linear kernel, so crossing it with
# kernel='linear' only repeats identical fits. The grid is therefore split per
# kernel: 12 linear + 24 rbf candidates instead of 48, with the same set of
# distinct models being searched.
svm_shared_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_features': [3000, 5000],
    'classifier__C': [0.1, 1, 10],
}
svm_param_grid = [
    {**svm_shared_grid, 'classifier__kernel': ['linear']},
    {
        **svm_shared_grid,
        'classifier__kernel': ['rbf'],
        'classifier__gamma': ['scale', 'auto'],
    },
]

# Perform grid search for SVM (3-fold CV, accuracy, all cores)
svm_grid_search = GridSearchCV(
    svm_pipeline,
    svm_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

svm_grid_search.fit(X_train, y_train)

# Get best SVM model (refit on the full training split by GridSearchCV)
svm_best = svm_grid_search.best_estimator_
print("\nBest SVM parameters:")
print(svm_grid_search.best_params_)

# Evaluate SVM on test set
svm_y_pred = svm_best.predict(X_test)
print("\nSVM Test Accuracy:", accuracy_score(y_test, svm_y_pred))
print("\nSVM Classification Report:")
print(classification_report(y_test, svm_y_pred, target_names=label_encoder.classes_))

# Confusion Matrix for SVM, saved to disk rather than shown inline
svm_cm = confusion_matrix(y_test, svm_y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(svm_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title('SVM Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('svm_confusion_matrix.png')
plt.close()

# TF-IDF features feeding a Random Forest classifier
print("\nBuilding and training Random Forest model with hyperparameter tuning...")
rf_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Search space: vectorizer settings crossed with forest size/depth/split rules
rf_param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__max_features=[3000, 5000],
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

# Exhaustive 3-fold search scored on accuracy, using every available core
rf_grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
rf_grid_search.fit(X_train, y_train)

# Winning configuration and the pipeline fitted with it
rf_best = rf_grid_search.best_estimator_
print("\nBest Random Forest parameters:")
print(rf_grid_search.best_params_)

# Held-out performance
rf_y_pred = rf_best.predict(X_test)
print("\nRandom Forest Test Accuracy:", accuracy_score(y_test, rf_y_pred))
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_y_pred, target_names=label_encoder.classes_))

# Confusion matrix heatmap written to rf_confusion_matrix.png
rf_cm = confusion_matrix(y_test, rf_y_pred)
rf_fig, rf_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    rf_cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=rf_ax,
)
rf_ax.set_title('Random Forest Confusion Matrix')
rf_ax.set_xlabel('Predicted')
rf_ax.set_ylabel('Actual')
rf_fig.savefig('rf_confusion_matrix.png')
plt.close(rf_fig)

# Side-by-side test accuracy of the two tuned models
models = ['SVM', 'Random Forest']
accuracies = [accuracy_score(y_test, preds) for preds in (svm_y_pred, rf_y_pred)]

cmp_fig, cmp_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=models, y=accuracies, ax=cmp_ax)
cmp_ax.set_title('Model Comparison - Test Accuracy')
cmp_ax.set_ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
cmp_ax.grid(axis='y', linestyle='--', alpha=0.7)
cmp_fig.savefig('model_comparison.png')
plt.close(cmp_fig)

# Examine misclassified examples (Random Forest predictions only)
print("\nExamining misclassified examples...")
# Positions within the test split where the prediction disagrees with the label
misclassified_indices = np.flatnonzero(rf_y_pred != y_test)

if len(misclassified_indices) > 0:
    # Sample up to 5 misclassified examples. A seeded generator keeps the
    # printed sample identical across runs, matching the random_state=42 used
    # for the split and the models.
    sample_size = min(5, len(misclassified_indices))
    sample_rng = np.random.default_rng(42)
    sample_positions = sample_rng.choice(misclassified_indices, size=sample_size, replace=False)

    print(f"\nSample of {sample_size} misclassified examples:")
    for position in sample_positions:
        # Sampling positions (not labels) lets the same number address both
        # the prediction array and, via X_test.index, the original row in df.
        idx = X_test.index[position]
        predicted_label = label_encoder.inverse_transform([rf_y_pred[position]])[0]

        print(f"\nProduct: {df.loc[idx, 'product_name']}")
        print(f"Original Rating: {df.loc[idx, 'rating']}")
        print(f"Original Review: {str(df.loc[idx, 'review'])[:100]}...")
        print(f"Actual Sentiment: {df.loc[idx, 'sentiment']}")
        print(f"Predicted Sentiment: {predicted_label}")
        print("-" * 50)

# Feature importance analysis for Random Forest
rf_classifier = rf_best.named_steps['classifier']
if hasattr(rf_classifier, 'feature_importances_'):
    # Get feature names from the vectorizer (same column order as the
    # matrix the classifier was trained on)
    tfidf_vectorizer = rf_best.named_steps['tfidf']
    feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

    # Get feature importances
    importances = rf_classifier.feature_importances_

    # Sort feature importances in descending order
    indices = np.argsort(importances)[::-1]

    # Print and plot the top features. The count is clamped to the vocabulary
    # size so the bar positions and heights always have matching lengths.
    top_n = min(20, len(feature_names))
    top_features = [(feature_names[i], importances[i]) for i in indices[:top_n]]

    print(f"\nTop {top_n} most important words for sentiment prediction:")
    for i, (feature, importance) in enumerate(top_features, 1):
        print(f"{i}. {feature}: {importance:.4f}")

    # Plot feature importances
    plt.figure(figsize=(12, 8))
    plt.title(f"Top {top_n} Feature Importances (Random Forest)")
    plt.bar(range(top_n), [imp for _, imp in top_features], align='center')
    plt.xticks(range(top_n), [feat for feat, _ in top_features], rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

# Word clouds of the cleaned text, one panel per sentiment class.
# wordcloud is an optional dependency: when it is missing the figure is skipped.
try:
    from wordcloud import WordCloud

    plt.figure(figsize=(12, 6))

    # (sentiment label, panel title, contour colour) in left-to-right order
    cloud_panels = (
        ('positive', 'Positive Reviews', 'steelblue'),
        ('negative', 'Negative Reviews', 'firebrick'),
    )
    for panel_number, (sentiment_label, panel_title, outline_colour) in enumerate(cloud_panels, start=1):
        class_text = ' '.join(df.loc[df['sentiment'] == sentiment_label, 'processed_text'])
        if not class_text.strip():
            # Nothing to draw for this class; leave its panel out.
            continue

        cloud = WordCloud(
            width=600,
            height=300,
            background_color='white',
            max_words=100,
            contour_width=3,
            contour_color=outline_colour,
        )
        cloud.generate(class_text)

        plt.subplot(1, 2, panel_number)
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(panel_title)
        plt.axis('off')

    plt.tight_layout()
    plt.savefig('sentiment_wordclouds.png')
    plt.close()
except ImportError:
    print("WordCloud package not installed. Skipping word cloud visualization.")

print("\nAnalysis complete! Models have been trained and evaluated.")

Loading the dataset...

Dataset shape: (24113, 7)

Column names: ['product_name', 'overall_rating', 'no_ratings', 'no_reviews', 'rating', 'title', 'review']

First few rows:
                                        product_name  overall_rating  \
0  Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...             4.7   
1  Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...             4.7   
2  Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...             4.7   
3  Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...             4.7   
4  Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...             4.7   

  no_ratings no_reviews  rating             title  \
0     15,210        900       5  Perfect product!   
1     15,210        900       5         Fabulous!   
2     15,210        900       5         Fabulous!   
3     15,210        900       4        Delightful   
4     15,210        900       5           Awesome   

                                              review  
0  Loved it, it's m