# Lab Assignment 2 - Sentiment Analysis
# Author Name : Gahanesh Raavi
# ASU ID : 1234497630
# File Creation Date : 02/02/2025


In [30]:
#  Import Necessary Libraries
import pandas as pd              # Used for data manipulation
import numpy as np               # Used for numerical operations
import matplotlib.pyplot as plt  # Used for data visualization
import seaborn as sns            # Used for statistical plots

# Load the restaurant review data (path is resolved relative to the working directory)
file_path = 'restaurant_reviews_az.csv'  # Update this path if necessary
data = pd.read_csv(file_path)

# Preview the first few rows of the dataset
print("Sample Data from the Dataset:")
print(data.head())

# Show basic dataset information.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nDataset Information:")
data.info()

# Display the column names
print("\nColumn Names:", data.columns.tolist())

Sample Data from the Dataset:
                review_id                 user_id             business_id  \
0  IVS7do_HBzroiCiymNdxDg  fdFgZQQYQJeEAshH4lxSfQ  sGy67CpJctjeCWClWqonjA   
1  QP2pSzSqpJTMWOCuUuyXkQ  JBLWSXBTKFvJYYiM-FnCOQ  3w7NRntdQ9h0KwDsksIt5Q   
2  oK0cGYStgDOusZKz9B1qug  2_9fKnXChUjC5xArfF8BLg  OMnPtRGmbY8qH_wIILfYKA   
3  E_ABvFCNVLbfOgRg3Pv1KQ  9MExTQ76GSKhxSWnTS901g  V9XlikTxq0My4gE8LULsjw   
4  Rd222CrrnXkXukR2iWj69g  LPxuausjvDN88uPr-Q4cQA  CA5BOxKRDPGJgdUQ8OUOpw   

   stars  useful  funny  cool  \
0      3       1      1     0   
1      5       1      1     1   
2      5       1      0     0   
3      5       0      0     0   
4      4       1      0     0   

                                                text                 date  
0  OK, the hype about having Hatch chili in your ...  2020-01-27 22:59:06  
1  Pandemic pit stop to have an ice cream.... onl...  2020-04-19 05:33:16  
2  I was lucky enough to go to the soft opening a...  2020-02-29 19:43:44  
3  I

In [7]:
# Keep every review whose rating is not exactly 3 stars.
# .copy() yields an independent frame, so the column added below never touches `data`.
filtered_data = data.loc[data['stars'].ne(3)].copy()

# Binary label: 0 when the rating is 2 stars or lower, 1 for all remaining rows.
filtered_data['Sentiment'] = np.where(filtered_data['stars'].le(2), 0, 1)

# Preview the labelled frame
print("Sample Data After Filtering:", filtered_data.head(), sep="\n")

# Class-balance check on the new Sentiment column
print("\nSentiment Distribution:", filtered_data['Sentiment'].value_counts(), sep="\n")

Sample Data After Filtering:
                review_id                 user_id             business_id  \
1  QP2pSzSqpJTMWOCuUuyXkQ  JBLWSXBTKFvJYYiM-FnCOQ  3w7NRntdQ9h0KwDsksIt5Q   
2  oK0cGYStgDOusZKz9B1qug  2_9fKnXChUjC5xArfF8BLg  OMnPtRGmbY8qH_wIILfYKA   
3  E_ABvFCNVLbfOgRg3Pv1KQ  9MExTQ76GSKhxSWnTS901g  V9XlikTxq0My4gE8LULsjw   
4  Rd222CrrnXkXukR2iWj69g  LPxuausjvDN88uPr-Q4cQA  CA5BOxKRDPGJgdUQ8OUOpw   
5  kx6O_lyLzUnA7Xip5wh2NA  YsINprB2G1DM8qG1hbrPUg  rViAhfKLKmwbhTKROM9m0w   

   stars  useful  funny  cool  \
1      5       1      1     1   
2      5       1      0     0   
3      5       0      0     0   
4      4       1      0     0   
5      1       0      0     0   

                                                text                 date  \
1  Pandemic pit stop to have an ice cream.... onl...  2020-04-19 05:33:16   
2  I was lucky enough to go to the soft opening a...  2020-02-29 19:43:44   
3  I've gone to claim Jumpers all over the US and...  2020-03-14 21:47:07   
4

In [32]:
from sklearn.model_selection import train_test_split

# Model input is the review text; the target is the binary Sentiment label.
X, y = filtered_data['text'], filtered_data['Sentiment']

# Hold out 20% of the reviews for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many reviews landed in each partition
print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])

Training set size: 35274
Test set size: 8819


In [34]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder capped at 1000 features.
vectorizer = CountVectorizer(max_features=1000)

# The vocabulary is learned from the training reviews only; the test reviews are
# then encoded with that same fitted vocabulary.
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

# Peek at the first ten vocabulary entries and confirm the matrix dimensions
print("Sample feature names:", vectorizer.get_feature_names_out()[:10])
print(f"Training Data Shape: {X_train_vec.shape}")
print(f"Test Data Shape: {X_test_vec.shape}")

Sample feature names: ['00' '10' '100' '11' '12' '15' '19' '20' '25' '30']
Training Data Shape: (35274, 1000)
Test Data Shape: (8819, 1000)


In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a multinomial Naive Bayes model on the count features
# (fit() returns the estimator itself, so construction and training can be chained).
nb_classifier = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out reviews
y_pred = nb_classifier.predict(X_test_vec)

# Report the confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.4f}")

Confusion Matrix:
[[2185  369]
 [ 328 5937]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      2554
           1       0.94      0.95      0.94      6265

    accuracy                           0.92      8819
   macro avg       0.91      0.90      0.90      8819
weighted avg       0.92      0.92      0.92      8819

Accuracy Score: 0.9210


In [38]:
from sklearn.svm import LinearSVC

# Fit a linear SVM on the count features
# (fit() returns the estimator itself, so construction and training can be chained).
svm_classifier = LinearSVC(
    random_state=42,
    dual=False,
    max_iter=5000,
).fit(X_train_vec, y_train)

# Predict labels for the held-out reviews
y_pred_svm = svm_classifier.predict(X_test_vec)

# Report the confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix (SVM):", confusion_matrix(y_test, y_pred_svm), sep="\n")
print("\nClassification Report (SVM):", classification_report(y_test, y_pred_svm), sep="\n")
print(f"Accuracy Score (SVM): {accuracy_score(y_test, y_pred_svm):.4f}")

Confusion Matrix (SVM):
[[2294  260]
 [ 202 6063]]

Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      2554
           1       0.96      0.97      0.96      6265

    accuracy                           0.95      8819
   macro avg       0.94      0.93      0.94      8819
weighted avg       0.95      0.95      0.95      8819

Accuracy Score (SVM): 0.9476


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encoder capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Vocabulary and IDF weights are learned from the training reviews only; the test
# reviews are then encoded with that same fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

# Peek at the first ten vocabulary entries and confirm the matrix dimensions
print("Sample TF-IDF feature names:", tfidf_vectorizer.get_feature_names_out()[:10])
print(f"Training Data Shape (TF-IDF): {X_train_tfidf.shape}")
print(f"Test Data Shape (TF-IDF): {X_test_tfidf.shape}")

Sample TF-IDF feature names: ['00' '10' '100' '11' '12' '15' '19' '20' '25' '30']
Training Data Shape (TF-IDF): (35274, 1000)
Test Data Shape (TF-IDF): (8819, 1000)


In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a multinomial Naive Bayes model on the TF-IDF features
# (fit() returns the estimator itself, so construction and training can be chained).
nb_classifier_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out reviews
y_pred_tfidf = nb_classifier_tfidf.predict(X_test_tfidf)

# Report the confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix (TF-IDF + Naive Bayes):", confusion_matrix(y_test, y_pred_tfidf), sep="\n")
print("\nClassification Report (TF-IDF + Naive Bayes):", classification_report(y_test, y_pred_tfidf), sep="\n")
print(f"Accuracy Score (TF-IDF + Naive Bayes): {accuracy_score(y_test, y_pred_tfidf):.4f}")

Confusion Matrix (TF-IDF + Naive Bayes):
[[1863  691]
 [ 129 6136]]

Classification Report (TF-IDF + Naive Bayes):
              precision    recall  f1-score   support

           0       0.94      0.73      0.82      2554
           1       0.90      0.98      0.94      6265

    accuracy                           0.91      8819
   macro avg       0.92      0.85      0.88      8819
weighted avg       0.91      0.91      0.90      8819

Accuracy Score (TF-IDF + Naive Bayes): 0.9070


In [44]:
from sklearn.svm import LinearSVC

# Fit a linear SVM on the TF-IDF features
# (fit() returns the estimator itself, so construction and training can be chained).
svm_classifier_tfidf = LinearSVC(
    random_state=42,
    dual=False,
    max_iter=5000,
).fit(X_train_tfidf, y_train)

# Predict labels for the held-out reviews
y_pred_svm_tfidf = svm_classifier_tfidf.predict(X_test_tfidf)

# Report the confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix (TF-IDF + SVM):", confusion_matrix(y_test, y_pred_svm_tfidf), sep="\n")
print("\nClassification Report (TF-IDF + SVM):", classification_report(y_test, y_pred_svm_tfidf), sep="\n")
print(f"Accuracy Score (TF-IDF + SVM): {accuracy_score(y_test, y_pred_svm_tfidf):.4f}")

Confusion Matrix (TF-IDF + SVM):
[[2322  232]
 [ 209 6056]]

Classification Report (TF-IDF + SVM):
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      2554
           1       0.96      0.97      0.96      6265

    accuracy                           0.95      8819
   macro avg       0.94      0.94      0.94      8819
weighted avg       0.95      0.95      0.95      8819

Accuracy Score (TF-IDF + SVM): 0.9500


In [46]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import nltk

# Fetch the VADER lexicon required by SentimentIntensityAnalyzer
# (nltk reports "already up-to-date" when the package is present locally).
nltk.download('vader_lexicon')

# Create the lexicon-based sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Classify a review from its VADER compound score
def predict_sentiment(text, threshold=0.0):
    """Return a binary sentiment label for ``text`` from VADER's compound score.

    Args:
        text: Review text to score.
        threshold: Minimum compound score that counts as positive. The default
            of 0.0 keeps the original rule, under which a score of exactly 0
            is labelled positive.

    Returns:
        1 if the compound score is >= ``threshold``, otherwise 0.
    """
    score = sia.polarity_scores(text)['compound']
    return 1 if score >= threshold else 0

# Score every held-out review with the lexicon-based classifier
y_pred_vader = list(map(predict_sentiment, X_test))

# Report the confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix (VADER Sentiment):", confusion_matrix(y_test, y_pred_vader), sep="\n")
print("\nClassification Report (VADER Sentiment):", classification_report(y_test, y_pred_vader), sep="\n")
print(f"Accuracy Score (VADER Sentiment): {accuracy_score(y_test, y_pred_vader):.4f}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/gahan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Confusion Matrix (VADER Sentiment):
[[1446 1108]
 [ 110 6155]]

Classification Report (VADER Sentiment):
              precision    recall  f1-score   support

           0       0.93      0.57      0.70      2554
           1       0.85      0.98      0.91      6265

    accuracy                           0.86      8819
   macro avg       0.89      0.77      0.81      8819
weighted avg       0.87      0.86      0.85      8819

Accuracy Score (VADER Sentiment): 0.8619


# Model Performance Comparison and Observations

After testing different sentiment analysis models on the Yelp reviews, here’s how they performed:

	1.	Naive Bayes (Count Vectorizer)
	•	This model did okay, with an accuracy around 92%.
	•	It handled straightforward cases well but stumbled when the sentiment was more subtle or nuanced.
 
	2.	Naive Bayes (TF-IDF)
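	•	Switching to TF-IDF features actually lowered the accuracy slightly, to about 91% (0.9070, versus 0.9210 with raw counts).
	•	TF-IDF focuses on important words rather than raw counts, but with Naive Bayes this did not help: precision on negative reviews rose (0.94 vs. 0.87) while recall on negative reviews dropped sharply (0.73 vs. 0.86), so many more negative reviews were misclassified as positive.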
 
	3.	SVM (Count Vectorizer)
	•	SVM stepped up the game, hitting an accuracy of about 95% (0.9476).
	•	It was more precise and consistent than Naive Bayes, making fewer mistakes.
 
	4.	SVM (TF-IDF)
	•	This was the star performer, with an accuracy of 95% (0.9500), marginally ahead of SVM with the Count Vectorizer (0.9476).
	•	Combining SVM with TF-IDF gave us a model that was both powerful and reliable.
 
	5.	VADER (Lexicon-Based)
	•	VADER did a decent job, scoring around 86% accuracy.
 
	•	It’s great for quick sentiment analysis without needing training data, but it struggled with sarcasm, context-specific meanings, and more complex expressions.

# Acknowledgement : 
I acknowledge that I have taken the help of ChatGPT in completing this assignment. I confirm that I have not used any other generative AI tools or external assistance beyond this interaction. All the work and content presented are my own, with guidance from ChatGPT where necessary.

In [57]:
!pip install jupyter
!pip install nbconvert
!jupyter nbconvert "LA2_Raavi_Gahanesh.ipynb" --to html

[NbConvertApp] Converting notebook LA2_Raavi_Gahanesh.ipynb to html
[NbConvertApp] Writing 311172 bytes to LA2_Raavi_Gahanesh.html
