In [3]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
    --------------------------------------- 10.2/624.3 kB ? eta -:--:--
   - ------------------------------------- 20.5/624.3 kB 217.9 kB/s eta 0:00:03
   - ------------------------------------- 30.7/624.3 kB 217.9 kB/s eta 0:00:03
   -- ------------------------------------ 41.0/624.3 kB 245.8 kB/s eta 0:00:03
   --- ----------------------------------- 61.4/624.3 kB 272.3 kB/s eta 0:00:03
   ----- --------------------------------- 81.9/624.3 kB 327.3 kB/s eta 0:00:02
   ----- --------------------------------- 92.2/624.3 kB 275.8 kB/s eta 0:00:02
   ----- --------------------------------- 92.2/624.3 kB 275.8 kB/s et

In [4]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/126.0 kB ? eta -:--:--
   --- ------------------------------------ 10.2/126.0 kB ? eta -:--:--
   --- ------------------------------------ 10.2/126.0 kB ? eta -:--:--
   ------ -------------------------------- 20.5/126.0 kB 110.1 kB/s eta 0:00:01
   ------ -------------------------------- 20.5/126.0 kB 110.1 kB/s eta 0:00:01
   ------ -------------------------------- 20.5/126.0 kB 110.1 kB/s eta 0:00:01
   --------- ----------------------------- 30.7/126.0 kB 100.9 kB/s eta 0:00:01
   --------- ----------------------------- 30.7/126.0 kB 100.9 kB/s eta 0:00:01
   --------------- ----------------------- 51.2/126.0 kB 131.3 kB/s eta 0:00:01
   ------------------- ------------------- 61.4/126.0 kB 142.6 kB/s eta 0:00:01
   ------------------- ------------------- 61.4/126.0 kB 14

In [1]:
# CISB5123 - Text Analytics - 03
# Lab Assignment 2 - Sentiment Analysis
# Name: Wai Chin Kang
# ID: SW01082417

import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 1. Load and preprocess data
file_path = "Reviews.csv" 

df = pd.read_csv(file_path)
df = df[['Score', 'Summary', 'Text']].dropna()
df = df[df['Score'] != 3] 

# Create sentiment labels
sentiment_map = {1: 'negative', 2: 'negative', 4: 'positive', 5: 'positive'}
df['Sentiment'] = df['Score'].map(sentiment_map)
df['Content'] = df['Summary'] + ". " + df['Text']

# Reduce dataset for testing purpose 
df = df.sample(n=1000, random_state=42)

# 2. Lexicon-based Analysis
analyzer = SentimentIntensityAnalyzer()
lexicon_results = []

for text in df['Content']:
    # TextBlob
    tb_polarity = TextBlob(text).sentiment.polarity
    tb_sentiment = 'positive' if tb_polarity > 0 else 'negative' if tb_polarity < 0 else 'neutral'

    # VADER
    vs = analyzer.polarity_scores(text)
    vader_compound = vs['compound']
    vader_sentiment = 'positive' if vader_compound > 0.05 else 'negative' if vader_compound < -0.05 else 'neutral'

    lexicon_results.append((tb_sentiment, vader_sentiment))

df['TextBlob_Pred'] = [res[0] for res in lexicon_results]
df['VADER_Pred'] = [res[1] for res in lexicon_results]

print("\nClassification Report - TextBlob:")
print(classification_report(df['Sentiment'], df['TextBlob_Pred']))

print("\nClassification Report - VADER:")
print(classification_report(df['Sentiment'], df['VADER_Pred']))

# 3. Machine Learning-based Analysis
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Content'])
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("\nClassification Report - Naive Bayes:")
print(classification_report(y_test, y_pred_nb))

# SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("\nClassification Report - SVM:")
print(classification_report(y_test, y_pred_svm))


Classification Report - TextBlob:
              precision    recall  f1-score   support

    negative       0.65      0.37      0.48       147
     neutral       0.00      0.00      0.00         0
    positive       0.90      0.96      0.93       853

    accuracy                           0.88      1000
   macro avg       0.52      0.45      0.47      1000
weighted avg       0.86      0.88      0.86      1000


Classification Report - VADER:
              precision    recall  f1-score   support

    negative       0.74      0.44      0.55       147
     neutral       0.00      0.00      0.00         0
    positive       0.91      0.97      0.94       853

    accuracy                           0.89      1000
   macro avg       0.55      0.47      0.50      1000
weighted avg       0.89      0.89      0.88      1000


Classification Report - Naive Bayes:
              precision    recall  f1-score   support

    negative       0.75      0.20      0.32        45
    positive       0.88 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report - SVM:
              precision    recall  f1-score   support

    negative       0.56      0.33      0.42        45
    positive       0.89      0.95      0.92       255

    accuracy                           0.86       300
   macro avg       0.72      0.64      0.67       300
weighted avg       0.84      0.86      0.84       300



In [3]:
# Save preprocessed data to CSV
df[['Score', 'Summary', 'Text', 'Sentiment', 'Content']].to_csv("Extracted_Review.csv", index=False)

In [2]:
# 4. Discussion
print("""
Discussion:
Lexicon-based models, such as TextBlob and VADER, are easy to use and understand, but because they depend on predetermined word sentiment scores, 
their performance may be constrained. VADER typically performs better than TextBlob in this dataset, particularly when it comes to precision for 
positive sentiments.

By learning from the data, machine learning models—Naive Bayes and SVM in particular—perform better.
While SVM offers competitive results with a better margin separation but at a higher computational cost, Naive Bayes offers high accuracy with 
quick training. All things considered, machine learning models are more flexible and efficient for detecting subtle sentiment in bigger datasets.
""")


Discussion:
Lexicon-based models, such as TextBlob and VADER, are easy to use and understand, but because they depend on predetermined word sentiment scores, 
their performance may be constrained. VADER typically performs better than TextBlob in this dataset, particularly when it comes to precision for 
positive sentiments.

By learning from the data, machine learning models—Naive Bayes and SVM in particular—perform better.
While SVM offers competitive results with a better margin separation but at a higher computational cost, Naive Bayes offers high accuracy with 
quick training. All things considered, machine learning models are more flexible and efficient for detecting subtle sentiment in bigger datasets.

