In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
# Download stopwords
nltk.download('stopwords')

# Initialize components
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Read the dataset and select necessary columns
data = pd.read_csv('Reviews.csv')[['Score', 'Text']].dropna()

# Function to clean the review text
def preprocess_text(review):
    review = str(review).lower()
    review = re.sub(r'<.*?>', '', review)  # Remove HTML tags
    review = re.sub(r'http\S+|www\S+|https\S+', '', review)  # Remove URLs
    review = re.sub(r'\d+', '', review)  # Remove digits
    review = re.sub(r'[^a-z\s]', '', review)  # Remove special characters
    words = review.split()
    words = [ps.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply text cleaning
data['cleaned_review'] = data['Text'].apply(preprocess_text)

# Function to assign sentiment based on rating
def categorize_sentiment(rating):
    if rating <= 2:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    else:
        return 'positive'

# Apply sentiment labeling
data['sentiment_label'] = data['Score'].apply(categorize_sentiment)

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()
data['vader_score'] = data['Text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Function to categorize VADER sentiment
def classify_vader_sentiment(vader_score):
    if vader_score > 0.05:
        return 'positive'
    elif vader_score < -0.05:
        return 'negative'
    else:
        return 'neutral'

data['vader_sentiment_label'] = data['vader_score'].apply(classify_vader_sentiment)

# Print VADER classification report
print("VADER Sentiment Analysis Report:")
print(classification_report(data['sentiment_label'], data['vader_sentiment_label']))

# TF-IDF Vectorization for ML model
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_review'])
y = data['sentiment_label']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Model
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Predict and Evaluate the Logistic Regression model
y_pred = classifier.predict(X_test)
print("Logistic Regression Sentiment Analysis Report:")
print(classification_report(y_test, y_pred))

# Save the processed data
data.to_csv('Processed_Reviews.csv', index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


VADER Sentiment Analysis Report:
              precision    recall  f1-score   support

    negative       0.59      0.40      0.47     82037
     neutral       0.14      0.04      0.06     42640
    positive       0.84      0.95      0.89    443777

    accuracy                           0.80    568454
   macro avg       0.52      0.46      0.48    568454
weighted avg       0.75      0.80      0.77    568454

Logistic Regression Sentiment Analysis Report:
              precision    recall  f1-score   support

    negative       0.73      0.66      0.69     16181
     neutral       0.51      0.18      0.26      8485
    positive       0.90      0.97      0.93     89025

    accuracy                           0.86    113691
   macro avg       0.71      0.60      0.63    113691
weighted avg       0.84      0.86      0.85    113691



VADER (Lexicon-based Approach)
Strengths:

No training needed: VADER doesn’t need training data, so it’s easy to use right away.

Fast: It works quickly, making it great for real-time analysis like social media posts.

Good for informal text: It handles slang, emojis, and short text well.

Weaknesses:

Limited to predefined rules: It can’t adjust to specific topics or new words.

Struggles with neutral text: VADER often misclassifies neutral reviews.

False positives: It might wrongly label some negative or neutral reviews as positive.

Logistic Regression (Machine Learning Approach)
Strengths:

Learns from data: Logistic Regression improves as it learns from labeled data.

Handles complex relationships: It can understand complex patterns in text.

Balanced: It provides a good balance between precision and recall.

Weaknesses:

Needs labeled data: It requires a lot of data to train, which takes time.

Hard for neutral sentiment: It doesn’t perform as well for neutral sentiments.

Sensitive to noise: If the data is messy, the model may struggle.