# PIT - CS423 (Intelligent Systems)

# Sentiment Analysis using VADER and Machine Learning

__Submitted By: Lariosa, Gerald Darwin,__

__Malacaste, Febby Kim__

_Submitted At: May 23, 2025_

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# NLP preprocessing libraries
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# For exporting the model
import pickle


## Load the Dataset and Explore

In [2]:
# Load the CSV dataset
df = pd.read_csv('kaggle_RC_2019-05.csv')

# Explore the dataset
print("Initial dataset shape:", df.shape)
display(df.head())
df.info()


Initial dataset shape: (1000000, 4)


Unnamed: 0,subreddit,body,controversiality,score
0,gameofthrones,Your submission has been automatically removed...,0,1
1,aww,"Dont squeeze her with you massive hand, you me...",0,19
2,gaming,It's pretty well known and it was a paid produ...,0,3
3,news,You know we have laws against that currently c...,0,10
4,politics,"Yes, there is a difference between gentle supp...",0,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   subreddit         1000000 non-null  object
 1   body              1000000 non-null  object
 2   controversiality  1000000 non-null  int64 
 3   score             1000000 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 30.5+ MB


## Data Preprocessing - Clean Raw Data

In [3]:
# 1. Remove rows with null values in the 'body' column
df = df.dropna(subset=['body'])
print("After dropping null 'body' entries:", df.shape)

# 2. Remove duplicate comments (using the 'body' column)
df = df.drop_duplicates(subset=['body'])
print("After removing duplicate comments:", df.shape)

# 3. Remove very short comments (less than 4 tokens)
df['token_count'] = df['body'].apply(lambda x: len(x.split()))
df = df[df['token_count'] >= 4]
print("After filtering out very short comments:", df.shape)
df = df.drop(columns='token_count')

# 4. Remove comments that are “[removed]” or “[deleted]”
df = df[~df['body'].str.lower().isin(['[removed]', '[deleted]'])]
print("After removing removed/deleted comments:", df.shape)


After dropping null 'body' entries: (1000000, 4)
After removing duplicate comments: (971729, 4)
After filtering out very short comments: (971729, 5)
After removing removed/deleted comments: (971729, 4)


## Define Custom Preprocessing (Tokenizing, Lemmatizing, and Stopword Removal)

In [25]:
# Load the standard English stopwords.
default_stopwords = set(stopwords.words('english'))

# Define a set of intensifiers that you want to preserve.
intensifiers = {
    "very", "really", "so", "too", "extremely",
    "incredibly", "absolutely", "completely",
    "utterly", "highly", "remarkably", "awfully", "not"
}

# Remove intensifiers from the default stopwords so they are preserved.
updated_stopwords = default_stopwords - intensifiers

# Print the removed intensifiers (those that remain preserved)
print("Intensifiers preserved:\n", intensifiers.intersection(default_stopwords))
print("\nTotal stopwords originally:", len(default_stopwords))
print("Total stopwords updated:", len(updated_stopwords))

# Initialize the lemmatizer (we'll use it below)
lemmatizer = WordNetLemmatizer()

def custom_analyzer(text):
    """
    Custom analyzer to perform:
    - Lowercasing
    - Punctuation removal
    - Tokenization
    - Stopword removal with updated stopwords list (preserving intensifiers)
    - Lemmatization
    """
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation and non-alphanumeric characters (preserving spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize text into words
    tokens = word_tokenize(text)
    
    # Remove stopwords using updated_stopwords
    tokens = [token for token in tokens if token not in updated_stopwords]
    
    # Apply lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens


Intensifiers preserved:
 {'very', 'too', 'not', 'so'}

Total stopwords originally: 198
Total stopwords updated: 194


## Label the Data with VADER

In [5]:
# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

def label_sentiment(text):
    """
    Label text as 'positive' if compound score ≥ 0.05,
    'negative' if compound score ≤ -0.05 and
    returns None for neutral sentiment.
    """
    score = sia.polarity_scores(text)['compound']
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return None

# Apply VADER labeling on the original text (to preserve punctuation cues)
df['label'] = df['body'].apply(label_sentiment)

# Keep only those rows that are clearly positive or negative.
df = df[df['label'].notnull()]
print("Dataset shape after VADER labeling (excluding neutrals):", df.shape)


Dataset shape after VADER labeling (excluding neutrals): (711241, 5)


In [6]:
df.head()

Unnamed: 0,subreddit,body,controversiality,score,label
0,gameofthrones,Your submission has been automatically removed...,0,1,positive
1,aww,"Dont squeeze her with you massive hand, you me...",0,19,positive
2,gaming,It's pretty well known and it was a paid produ...,0,3,positive
3,news,You know we have laws against that currently c...,0,10,negative
4,politics,"Yes, there is a difference between gentle supp...",0,1,positive


In [7]:
print(df['label'].value_counts())


label
positive    406722
negative    304519
Name: count, dtype: int64


## Splitting Data and Vectorization

In [26]:
# Using stratification to maintain class balance in train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    df['body'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

print("Train size:", len(X_train), "| Test size:", len(X_test))

Train size: 568992 | Test size: 142249


In [27]:
from joblib import parallel_backend

vectorizer = TfidfVectorizer(analyzer=custom_analyzer, ngram_range=(1, 2))

# Enable parallel processing during TF-IDF transformation
with parallel_backend('loky', n_jobs=-1):  # 'loky' uses CPU cores
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

print("TF-IDF transformation completed in parallel.")



TF-IDF transformation completed in parallel.


## Training multiple classifiers & Evaluating Models

In [28]:
log_reg = LogisticRegression(
    solver='saga',
    penalty='l2',
    C=2.5,                      # Increased to 4 for more capacity
    max_iter=2000,            # Extended iterations for robust convergence on large datasets
    tol=1e-4,
    class_weight='balanced',
    n_jobs=-1,                # Utilize all cores for faster computation
    random_state=42
)

# Reduced complexity for Random Forest to help allocate more time/resources to Logistic Regression
rf = RandomForestClassifier(
    n_estimators=100,        
    max_depth=8,            
    min_samples_split=5,
    min_samples_leaf=3,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)

# Initialize the other classifiers (static configurations)
classifiers = {
    "Logistic Regression": log_reg,
    "Naive Bayes": MultinomialNB(),
    "Random Forest": rf,
    "SVM": LinearSVC(random_state=42)
}

evaluation_results = {}

# Train and evaluate models in parallel using Joblib
with parallel_backend('loky', n_jobs=-1): 
    for clf_name, clf in classifiers.items():
        clf.fit(X_train_vec, y_train)
        y_pred = clf.predict(X_test_vec)

        evaluation_results[clf_name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, pos_label='positive'),
            "Recall": recall_score(y_test, y_pred, pos_label='positive'),
            "F1 Score": f1_score(y_test, y_pred, pos_label='positive')
        }

        print(f"=== {clf_name} Classification Report ===")
        print(classification_report(y_test, y_pred))
        print("=" * 60)

# Convert results to a DataFrame for easy viewing
eval_df = pd.DataFrame(evaluation_results).T
print("Evaluation Metrics Summary:")
print(eval_df)

=== Logistic Regression Classification Report ===
              precision    recall  f1-score   support

    negative       0.86      0.88      0.87     60904
    positive       0.91      0.89      0.90     81345

    accuracy                           0.89    142249
   macro avg       0.88      0.89      0.88    142249
weighted avg       0.89      0.89      0.89    142249

=== Naive Bayes Classification Report ===
              precision    recall  f1-score   support

    negative       0.84      0.64      0.72     60904
    positive       0.77      0.91      0.83     81345

    accuracy                           0.79    142249
   macro avg       0.80      0.77      0.78    142249
weighted avg       0.80      0.79      0.79    142249

=== Random Forest Classification Report ===
              precision    recall  f1-score   support

    negative       0.71      0.68      0.70     60904
    positive       0.77      0.79      0.78     81345

    accuracy                           0.74   

In [29]:
import joblib

# Select the best model based on F1 Score
best_model_name = max(evaluation_results, key=lambda key: evaluation_results[key]["F1 Score"])
best_model = classifiers[best_model_name]

print("Best Model Selected:", best_model_name)

Best Model Selected: Logistic Regression


## Saving the best model

In [30]:
# Save the best model using pickle
with open("sentiment_best_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

print("Best model saved in 'sentiment_best_model.pkl'.")

Best model saved in 'sentiment_best_model.pkl'.


In [31]:
with open("vectorizer.pkl", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)
    
print("Vectorizer saved in 'vectorizer.pkl'.")

Vectorizer saved in 'vectorizer.pkl'.
