In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import re

# Load the dataset
df = pd.read_csv('dataset/sentiment_detection_dataset.csv', encoding='latin1')
df = df.dropna()

# Map the label column values
label_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}
df['label'] = df['label'].map(label_mapping)

In [2]:
# Define cleaning functions
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_non_ascii(text):
    # Remove non-ASCII characters
    return ''.join(char for char in text if ord(char) < 128)

def remove_digits(text):
    # Remove numeric digits
    return re.sub(r'\d+', '', text)

def remove_special_characters(text):
    # Remove special characters except whitespace
    return re.sub(r'[^\w\s]', '', text)

def normalize_case(text):
    # Normalize text to lowercase
    return text.lower()

def clean_text(text):
    # Remove URLs
    text = remove_urls(text)
    # Remove non-ASCII characters
    text = remove_non_ascii(text)
    # Remove numeric digits
    text = remove_digits(text)
    # Remove special characters except whitespace
    text = remove_special_characters(text)
    # Normalize case
    text = normalize_case(text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

# Apply cleaning functions to the 'comment' column
df['comment'] = df['comment'].apply(clean_text)

In [3]:
# Split dataset
X = df['comment']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Define a function to train, evaluate models and print the classification report
def train_and_evaluate_model(pipeline, model_name):
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} Accuracy: {accuracy}')
    
    # Print precision, recall, and F1-score
    print(f'{model_name} Classification Report:\n')
    print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))
    print('-'*60)

In [5]:
# 1. SVM Model
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('svc', SVC(kernel='linear'))  # SVM classifier with a linear kernel
])
train_and_evaluate_model(pipeline_svm, 'SVM')

SVM Accuracy: 0.6940805434255216
SVM Classification Report:

              precision    recall  f1-score   support

    negative       0.72      0.57      0.64      2356
     neutral       0.64      0.75      0.69      3343
    positive       0.77      0.73      0.75      2545

    accuracy                           0.69      8244
   macro avg       0.71      0.69      0.69      8244
weighted avg       0.70      0.69      0.69      8244

------------------------------------------------------------


In [6]:
# 2. Logistic Regression Model
pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('lr', LogisticRegression(max_iter=1000,C=0.5))  # Logistic Regression
])
train_and_evaluate_model(pipeline_lr, 'Logistic Regression')

Logistic Regression Accuracy: 0.6754002911208151
Logistic Regression Classification Report:

              precision    recall  f1-score   support

    negative       0.72      0.52      0.61      2356
     neutral       0.61      0.77      0.68      3343
    positive       0.77      0.69      0.73      2545

    accuracy                           0.68      8244
   macro avg       0.70      0.66      0.67      8244
weighted avg       0.69      0.68      0.67      8244

------------------------------------------------------------


In [7]:
# 3. Random Forest Model
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('rf', RandomForestClassifier(n_estimators=100,max_depth=10))  # Random Forest Classifier
])
train_and_evaluate_model(pipeline_rf, 'Random Forest')

Random Forest Accuracy: 0.435710819990296
Random Forest Classification Report:

              precision    recall  f1-score   support

    negative       0.80      0.01      0.02      2356
     neutral       0.42      0.99      0.59      3343
    positive       0.88      0.10      0.18      2545

    accuracy                           0.44      8244
   macro avg       0.70      0.37      0.26      8244
weighted avg       0.67      0.44      0.30      8244

------------------------------------------------------------


In [8]:
# 4. Naive Bayes Model
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB())  # Naive Bayes Classifier
])
train_and_evaluate_model(pipeline_nb, 'Naive Bayes')

Naive Bayes Accuracy: 0.6259097525473072
Naive Bayes Classification Report:

              precision    recall  f1-score   support

    negative       0.76      0.41      0.53      2356
     neutral       0.55      0.80      0.65      3343
    positive       0.74      0.60      0.66      2545

    accuracy                           0.63      8244
   macro avg       0.68      0.60      0.61      8244
weighted avg       0.66      0.63      0.62      8244

------------------------------------------------------------
