### Import Required Libraries

In [71]:
import pandas as pd
import numpy as np
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from warnings import filterwarnings
filterwarnings('ignore')

## **NLTK Model**

In [72]:
# Load the labelled comments and discard rows that have no comment text.
# NOTE(review): the scikit-learn cells in this notebook read 'comments.csv'
# (lowercase). On a case-sensitive filesystem only one spelling can match --
# confirm the real file name.
df = pd.read_csv('Comments.csv').dropna(subset=['Comment'])

comments = df['Comment'].tolist()
sentiments = df['Sentiment'].tolist()

# Map each comment to its label. Dict keys are unique, so a comment that
# appears more than once collapses to a single entry carrying the label of
# its last occurrence.
# NOTE(review): confirm this de-duplication is intended.
training = {text: label for text, label in zip(comments, sentiments)}

# Flatten the mapping back into (comment, sentiment) tuples
data_tuples = list(training.items())

# Ordered 80/20 split: the leading 80% trains, the remainder tests.
# No shuffling is applied, so the split follows the row order of the file.
train_size = int(0.8 * len(data_tuples))
train_set = data_tuples[:train_size]
test_set = data_tuples[train_size:]

# Define a simple feature extractor
def word_features(comment):
    """Build a word-presence feature dict for a single comment.

    Args:
        comment: The comment text. Anything that is not a ``str`` is
            reported on stdout and treated as having no features.

    Returns:
        A dict mapping every whitespace-separated token of ``comment`` to
        ``True``; an empty dict for non-string input.
    """
    if not isinstance(comment, str):
        # Non-string values (e.g., floats) carry no usable text
        print(f"Skipping non-string value: {comment}")
        return {}
    return {token: True for token in comment.split()}

# Pair every comment's feature dict with its label -- the (features, label)
# format that NaiveBayesClassifier.train and accuracy are given below
train_features = [(word_features(text), label) for text, label in train_set]
test_features = [(word_features(text), label) for text, label in test_set]

# Fit NLTK's Naive Bayes classifier on the training pairs
classifier = NaiveBayesClassifier.train(train_features)

# Score the classifier on the held-out pairs and report it as a percentage
acc = accuracy(classifier, test_features)
print(f'Accuracy: {acc * 100:.2f}%')

Accuracy: 51.76%


## **Ternary scikit-learn Model**

In [73]:
# Load the labelled comments (first CSV column becomes the index) and drop
# rows that have no comment text
training_data = pd.read_csv('comments.csv', index_col=0).dropna(subset=['Comment'])

# Features are the raw comment strings; the label is the sentiment column
X, y = training_data['Comment'], training_data['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training split only, then reuse it to
# transform the test split
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train multinomial Naive Bayes on the TF-IDF features
bayes_classifier = MultinomialNB().fit(X_train_vec, y_train)

# Predict the held-out split and report accuracy plus per-class metrics
y_pred = bayes_classifier.predict(X_test_vec)
print(f"Accuracy: {(accuracy_score(y_test, y_pred)*100):.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 65.21%
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.01      0.01       475
         1.0       0.73      0.21      0.32       937
         2.0       0.65      0.97      0.78      2270

    accuracy                           0.65      3682
   macro avg       0.79      0.39      0.37      3682
weighted avg       0.71      0.65      0.56      3682



## **Binary scikit-learn Model**

In [74]:
# Load the labelled comments (first CSV column becomes the index), relabel
# every sentiment 1 as 2 so the ternary labels become a two-class problem,
# then drop rows that have no comment text
training_data = (
    pd.read_csv('comments.csv', index_col=0)
    .assign(Sentiment=lambda frame: np.where(frame['Sentiment'] == 1, 2, frame['Sentiment']))
    .dropna(subset=['Comment'])
)

# Features are the raw comment strings; the label is the merged sentiment column
X, y = training_data['Comment'], training_data['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training split only, then reuse it to
# transform the test split
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Bernoulli Naive Bayes on the vectorized comments
bayes_classifier = BernoulliNB().fit(X_train_vec, y_train)

# Predict the held-out split and report accuracy plus per-class metrics
y_pred = bayes_classifier.predict(X_test_vec)
print(f"Accuracy: {(accuracy_score(y_test, y_pred)*100):.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 85.74%
Classification Report:
               precision    recall  f1-score   support

         0.0       0.38      0.17      0.23       475
         2.0       0.89      0.96      0.92      3207

    accuracy                           0.86      3682
   macro avg       0.63      0.56      0.58      3682
weighted avg       0.82      0.86      0.83      3682



### Get List of Stopwords

In [75]:
# Load the custom stop-word list from 'stopwords.txt', one entry per line.
# Each line is stripped of surrounding whitespace and blank lines are skipped:
# an entry with a stray space would never equal a token, and an empty string
# is not a meaningful stop word.
with open('stopwords.txt') as file:
    stopwords = [word for word in (line.strip() for line in file) if word]

## **Ternary scikit-learn Model (without Stopwords)**

In [76]:
# Load the labelled comments (first CSV column becomes the index) and drop
# rows that have no comment text
training_data = pd.read_csv('comments.csv', index_col=0).dropna(subset=['Comment'])

# Features are the raw comment strings; the label is the sentiment column
X, y = training_data['Comment'], training_data['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF with the custom stop-word list removed and accents stripped; the
# vocabulary is fit on the training split only and reused for the test split
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=stopwords)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train multinomial Naive Bayes on the TF-IDF features
bayes_classifier = MultinomialNB().fit(X_train_vec, y_train)

# Predict the held-out split and report accuracy plus per-class metrics
y_pred = bayes_classifier.predict(X_test_vec)
print(f"Accuracy: {(accuracy_score(y_test, y_pred)*100):.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 65.21%
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.01      0.02       475
         1.0       0.73      0.20      0.31       937
         2.0       0.65      0.97      0.78      2270

    accuracy                           0.65      3682
   macro avg       0.74      0.39      0.37      3682
weighted avg       0.70      0.65      0.56      3682



## **Binary scikit-learn Model (without Stopwords)**

In [77]:
# Load the labelled comments (first CSV column becomes the index), relabel
# every sentiment 1 as 2 so the ternary labels become a two-class problem,
# then drop rows that have no comment text
training_data = (
    pd.read_csv('comments.csv', index_col=0)
    .assign(Sentiment=lambda frame: np.where(frame['Sentiment'] == 1, 2, frame['Sentiment']))
    .dropna(subset=['Comment'])
)

# Features are the raw comment strings; the label is the merged sentiment column
X, y = training_data['Comment'], training_data['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF with the custom stop-word list removed and accents stripped; the
# vocabulary is fit on the training split only and reused for the test split
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=stopwords)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Bernoulli Naive Bayes on the vectorized comments
bayes_classifier = BernoulliNB().fit(X_train_vec, y_train)

# Predict the held-out split and report accuracy plus per-class metrics
y_pred = bayes_classifier.predict(X_test_vec)
print(f"Accuracy: {(accuracy_score(y_test, y_pred)*100):.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 86.69%
Classification Report:
               precision    recall  f1-score   support

         0.0       0.44      0.13      0.20       475
         2.0       0.88      0.98      0.93      3207

    accuracy                           0.87      3682
   macro avg       0.66      0.55      0.56      3682
weighted avg       0.83      0.87      0.83      3682

