In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import time

In [2]:
# Loading Dataset: Dataset 3 from research paper
df_test = pd.read_csv("E:/Mini_project_datasets/test_separated.csv")
df_train = pd.read_csv("E:/Mini_project_datasets/train_separated.csv")

In [3]:
# Exploratory Data Analysis on the Dataset
print("Test Data: \n", df_test.head(5))
print("Train Data: \n", df_train.head(5))

Test Data: 
    label                                            comment
0      0  Great CD: My lovely Pat has one of the GREAT v...
1      0  One of the best game music soundtracks - for a...
2      1  Batteries died within a year ...: I bought thi...
3      0  works fine, but Maha Energy is better: Check o...
4      0  Great for the non-audiophile: Reviewed quite a...
Train Data: 
    label                                            comment
0      0  Stuning even for the non-gamer: This sound tra...
1      0  The best soundtrack ever to anything.: I'm rea...
2      0  Amazing!: This soundtrack is my favorite music...
3      0  Excellent Soundtrack: I truly like this soundt...
4      0  Remember, Pull Your Jaw Off The Floor After He...


In [4]:
# This function cleans up text so that only English characters are preserved
def processed_comments(df, text_column):
    not_alphanumeric = re.compile(r'[\W]')
    not_ascii = re.compile(r'[^a-z0-1\s]')

    processed_comments = []
    for comment in df[text_column]:
        lower = comment.lower()
        no_punctuation = not_alphanumeric.sub(r' ', lower)
        no_non_ascii = not_ascii.sub(r'', no_punctuation)
        processed_comments.append(no_non_ascii)

    df["processed_comments"] = processed_comments
    df.drop(columns=[text_column], inplace=True)  # Drop the previous text column
    return df

In [5]:
df_test = processed_comments(df_test, "comment")
df_train = processed_comments(df_train, "comment")

In [6]:
df_train["processed_comments"], val_text, df_train["label"], val_label = train_test_split(df_train["processed_comments"], df_train["label"], random_state=42, test_size=0.2)

In [7]:
print('Train Length ', len(df_train["processed_comments"]))
print('Train Label Length ', len(df_train["label"]))
print('Test Length ', len(df_test["processed_comments"]))
print('Test Labels Length ', len(df_test["label"]))

Train Length  1048575
Train Label Length  1048575
Test Length  400000
Test Labels Length  400000


In [8]:
# Drop rows with NaN values in the "processed_comments" column
df_train.dropna(subset=["processed_comments"], inplace=True)
df_test.dropna(subset=["processed_comments"], inplace=True)


In [9]:
# To get the count of comments from Label 0 and Label 1 from Training dataset
pd.DataFrame(df_train["label"]).value_counts()

label
0.0      424467
1.0      414393
dtype: int64

In [10]:
# Vectorize text data
vectorizer = CountVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(df_train["processed_comments"])
X_test = vectorizer.transform(df_test["processed_comments"])

y_train = np.array(df_train["label"])
y_test = np.array(df_test["label"])


A) Multinomial Naive Bayes Classifier

In [20]:
start_time = time.time()
# Train Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
# Predictions
y_pred = nb_classifier.predict(X_test)
end_time = time.time()

In [21]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print("Training Time:", end_time - start_time, "seconds")

Accuracy: 0.8381
Confusion Matrix:
[[165110  34890]
 [ 29870 170130]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84    200000
           1       0.83      0.85      0.84    200000

    accuracy                           0.84    400000
   macro avg       0.84      0.84      0.84    400000
weighted avg       0.84      0.84      0.84    400000

Training Time: 0.341341495513916 seconds


B) Bernoulli Naive Bayes Classifier

In [22]:
start_time = time.time()
# Train Bernoulli Naive Bayes classifier
b_nb_classifier = BernoulliNB()
b_nb_classifier.fit(X_train, y_train)
# Predictions
y_pred = b_nb_classifier.predict(X_test)
end_time = time.time()

In [23]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print("Training Time:", end_time - start_time, "seconds")

Accuracy: 0.80454
Confusion Matrix:
[[164623  35377]
 [ 42807 157193]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81    200000
           1       0.82      0.79      0.80    200000

    accuracy                           0.80    400000
   macro avg       0.80      0.80      0.80    400000
weighted avg       0.80      0.80      0.80    400000

Training Time: 0.7702083587646484 seconds


C) Gaussian Naive Bayes classifier

In [24]:
start_time = time.time()
# Train Bernoulli Naive Bayes classifier
g_nb_classifier = BernoulliNB()
g_nb_classifier.fit(X_train, y_train)
# Predictions
y_pred = g_nb_classifier.predict(X_test)
end_time = time.time()

In [25]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print("Training Time:", end_time - start_time, "seconds")

Accuracy: 0.80454
Confusion Matrix:
[[164623  35377]
 [ 42807 157193]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81    200000
           1       0.82      0.79      0.80    200000

    accuracy                           0.80    400000
   macro avg       0.80      0.80      0.80    400000
weighted avg       0.80      0.80      0.80    400000

Training Time: 0.6454088687896729 seconds


D) Logistic Regression Classifier

In [26]:
start_time = time.time()
# Train Logistic Regression classifier
log_reg_classifier = LogisticRegression(max_iter=1000, random_state=42)
log_reg_classifier.fit(X_train, y_train)

# Predictions
y_pred = log_reg_classifier.predict(X_test)
end_time = time.time()

In [27]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print("Training Time:", end_time - start_time, "seconds")

Accuracy: 0.8740125
Confusion Matrix:
[[176070  23930]
 [ 26465 173535]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.88      0.87    200000
           1       0.88      0.87      0.87    200000

    accuracy                           0.87    400000
   macro avg       0.87      0.87      0.87    400000
weighted avg       0.87      0.87      0.87    400000

Training Time: 18.874950885772705 seconds


E) Decision Tree

In [28]:
start_time = time.time()
clf_tree = DecisionTreeClassifier( max_depth = 5)
clf_tree=clf_tree.fit( X_train, y_train )
y_pred=clf_tree.predict(X_test)
y_pred
end_time = time.time()

In [29]:
#Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print("Training Time:", end_time - start_time, "seconds")

Accuracy: 0.699875
Confusion Matrix:
[[161037  38963]
 [ 81087 118913]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.81      0.73    200000
           1       0.75      0.59      0.66    200000

    accuracy                           0.70    400000
   macro avg       0.71      0.70      0.70    400000
weighted avg       0.71      0.70      0.70    400000

Training Time: 17.57398796081543 seconds
