In [23]:
import json

# Load each dataset split from "<name>.json" in the current working directory
# into `data`, keyed by the split name ("train", "dev", "test").
data = {}
for name in ["train", "dev", "test"]:
    # Pass the encoding explicitly: JSON text is UTF-8, while open() would
    # otherwise fall back to the platform locale encoding and could mis-decode
    # (or fail on) non-ASCII text on some systems.
    with open("{0}.json".format(name), encoding="utf-8") as infile:
        data[name] = json.load(infile)

In [None]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pandas as pd
import pickle
from nltk.corpus import stopwords

In [24]:
# Turn the "train" split into a DataFrame, then pull out the two columns the
# rest of the notebook works with: the 'text' column as X and the 'label'
# column as y.
train = pd.DataFrame.from_dict(data["train"])

X, y = train['text'], train['label']

In [25]:
from nltk.stem import WordNetLemmatizer

# Note: this object is a WordNet lemmatizer; the variable name `stemmer` is
# kept unchanged because other cells may refer to it.
stemmer = WordNetLemmatizer()


def _clean_document(text, lemmatizer):
    """Normalize one raw text into a space-joined string of lemmatized tokens.

    Steps, in order: replace every non-word character with a space, drop
    single ASCII letters surrounded by whitespace, drop a single ASCII letter
    at the very start, collapse whitespace runs, lowercase, split on
    whitespace and lemmatize each token.

    Parameters
    ----------
    text : object
        Raw document; converted with ``str()`` before processing.
    lemmatizer : object
        Anything exposing ``lemmatize(word) -> str``.

    Returns
    -------
    str
        The cleaned document.
    """
    # Replace all special (non-word) characters with spaces.
    cleaned = re.sub(r'\W', ' ', str(text))

    # Remove single letters standing between whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Remove a single letter at the start of the text. The previous pattern
    # was r'\^[a-zA-Z]\s+', which looks for a literal '^' character; none can
    # exist after the \W substitution above, so the rule never matched.
    # This rule also covers the old "prefixed 'b'" removal (r'^b\s+'), which
    # would now be unreachable and is therefore omitted.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse runs of whitespace into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Lowercase, tokenize on whitespace (this also trims the ends), lemmatize.
    tokens = cleaned.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Iterate over the raw text column itself rather than indexing `X` by
# position: `X` is rebound to the count matrix in a later cell, so reading it
# here made this cell produce garbage when re-run, and label-based `X[i]`
# additionally assumed a default 0..n-1 index.
documents = [_clean_document(raw_text, stemmer) for raw_text in train['text']]

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the cleaned documents, limited to at most
# 2000 vocabulary terms; terms present in fewer than 5 documents or in more
# than 70% of documents are ignored.
# NOTE(review): the vocabulary is fitted on every row before the later
# train/test split, so held-out rows influence which terms are kept.
# NOTE(review): this rebinds `X` from the raw text column to a dense array.
vectorizer = CountVectorizer(
    max_df=0.7,
    min_df=5,
    max_features=2000,
)
term_counts = vectorizer.fit_transform(documents)
X = term_counts.toarray()

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts in `X` as TF-IDF and keep a dense copy.
# NOTE(review): the transformer is fitted on all rows before the later
# train/test split, so its weights are computed with held-out rows included.
tfidfconverter = TfidfTransformer()
tfidf_weights = tfidfconverter.fit_transform(X)
X_TFIDF = tfidf_weights.toarray()

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows with a fixed seed. The same settings are applied
# to the count features and to the TF-IDF features.
split_options = {"test_size": 0.2, "random_state": 0}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

(
    X_train_TFIDF,
    X_test_TFIDF,
    y_train_TFIDF,
    y_test_TFIDF,
) = train_test_split(X_TFIDF, y, **split_options)

In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# Multinomial Naive Bayes on raw term counts. It gets its own name so the
# fitted model stays reachable; previously `classifierM` was rebound right
# after this fit and the count-based model was lost.
classifierM_counts = MultinomialNB()
classifierM_counts.fit(X_train, y_train)
y_predM = classifierM_counts.predict(X_test)

# Multinomial Naive Bayes on TF-IDF features.
classifierM_TFIDF = MultinomialNB()
classifierM_TFIDF.fit(X_train_TFIDF, y_train_TFIDF)
y_pred_TFIDF = classifierM_TFIDF.predict(X_test_TFIDF)

# Backward-compatible alias: after the original cell ran, `classifierM`
# referred to the TF-IDF model, so keep that binding for any later cell.
classifierM = classifierM_TFIDF

# Bernoulli Naive Bayes on the count features.
classifierB = BernoulliNB()
classifierB.fit(X_train, y_train)
y_predB = classifierB.predict(X_test)

In [30]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
)

# Print held-out accuracy, one line per model, in this order:
# multinomial NB on counts, Bernoulli NB on counts, multinomial NB on TF-IDF.
for truth, predicted in (
    (y_test, y_predM),
    (y_test, y_predB),
    (y_test_TFIDF, y_pred_TFIDF),
):
    print(accuracy_score(truth, predicted))

0.6282131661442006
0.6025078369905956
0.606269592476489


In [31]:
# Print one confusion matrix per model, in the order: multinomial NB on
# counts, Bernoulli NB on counts, multinomial NB on TF-IDF.
evaluation_pairs = [
    (y_test, y_predM),
    (y_test, y_predB),
    (y_test_TFIDF, y_pred_TFIDF),
]
for truth, predicted in evaluation_pairs:
    print(confusion_matrix(truth, predicted))

[[ 97 105  61]
 [ 57 611 151]
 [ 43 176 294]]
[[ 88 107  68]
 [ 85 615 119]
 [ 55 200 258]]
[[ 15 184  64]
 [  5 730  84]
 [  4 287 222]]


In [32]:
# Print the per-class precision/recall/F1 report for each model, in the
# order: multinomial NB on counts, Bernoulli NB on counts, multinomial NB on
# TF-IDF.
reports = (
    classification_report(y_test, y_predM),
    classification_report(y_test, y_predB),
    classification_report(y_test_TFIDF, y_pred_TFIDF),
)
for report in reports:
    print(report)

              precision    recall  f1-score   support

    Negative       0.49      0.37      0.42       263
     Neutral       0.68      0.75      0.71       819
    Positive       0.58      0.57      0.58       513

    accuracy                           0.63      1595
   macro avg       0.59      0.56      0.57      1595
weighted avg       0.62      0.63      0.62      1595

              precision    recall  f1-score   support

    Negative       0.39      0.33      0.36       263
     Neutral       0.67      0.75      0.71       819
    Positive       0.58      0.50      0.54       513

    accuracy                           0.60      1595
   macro avg       0.54      0.53      0.53      1595
weighted avg       0.59      0.60      0.60      1595

              precision    recall  f1-score   support

    Negative       0.62      0.06      0.10       263
     Neutral       0.61      0.89      0.72       819
    Positive       0.60      0.43      0.50       513

    accuracy        