In [1]:
import json
data = {}
for name in ["train", "dev", "test"]:
    with open("{0}.json".format(name)) as infile:
        data[name] = json.load(infile)

In [2]:
import numpy as np
import re
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
from sklearn.datasets import load_files
import pandas as pd
import pickle
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jakoa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jakoa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
train = pd.DataFrame.from_dict(data["train"])
test = pd.DataFrame.from_dict(data["test"])

X_train = train['text']
y_train = train['label']

X_test = test['text']
y_test = test['label']

# train
# test

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()
def lemmatize(X: list[str]):
    documents = []
    for sen in range(0, len(X)):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(X[sen]))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        # document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        document = document.split()

        document = [stemmer.lemmatize(word) for word in document]
        document = ' '.join(document)

        documents.append(document)
    return documents

def vectorize(X: list[str], vocabulary):
    vectorizer = CountVectorizer(min_df = 5, max_df = 0.7, vocabulary=vocabulary)
    X = vectorizer.fit_transform(X).toarray()

    tfidfconverter = TfidfTransformer()
    X_TFIDF = tfidfconverter.fit_transform(X).toarray()

    return X_TFIDF

In [5]:
X_train = lemmatize(X_train)
X_test = lemmatize(X_test)

vectorizer = CountVectorizer(min_df = 5, max_df = 0.7)
vocabulary = vectorizer.fit(X_train).vocabulary_

X_train = vectorize(X_train, vocabulary)
X_test = vectorize(X_test, vocabulary)

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

classifierM = MultinomialNB(alpha=0.2)
classifierM.fit(X_train, y_train)
y_predM = classifierM.predict(X_test)


In [7]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(f'MultinomialNB + TFID: {accuracy_score(y_test, y_predM)}')

MultinomialNB + TFID: 0.6240474174428451


In [8]:
print(confusion_matrix(y_test,y_predM))

[[ 21 114  47]
 [  8 513  77]
 [ 12 186 203]]


In [9]:
print(classification_report(y_test,y_predM))

              precision    recall  f1-score   support

    Negative       0.51      0.12      0.19       182
     Neutral       0.63      0.86      0.73       598
    Positive       0.62      0.51      0.56       401

    accuracy                           0.62      1181
   macro avg       0.59      0.49      0.49      1181
weighted avg       0.61      0.62      0.59      1181

