From research, fine tuning a dataset using Multinomial Naive Bayes or SVC doesn't work as well, since the model is overfitted.

e.g. The Twitter Sentiment dataset never reaches >75% accuracy on confusion matrix.

In [None]:
# Example code
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

bayes = MultinomialNB()
bayes.fit(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y)
y_true = y_test

y_pred = bayes.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['neg', 'pos'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

In [14]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.6 MB/s[0m eta [36m0:00:0

In [8]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

plt.style.use('ggplot')

nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

Method 1: NLTK Continuous Bag Of Words

In [6]:
# Create example texts (with bag of words)

happy_example = " I'm doing well and I can't wait for what's next!"
sad_example = "I hate everything and I dont want to keep going."
th = nltk.word_tokenize(happy_example)
ts = nltk.word_tokenize(sad_example)
print(th, '\n', ts)

['I', "'m", 'doing', 'well', 'and', 'I', 'ca', "n't", 'wait', 'for', 'what', "'s", 'next', '!'] 
 ['I', 'hate', 'everything', 'and', 'I', 'dont', 'want', 'to', 'keep', 'going', '.']


Sentiment Analysis

In [12]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

sia = SentimentIntensityAnalyzer()

# Create analysis matrix
data = {'happy': sia.polarity_scores(happy_example), 'sad': sia.polarity_scores(sad_example)}
pd.DataFrame.from_dict(data, orient='index')

Unnamed: 0,neg,neu,pos,compound
happy,0.0,0.77,0.23,0.3382
sad,0.451,0.549,0.0,-0.6023


Method 2: HuggingFace

In [19]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM
from scipy.special import softmax

In [35]:
# Load model directly -> Code modified from https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)


In [36]:
def preprocess(text):
    new_text = []


    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

encoded_text = tokenizer(preprocess(happy_example), return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
happy_scores_dict = {
    'neg' : scores[0],
    'neu' : scores[1],
    'pos' : scores[2]
}
encoded_text = tokenizer(preprocess(sad_example), return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
sad_scores_dict = {
    'neg' : scores[0],
    'neu' : scores[1],
    'pos' : scores[2]
}

In [39]:
dataHF = {'happy': happy_scores_dict, 'sad': sad_scores_dict}
pd.DataFrame.from_dict(dataHF, orient='index')

Unnamed: 0,neg,neu,pos
happy,0.001066,0.007039,0.991894
sad,0.975153,0.021368,0.003478


In [40]:
pd.DataFrame.from_dict(data, orient='index')

Unnamed: 0,neg,neu,pos,compound
happy,0.0,0.77,0.23,0.3382
sad,0.451,0.549,0.0,-0.6023


As you can see, the tokenized checkpoint provides a much better sentiment analysis model.

Method 3: Create full pipeline

In [46]:
from transformers import pipeline

# Create sentiment analysis pipeline
sent_pipeline = pipeline("text-classification")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [47]:
sent_pipeline('I love sentiment analysis!')

[{'label': 'POSITIVE', 'score': 0.9997853636741638}]

In [49]:
sent_pipeline('I hate sentiment analysis!')

[{'label': 'NEGATIVE', 'score': 0.9992958307266235}]