# Naive Bayes with MultinomialNB classifier - Nurhayat Altunok

In [None]:
# Importing the required libraries
!pip install nltk

import re
import nltk

# Download the NLTK resources used by the Naive Bayes section
nltk.download('punkt')  # tokenizer models needed by word_tokenize
nltk.download('words')  # English word list that english_words is built from
nltk.download('movie_reviews')  # labelled pos/neg movie review corpus
nltk.download('wordnet')  # lexical database needed by WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # tagger model needed by nltk.pos_tag

from nltk.tokenize import word_tokenize
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import movie_reviews

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# English word list from the NLTK 'words' corpus, stored as a set for membership tests
english_words = set(words.words())

# Shared WordNet lemmatizer instance used by the preprocessing step
lemmatizer = WordNetLemmatizer()


def is_english_word(word):
    """Return True when the lower-cased ``word`` appears in ``english_words``."""
    lowered = word.lower()
    return lowered in english_words

def preprocess_text(text):
    """Clean a raw document and return it as a space-joined string of lemmas.

    The text is stripped of every character that is not an ASCII letter, a
    digit or whitespace, lower-cased and tokenized. Each token is lemmatized
    with the shared WordNet lemmatizer, and only lemmas that are purely
    alphabetic and accepted by ``is_english_word`` are kept.
    """
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(stripped.lower()))
    return ' '.join(
        lemma for lemma in lemmas
        if lemma.isalpha() and is_english_word(lemma)
    )

# File ids of the NLTK movie_reviews corpus, one list per sentiment label
positive_reviews = movie_reviews.fileids('pos')
negative_reviews = movie_reviews.fileids('neg')

# Build (cleaned_text, label) pairs: all positive reviews first, then all negative ones
all_reviews = [
    (preprocess_text(movie_reviews.raw(fileid)), label)
    for label, fileids in (('pos', positive_reviews), ('neg', negative_reviews))
    for fileid in fileids
]


In [None]:
# Hold out 20% of the (text, label) pairs for evaluation; the fixed seed keeps the split repeatable
train_reviews, test_reviews = train_test_split(all_reviews, test_size=0.2, random_state=42)

# Unzip each split into parallel lists of documents and pos/neg labels
train_texts, y_train = (list(column) for column in zip(*train_reviews))
test_texts, y_test = (list(column) for column in zip(*test_reviews))

# Fit the TF-IDF vocabulary on the training documents only, then project the
# held-out documents into that same feature space
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Feature names learned by the vectorizer (reused by the POS analysis cell)
vocabulary = vectorizer.get_feature_names_out()

# Train multinomial Naive Bayes on the TF-IDF features and label the test documents
nb_classifier = MultinomialNB().fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)

# Share of test documents whose predicted label matches the true label
accuracy = accuracy_score(y_test, nb_predictions)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
print("Classification Report for Naive Bayes Sentiment Analysis:\n")
print(classification_report(y_test, nb_predictions))

Accuracy: 0.79
Classification Report for Naive Bayes Sentiment Analysis:

              precision    recall  f1-score   support

         neg       0.78      0.82      0.80       201
         pos       0.80      0.76      0.78       199

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400



In [None]:
# POS-Tagging with NLTK

def print_top_sentiment_words(words_list, category_name, num_words=10):
    """Print the words the Naive Bayes model scores as most positive / most negative.

    Every English word in ``words_list`` is scored as a one-word document using
    the fitted ``vectorizer`` and ``nb_classifier``. A word is assigned to the
    class with the higher probability (exact ties are dropped), and the
    ``num_words`` highest-probability words of each class are printed.

    Args:
        words_list: iterable of candidate words.
        category_name: label used in the printed headings (e.g. "Adjectives").
        num_words: number of words to print per class.
    """
    # Keep English words only, de-duplicated while preserving their order
    candidates = list(dict.fromkeys(word for word in words_list if is_english_word(word)))

    positive_words, negative_words = [], []
    if candidates:  # scikit-learn estimators reject an empty batch
        # Score all candidates in a single batched call instead of one
        # transform/predict_proba round-trip per word.
        probabilities = nb_classifier.predict_proba(vectorizer.transform(candidates))

        # Look the probability columns up by label rather than assuming their order
        class_labels = list(nb_classifier.classes_)
        pos_col, neg_col = class_labels.index('pos'), class_labels.index('neg')

        for word, scores in zip(candidates, probabilities):
            if scores[pos_col] > scores[neg_col]:
                positive_words.append((word, scores[pos_col]))
            elif scores[neg_col] > scores[pos_col]:
                negative_words.append((word, scores[neg_col]))

    # Highest class probability first (stable sort keeps input order on ties)
    positive_words.sort(key=lambda x: x[1], reverse=True)
    negative_words.sort(key=lambda x: x[1], reverse=True)

    print(f"Top {num_words} Positive {category_name}:")
    for word, sentiment in positive_words[:num_words]:
        print(f"{word}: {sentiment:.4f} sentiment")

    print(f"\nTop {num_words} Negative {category_name}:")
    for word, sentiment in negative_words[:num_words]:
        print(f"{word}: {sentiment:.4f} sentiment")


# Tag every vocabulary word exactly once (each word is tagged on its own,
# without sentence context) and reuse that tag for all three buckets, instead
# of running the tagger three times per word.
pos_tag_by_word = {word: nltk.pos_tag([word])[0][1] for word in vocabulary}

# Separate words based on their POS tags - NLTK POS
adjectives = [word for word, tag in pos_tag_by_word.items() if tag.startswith('JJ')]
verbs = [word for word, tag in pos_tag_by_word.items() if tag.startswith('VB')]
nouns = [word for word, tag in pos_tag_by_word.items() if tag.startswith('NN')]


# Print top sentiment words for each category
print_top_sentiment_words(adjectives, "Adjectives")
print()
print_top_sentiment_words(verbs, "Verbs")
print()
print_top_sentiment_words(nouns, "Nouns")


Top 10 Positive Adjectives:
outstanding: 0.7119 sentiment
political: 0.7010 sentiment
memorable: 0.6974 sentiment
hilarious: 0.6801 sentiment
effective: 0.6711 sentiment
legal: 0.6690 sentiment
fantastic: 0.6662 sentiment
realistic: 0.6581 sentiment
overall: 0.6577 sentiment
private: 0.6500 sentiment

Top 10 Negative Adjectives:
worst: 0.7929 sentiment
stupid: 0.7652 sentiment
ridiculous: 0.7469 sentiment
bad: 0.7339 sentiment
unfunny: 0.7080 sentiment
ludicrous: 0.7003 sentiment
terrible: 0.6877 sentiment
idiotic: 0.6844 sentiment
laughable: 0.6825 sentiment
poor: 0.6751 sentiment

Top 10 Positive Verbs:
astounding: 0.6589 sentiment
refreshing: 0.6497 sentiment
beloved: 0.6461 sentiment
uplifting: 0.6430 sentiment
hatred: 0.6391 sentiment
hunting: 0.6344 sentiment
ted: 0.6299 sentiment
understanding: 0.6229 sentiment
frightening: 0.6227 sentiment
stunning: 0.6216 sentiment

Top 10 Negative Verbs:
wasted: 0.7593 sentiment
supposed: 0.7266 sentiment
insulting: 0.6727 sentiment
uninteres

In [None]:
# Interactive demo: classify text typed by the user until they enter 'exit'
while True:
    user_input = input("Enter your text (or 'exit' to quit): ")

    if user_input.lower() == 'exit':
        print("Exiting the program.")
        break

    # Clean the input exactly like the training reviews (lemmatized, English-only
    # tokens) before vectorizing; otherwise inflected forms such as plurals never
    # match the lemma-based TF-IDF vocabulary and are silently ignored.
    user_input_vector = vectorizer.transform([preprocess_text(user_input)])

    # Class probabilities, keyed by label so the column order is never assumed
    sentiment_probabilities = nb_classifier.predict_proba(user_input_vector)[0]
    confidence_by_class = dict(zip(nb_classifier.classes_, sentiment_probabilities))

    # Get the predicted sentiment class and its human-readable name
    predicted_sentiment_class = nb_classifier.predict(user_input_vector)[0]
    predicted_emotion = "positive" if predicted_sentiment_class == 'pos' else "negative"

    # Print the predicted sentiment and confidence for each class
    print(f"Predicted Sentiment: {predicted_emotion}")
    print(f"Positive Confidence: {confidence_by_class['pos']:.4f}")
    print(f"Negative Confidence: {confidence_by_class['neg']:.4f}")


Enter your text (or 'exit' to quit): exit
Exiting the program.


# BERT - Mia Nick

Data

In [None]:
# Importing the required libraries
!pip install nltk
!pip install transformers

import re
import nltk

# Download the NLTK resources used by the BERT section
nltk.download('punkt')  # tokenizer models needed by word_tokenize (preprocessing cell)
nltk.download('words')  # English word list that english_words is built from
nltk.download('movie_reviews')  # labelled pos/neg movie review corpus used for fine-tuning
nltk.download('wordnet')  # lexical database needed by WordNetLemmatizer
# NOTE(review): no nltk.pos_tag call is visible in the BERT section, so this
# tagger download may be unnecessary here - confirm before removing.
nltk.download('averaged_perceptron_tagger')

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import movie_reviews
from collections import defaultdict

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.1 MB/s[0m eta [36m0:00:0

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# English word list from the NLTK 'words' corpus, stored as a set for membership tests
english_words = set(words.words())

# Shared WordNet lemmatizer instance used by the preprocessing step
lemmatizer = WordNetLemmatizer()


def is_english_word(word):
    """Return True when the lower-cased ``word`` appears in ``english_words``."""
    lowered = word.lower()
    return lowered in english_words

def preprocess_text(text):
    """Clean a raw document and return it as a space-joined string of lemmas.

    The text is stripped of every character that is not an ASCII letter, a
    digit or whitespace, lower-cased and tokenized. Each token is lemmatized
    with the shared WordNet lemmatizer, and only lemmas that are purely
    alphabetic and accepted by ``is_english_word`` are kept.
    """
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(stripped.lower()))
    return ' '.join(
        lemma for lemma in lemmas
        if lemma.isalpha() and is_english_word(lemma)
    )

# File ids of the NLTK movie_reviews corpus, one list per sentiment label
positive_reviews = movie_reviews.fileids('pos')
negative_reviews = movie_reviews.fileids('neg')

# Build (cleaned_text, label) pairs: all positive reviews first, then all negative ones.
# NOTE(review): the fine-tuning cell reassigns all_reviews from the raw review
# text, so this preprocessed list looks unused in the BERT section - confirm.
all_reviews = [
    (preprocess_text(movie_reviews.raw(fileid)), label)
    for label, fileids in (('pos', positive_reviews), ('neg', negative_reviews))
    for fileid in fileids
]


In [None]:
# Seed torch so the classifier-head initialisation and the batch shuffling are
# repeatable between runs (as far as the compute backend allows)
torch.manual_seed(42)

# Load the pre-trained DistilBERT tokenizer and a sequence-classification model.
# The classification head is newly initialised and is trained below.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)


# Data preprocessing: raw review text plus integer labels
positive_reviews = movie_reviews.fileids('pos')
negative_reviews = movie_reviews.fileids('neg')

all_reviews = [movie_reviews.raw(fileid) for fileid in positive_reviews] + \
              [movie_reviews.raw(fileid) for fileid in negative_reviews]
labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)  # 1 for positive, 0 for negative

# Tokenize and encode the reviews (padded to a common length, truncated to the model limit)
encoded_reviews = tokenizer(all_reviews, padding=True, truncation=True, return_tensors='pt')

# Split token ids, attention masks and labels together so the rows stay aligned.
# The same random_state yields the same train/test rows for every array.
(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(encoded_reviews['input_ids'],
                                               encoded_reviews['attention_mask'],
                                               torch.tensor(labels),
                                               test_size=0.2,
                                               random_state=42)

# Define a data loader for training
batch_size = 8
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Fine-tune DistilBERT on the sentiment analysis task
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 1  # Increase this for better performance
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0  # sum of the per-batch losses for this epoch
    for batch_inputs, batch_masks, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        # Pass the attention mask so padding tokens are ignored by self-attention.
        # The loss is computed once, by `criterion`, rather than also inside the model.
        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        loss = criterion(outputs.logits, batch_labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss:.4f}")



Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1/1, Loss: 120.3226


In [None]:
# Evaluation on the test set
model.eval()

# Move test inputs and labels to the device
test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)

# Initialize variables to store predictions
all_predicted_labels = []
batch_size = 8  # Adjust the batch size for inference

# Perform inference in batches to reduce GPU memory usage
with torch.no_grad():
    for i in range(0, len(test_inputs), batch_size):
        batch_inputs = test_inputs[i:i+batch_size]

        # Rebuild the attention mask from the padding id so padded positions are
        # ignored (assumes the pad id only occurs as padding in the encoded reviews)
        batch_mask = (batch_inputs != tokenizer.pad_token_id).long()

        # Forward pass for the batch
        batch_outputs = model(input_ids=batch_inputs, attention_mask=batch_mask)
        batch_logits = batch_outputs.logits

        # argmax over the logits gives the same class as argmax over softmax
        # probabilities, so the softmax step is not needed for the label
        batch_predicted_labels = torch.argmax(batch_logits, dim=1).cpu().numpy()

        # Append predicted labels for this batch to the list
        all_predicted_labels.extend(batch_predicted_labels)

# Convert the list of predicted labels to a numpy array
predicted_labels = np.array(all_predicted_labels)
true_labels = test_labels.cpu().numpy()

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# Generate Classification Report (index 0 = negative, index 1 = positive)
class_names = ['negative', 'positive']
report = classification_report(true_labels, predicted_labels, target_names=class_names)
print("Classification Report for BERT Sentiment Analysis:\n")
print(report)


Accuracy: 0.83
Classification Report for BERT Sentiment Analysis:

              precision    recall  f1-score   support

    negative       0.85      0.81      0.83       201
    positive       0.82      0.86      0.84       199

    accuracy                           0.83       400
   macro avg       0.84      0.84      0.83       400
weighted avg       0.84      0.83      0.83       400



In [None]:
import torch
import numpy as np

# Interactive demo: score user-typed text with the model trained above until 'exit' is entered
while True:
    user_input = input("Enter your text (or 'exit' to quit): ")

    if user_input.lower() == 'exit':
        print("Exiting the program.")
        break

    # Encode the text and place every resulting tensor on the model's device
    user_input_tokens = tokenizer(user_input, padding=True, truncation=True, return_tensors='pt')
    user_input_encoded = {key: val.to(device) for key, val in user_input_tokens.items()}

    # Forward pass without gradient tracking, then softmax over the two classes
    with torch.no_grad():
        user_logits = model(**user_input_encoded).logits
    user_probs = user_logits.softmax(dim=1).cpu().numpy()

    # Label index -> display name
    class_names = {0: "Negative", 1: "Positive"}

    # The class with the highest probability is the prediction
    predicted_sentiment_idx = np.argmax(user_probs)
    predicted_sentiment = class_names[predicted_sentiment_idx].capitalize()

    # Report the prediction followed by one probability line per class
    report_lines = [f"Predicted Sentiment: {predicted_sentiment}"]
    report_lines += [
        f"{class_name.capitalize()} Sentiment Probability: {user_probs[0][class_idx]:.4f}"
        for class_idx, class_name in class_names.items()
    ]
    print("\n".join(report_lines))


Enter your text (or 'exit' to quit): how are you doing?
Predicted Sentiment: Negative
Negative Sentiment Probability: 0.5826
Positive Sentiment Probability: 0.4174
Enter your text (or 'exit' to quit): exit.
Predicted Sentiment: Negative
Negative Sentiment Probability: 0.5534
Positive Sentiment Probability: 0.4466
Enter your text (or 'exit' to quit): exit
Exiting the program.


# Prototype Chatbot with Sentiments - BERT

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BlenderbotSmallForConditionalGeneration, BlenderbotSmallTokenizer

# Sentiment analysis: reuse the DistilBERT tokenizer and model fine-tuned on the
# movie reviews earlier in this notebook. Loading "distilbert-base-uncased"
# again here would attach a freshly initialised, untrained classification head,
# whose predictions stay close to chance regardless of the input.
sentiment_model_name = "distilbert-base-uncased"
sentiment_tokenizer = tokenizer
sentiment_model = model
sentiment_model.eval()  # inference mode (disables dropout)

# Load BlenderBot chatbot
chatbot_model_name = "facebook/blenderbot_small-90M"
chatbot_tokenizer = BlenderbotSmallTokenizer.from_pretrained(chatbot_model_name)
chatbot_model = BlenderbotSmallForConditionalGeneration.from_pretrained(chatbot_model_name)

# Label index -> display name (matches the 0 = negative / 1 = positive training labels)
class_names = {0: "Negative", 1: "Positive"}

while True:
    user_input = input("Enter your text (or 'exit' to quit): ")

    if user_input.lower() == 'exit':
        print("Exiting the program.")
        break

    # Sentiment analysis using the fine-tuned DistilBERT; the tensors must live
    # on the same device as the model
    user_input_tokens = sentiment_tokenizer(user_input, padding=True, truncation=True, return_tensors='pt')
    user_input_encoded = {key: val.to(device) for key, val in user_input_tokens.items()}

    with torch.no_grad():
        user_output = sentiment_model(**user_input_encoded)
        user_logits = user_output.logits
        user_probs = torch.softmax(user_logits, dim=1).cpu().numpy()

    predicted_sentiment_idx = np.argmax(user_probs)
    predicted_sentiment = class_names[predicted_sentiment_idx].capitalize()

    # Generate chatbot response; the predicted sentiment is prepended to the user text
    chatbot_input = f"Predicted Sentiment: {predicted_sentiment}. {user_input}"
    chatbot_input_ids = chatbot_tokenizer.encode(chatbot_input, return_tensors="pt")

    chatbot_response_ids = chatbot_model.generate(chatbot_input_ids, max_length=50, num_return_sequences=1, pad_token_id=chatbot_tokenizer.eos_token_id)
    chatbot_response = chatbot_tokenizer.decode(chatbot_response_ids[0], skip_special_tokens=True)

    # Print the predicted sentiment, sentiment probabilities, and chatbot response
    print(f"Predicted Sentiment: {predicted_sentiment}")
    for class_idx, class_name in class_names.items():
        print(f"{class_name.capitalize()} Sentiment Probability: {user_probs[0][class_idx]:.4f}")
    print(f"Chatbot: {chatbot_response}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter your text (or 'exit' to quit): I feel bad.
Predicted Sentiment: Negative
Negative Sentiment Probability: 0.5309
Positive Sentiment Probability: 0.4691
Chatbot: i'm sorry to hear that. do you have any idea what's going on?
Enter your text (or 'exit' to quit): I am hungry, but there isn't anything to eat.
Predicted Sentiment: Negative
Negative Sentiment Probability: 0.5341
Positive Sentiment Probability: 0.4659
Chatbot: i'm sorry to hear that. i hope you find something you like to eat soon.
Enter your text (or 'exit' to quit): I am happy.
Predicted Sentiment: Negative
Negative Sentiment Probability: 0.5347
Positive Sentiment Probability: 0.4653
Chatbot: that's great! i'm happy for you. do you have any plans for the weekend?
Enter your text (or 'exit' to quit): I'm happy.
Predicted Sentiment: Negative
Negative Sentiment Probability: 0.5334
Positive Sentiment Probability: 0.4666
Chatbot: that's great! i'm glad you're happy. what's going on?
Enter your text (or 'exit' to quit): I am p

# Requirements.txt

In [None]:
# Write the pinned version of every package installed in this environment
# (not only the ones this notebook imports) to requirements.txt
!pip freeze > requirements.txt


In [None]:
# Echo the generated requirements.txt so its contents appear in the notebook output
with open('requirements.txt', 'r') as requirements_file:
    requirements_text = requirements_file.read()
print(requirements_text)


absl-py==1.4.0
aiohttp==3.8.5
aiosignal==1.3.1
alabaster==0.7.13
albumentations==1.3.1
altair==4.2.2
anyio==3.7.1
appdirs==1.4.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array-record==0.4.1
arviz==0.15.1
astropy==5.3.3
astunparse==1.6.3
async-timeout==4.0.3
attrs==23.1.0
audioread==3.0.0
autograd==1.6.2
Babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.11.2
bleach==6.0.0
blinker==1.4
blis==0.7.10
blosc2==2.0.0
bokeh==3.2.2
bqplot==0.12.40
branca==0.6.0
build==1.0.3
CacheControl==0.13.1
cachetools==5.3.1
catalogue==2.0.9
certifi==2023.7.22
cffi==1.15.1
chardet==5.2.0
charset-normalizer==3.2.0
chex==0.1.7
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpickle==2.2.1
cmake==3.27.4.1
cmdstanpy==1.1.0
colorcet==3.0.1
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.2
cons==0.4.6
contextlib2==21.6.0
contourpy==1.1.0
convertdate==2.4.0
cryptography==41.0.3
cufflinks==0.17.3
cupy-cuda11x==11.0.0
cvxopt==1.3.2
cvxpy==1.3.2
cycler==0.11.0
cymem==2.0.7
Cython==3.0.2
da