<a href="https://colab.research.google.com/github/EmilisGit/Deep_learning/blob/main/NLP_ND.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Sentimentų analizė


In [15]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import pipeline

In [8]:
#@title NaiveBayesSentiment model
class NaiveBayesSentiment:
    '''
        Naive Bayes model for sentiment analysis
    '''

    def __init__(self):
        self.train_x = None
        self.train_y = None
        self.likelihood_positive = None
        self.likelihood_negative = None

    def fit(self, train_x, train_y):
        '''
            Fit the Naive Bayes model to the dataset

            Parameters:
                train_x : pd.Series
                    Training features
                train_y : pd.Series
                    Training labels

            P(A|B) = P(B|A) * P(A) / P(B)
        '''

        self.train_x = train_x
        self.train_y = train_y

        # calculate prior class probabilities
        prior_pos = len(self.train_y[self.train_y==1]) / len(self.train_x)
        prior_neg = len(train_y[train_y==0]) / len(train_x)

        pos = train_x[train_y==1]
        neg = train_x[train_y==0]

        # count words in total in positive and negative classes
        word_count_positive = pos.str.len().sum()
        word_count_negative = neg.str.len().sum()

        # count each word in positive and negative classes
        # (optional) count zero counts for words which are present in one class to another class
        each_word_count_positive = pd.Series([x for y in pos.to_list() for x in y]).value_counts()
        each_word_count_negative = pd.Series([x for y in neg.to_list() for x in y]).value_counts()

        not_in_negative = []
        for word in each_word_count_positive.index:
            if word not in each_word_count_negative.index:
                not_in_negative.append(word)

        not_in_positive = [x for x in each_word_count_negative.index
                        if x not in each_word_count_positive.index]

        not_in_neg_zeros = pd.Series(index=not_in_negative, data=0)
        not_in_pos_zeros = pd.Series(index=not_in_positive, data=0)

        each_word_count_positive = pd.concat([each_word_count_positive, not_in_pos_zeros])
        each_word_count_negative = pd.concat([each_word_count_negative, not_in_neg_zeros])

        # calculate vocabulary size
        vocabulary_size = len(set([x for y in train_x.to_list() for x in y]))

        # get likelihoods for each word in both classes
        self.likelihood_positive = (each_word_count_positive + 1) / (word_count_positive + vocabulary_size)
        self.likelihood_negative = (each_word_count_negative + 1) / (word_count_negative + vocabulary_size)

    def predict(self, test_x: pd.Series) -> list:
        '''
            Predict the sentiment of the test set

            Parameters:
                test_x : pd.Series
                    Test features

            Returns:
                list of int
                    Predicted labels
        '''
        predictions = []
        for sentence in tqdm(test_x.to_list()):

            positive_prob = 1
            negative_prob = 1
            mean_positive = self.likelihood_positive.mean()
            mean_negative = self.likelihood_negative.mean()

            for word in sentence:
                if word not in self.likelihood_positive.index:
                    result_pos = mean_positive
                    result_neg = mean_negative
                else:
                    result_pos = self.likelihood_positive[word]
                    result_neg = self.likelihood_negative[word]

                positive_prob *= result_pos
                negative_prob *= result_neg

            predicted_class = np.argmax([negative_prob, positive_prob])
            predictions.append(predicted_class)

        return predictions

In [14]:
with open("naive_bayes_sentiment_model.pkl", 'rb') as file:
    naive_bayes_model = pickle.load(file)

test_text = "Good, wonderful fantastic great"
naive_bayes_model.predict(pd.Series([test_text.split(" ")]))

100%|██████████| 1/1 [00:00<00:00, 10.89it/s]


[1]

In [16]:
sentiment_analyzer = pipeline("sentiment-analysis")
sentiment_analyzer(test_text)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998756647109985}]

#Poezija