In [25]:
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [26]:
# Raw texts and their sentiment labels, filled in parallel (same index = same sample).
dataset = []
labels = []

# The text and the label are both taken from the FIRST parsed CSV column, which is
# then split on commas by hand — presumably each record is quoted as a whole in the
# file (TODO confirm against the data).
# NOTE(review): a text that itself contains a comma is cut at that comma and the
# label is taken from the piece that follows it.
with open('sentiment-analysis.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(reader, None)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; row[0] would fail.
            continue
        result = row[0].split(",")
        # Keep only records that contain at least "text,label".
        if len(result) > 1:
            dataset.append(result[0])
            labels.append(result[1])

In [27]:
# Text vectorisation: turn every raw text into a TF-IDF feature vector.
# NOTE(review): the vectorizer is fitted on all documents before any train/test
# split is made, so documents later used for testing also shape the vocabulary
# and IDF weights.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(dataset)

# Dense copy of the TF-IDF matrix (presumably needed because GaussianNB does not
# accept sparse input — confirm).
X = tfidf_matrix.toarray()

# One instance of each Naive Bayes variant to be compared.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [28]:
############# Holdout ############################
def holdout(classifier, test_size=0.3, random_state=42):
    """Fit ``classifier`` on a single train/test split and print its report.

    The split is taken from the module-level feature matrix ``X`` and the
    ``labels`` list. The classifier is fitted in place, so after the call it
    holds the model trained on the training part of this split.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        test_size: share of the samples held out for testing. The default of
            0.3 keeps the original 70 % train / 30 % test split.
        random_state: seed passed to ``train_test_split`` so that the split is
            reproducible. The default of 42 matches the original behaviour.

    Returns:
        None. The classification report is printed to stdout.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, labels, test_size=test_size, random_state=random_state
    )

    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Model evaluation: per-class precision / recall / F1 on the held-out part.
    print(classification_report(y_test, y_pred))

In [29]:
# Hold-out evaluation of the Gaussian Naive Bayes classifier.
holdout(gnb)

              precision    recall  f1-score   support

    Negative       0.92      1.00      0.96        11
    Positive       1.00      0.94      0.97        18

    accuracy                           0.97        29
   macro avg       0.96      0.97      0.96        29
weighted avg       0.97      0.97      0.97        29



In [30]:
# Hold-out evaluation of the Multinomial Naive Bayes classifier.
holdout(mnb)

              precision    recall  f1-score   support

    Negative       0.73      1.00      0.85        11
    Positive       1.00      0.78      0.88        18

    accuracy                           0.86        29
   macro avg       0.87      0.89      0.86        29
weighted avg       0.90      0.86      0.86        29



In [31]:
# Hold-out evaluation of the Bernoulli Naive Bayes classifier.
holdout(bnb)

              precision    recall  f1-score   support

    Negative       0.69      1.00      0.81        11
    Positive       1.00      0.72      0.84        18

    accuracy                           0.83        29
   macro avg       0.84      0.86      0.83        29
weighted avg       0.88      0.83      0.83        29



In [32]:
############# Cross-validation ############################
def cross_validation(classifier, num_folds=5):
    """Evaluate ``classifier`` with sequential k-fold cross-validation.

    The module-level feature matrix ``X`` and the ``labels`` list are cut into
    ``num_folds`` contiguous folds, in file order and without shuffling. Each
    fold is used once as the test set while the classifier is refitted, in
    place, on all remaining samples. The accuracy of every fold and the mean
    over the folds are printed.

    When the number of samples is not a multiple of ``num_folds``, the first
    ``n_samples % num_folds`` folds receive one extra sample, so every sample
    is tested exactly once. Previously the trailing remainder was only ever
    used for training.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        num_folds: number of folds; defaults to 5, the original fixed value.

    Returns:
        None. Results are printed to stdout.

    Raises:
        ValueError: if ``num_folds`` is smaller than 2 or larger than the
            number of samples, which would leave an empty train or test set.
    """
    n_samples = X.shape[0]
    if not 2 <= num_folds <= n_samples:
        raise ValueError(
            f'num_folds must be between 2 and the number of samples '
            f'({n_samples}), got {num_folds}'
        )

    # Every fold gets base_size samples; the first `remainder` folds get one more.
    base_size, remainder = divmod(n_samples, num_folds)

    total = 0
    start = 0
    for count_fold in range(1, num_folds + 1):
        # Split the data for this fold: [start, end) is the test slice.
        end = start + base_size + (1 if count_fold <= remainder else 0)

        data_test = X[start:end]
        labels_test = labels[start:end]
        # Everything before and after the test slice is used for training.
        data_train = [*X[:start], *X[end:]]
        labels_train = [*labels[:start], *labels[end:]]

        classifier.fit(data_train, labels_train)

        # Model evaluation on the held-out fold.
        y_pred = classifier.predict(data_test)
        acc_score = accuracy_score(labels_test, y_pred)
        total += acc_score
        print(f'**********Fold {count_fold}****************')
        print(f'Accuracy: {acc_score}\n')

        start = end

    # Unweighted mean of the per-fold accuracies.
    m_acc = total / num_folds
    print(f'Average accuracy: {m_acc}')

In [33]:
# k-fold cross-validation of the Gaussian Naive Bayes classifier.
cross_validation(gnb)

**********Fold 1****************
Accuracy: 1.0

**********Fold 2****************
Accuracy: 1.0

**********Fold 3****************
Accuracy: 0.8947368421052632

**********Fold 4****************
Accuracy: 0.8947368421052632

**********Fold 5****************
Accuracy: 1.0

Average accuracy: 0.9578947368421054


In [34]:
# k-fold cross-validation of the Multinomial Naive Bayes classifier.
cross_validation(mnb)

**********Fold 1****************
Accuracy: 1.0

**********Fold 2****************
Accuracy: 0.8421052631578947

**********Fold 3****************
Accuracy: 0.7368421052631579

**********Fold 4****************
Accuracy: 0.8947368421052632

**********Fold 5****************
Accuracy: 1.0

Average accuracy: 0.8947368421052632


In [35]:
# k-fold cross-validation of the Bernoulli Naive Bayes classifier.
cross_validation(bnb)

**********Fold 1****************
Accuracy: 1.0

**********Fold 2****************
Accuracy: 0.7368421052631579

**********Fold 3****************
Accuracy: 0.7368421052631579

**********Fold 4****************
Accuracy: 0.8947368421052632

**********Fold 5****************
Accuracy: 1.0

Average accuracy: 0.8736842105263157
