In [142]:
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier

In [143]:
dataset = []
labels = []

with open('sentiment-analysis.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    for row in reader:
        result = str(row[0]).split(",")
        if len(result) > 1: 
            dataset.append(result[0])
            labels.append(result[1])

In [144]:
# vetorização dos textos
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [147]:
############# Holdout ############################
# 30% do conjunto de dados é utilizado para os testes e 70% é utilizado para o treinamento
x_train, x_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# avaliação do modelo
print(classification_report(y_test, y_pred))

# Tabela com rótulos verdadeiros e previstos
data = {'Texto': [vectorizer.inverse_transform(x)[0] for x in x_test],
        'Verdadeiro': y_test, 'Previsto': y_pred}
df = pd.DataFrame(data)
display(df)

acc_score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc_score}')

              precision    recall  f1-score   support

    Negative       0.90      0.82      0.86        11
    Positive       0.89      0.94      0.92        18

    accuracy                           0.90        29
   macro avg       0.90      0.88      0.89        29
weighted avg       0.90      0.90      0.90        29



Unnamed: 0,Texto,Verdadeiro,Previsto
0,"[service, terrible, the, was]",Negative,Negative
1,"[always, good, in, it, me, mood, nostalgic, of...",Positive,Positive
2,"[about, at, care, customer, customers, is, out...",Positive,Positive
3,"[and, confusing, designed, is, poorly, so, the...",Negative,Positive
4,"[back, can, delicious, food, go, has, most, re...",Positive,Positive
5,"[love, product, this]",Positive,Positive
6,"[and, found, intuitive, is, navigation, needed...",Positive,Positive
7,"[adrenaline, coaster, exhilarating, pure, ride...",Positive,Positive
8,"[love, product, this]",Positive,Positive
9,"[book, feel, highly, inspired, made, me, recom...",Positive,Positive


Accuracy: 0.896551724137931


In [146]:
############# Validação Cruzada ############################
num_folds = 5
fold_size = X.shape[0] // num_folds

total = 0

# Conversão da matriz esparsa X para uma matriz densa. 
X_dense = X.toarray()

count_fold = 1
for i in range(num_folds):
    # Dividindo os conjuntos para cada fold
    start = i * fold_size
    end = (i + 1) * fold_size
    data_train = []
    data_test = []
    labels_train = []
    labels_test = []

    for j in range(0, start):
        data_train.append(X_dense[j])
        labels_train.append(labels[j])

    for j in range(end, X.shape[0]):
        data_train.append(X_dense[j])
        labels_train.append(labels[j])

    for j in range(start, end):
        data_test.append(X_dense[j])
        labels_test.append(labels[j])   
   
    classifier.fit(data_train, labels_train)

    # Avaliação modelo
    y_pred = classifier.predict(data_test)
    acc_score = accuracy_score(labels_test, y_pred)
    total += acc_score
    print(f'**********Fold {count_fold}****************')
    print(f'Accuracy: {acc_score}\n')

    count_fold += 1

m_acc = total / num_folds
print(f'Average accuracy: {m_acc}')

**********Fold 1****************
Accuracy: 1.0

**********Fold 2****************
Accuracy: 0.7368421052631579

**********Fold 3****************
Accuracy: 0.6842105263157895

**********Fold 4****************
Accuracy: 0.7368421052631579

**********Fold 5****************
Accuracy: 1.0

Average accuracy: 0.8315789473684211
