In [84]:
import os
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB


from sklearn import metrics
from sklearn.model_selection import cross_val_predict
import pandas as pd

# 1. Feature preparation
Prepare the dataset for training.

In [85]:
# Read the labelled data files located in the "data" directory under the
# current working directory.
project_path = os.path.realpath('')
# data_set_names = ('imdb_labelled.txt', 'amazon_cells_labelled.txt', 'yelp_labelled.txt')
data_set_names = ('tweets-labeled.txt',)
files = [os.path.join(project_path, 'data', file_name) for file_name in data_set_names]

# Collect the raw text lines of every file into one flat list.
lines = []
for f in files:
    with open(f, 'r') as text_file:
        lines += text_file.read().split('\n')

# Keep only valid rows, i.e. tab-separated lines with exactly two columns
# (input, label) whose label is not empty. Each line is split once, and `!=`
# is used instead of the Python-2-only `<>` operator so the cell also runs on
# Python 3.
split_lines = (line.split('\t') for line in lines)
lines = [columns for columns in split_lines if len(columns) == 2 and columns[1] != '']

In [86]:
# Split the labelled rows into a training part and a held-out part.
# TRAIN_PERCENT is the fraction of rows that goes to training.
TRAIN_PERCENT = 0.8

total_data = len(lines)
total_train_data = int(total_data * TRAIN_PERCENT)

# Shuffle in place, then use the first N rows for training and the rest for testing.
random.shuffle(lines)

# Training rows are unpacked into two parallel lists: documents and integer labels.
train_rows = lines[:total_train_data]
train_documents = [row[0] for row in train_rows]
train_labels = [int(row[1]) for row in train_rows]

# Held-out rows are kept whole, as [document, label] pairs.
test_documents = lines[total_train_data:]
# test_labels = [int(line[1]) for line in lines[total_train_data:]]

# assert total_data == len(train_documents) + len(test_labels)

In [87]:
# Prepare data model: convert the text documents into numeric features using CountVectorizer.
# With binary=True every non-zero count is set to 1, so each document becomes a
# word presence/absence vector rather than a term-frequency vector.
# The flag is passed as a real boolean: the string 'true' only worked by being
# truthy and is rejected by scikit-learn versions that validate parameter types.
count_vectorizer = CountVectorizer(binary=True)
train_documents = count_vectorizer.fit_transform(train_documents)

## 2. Model training

In [158]:
# 2. Model Training
# Fit a Bernoulli naive Bayes classifier on the vectorized training documents.
# Alternative model kept for reference:
# classifier = MultinomialNB().fit(train_documents, train_labels)
classifier = BernoulliNB().fit(X=train_documents, y=train_labels)

In [120]:
# Load the labelled Portuguese tweets from CSV.
dataset = pd.read_csv('data/portuguese_tweets.csv', encoding='utf-8')

# Pull out the labels and the raw tweet texts as parallel arrays.
train_labels = dataset['Classificacao'].values
tweet_texts = dataset['Text'].values

# Prepare data model: word-level bag-of-words counts for every tweet.
vectorizer = CountVectorizer(analyzer="word")
train_documents = vectorizer.fit_transform(tweet_texts)

# 2. Model Training: multinomial naive Bayes on the count matrix.
classifier = MultinomialNB()
classifier.fit(train_documents, train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## 3. Model evaluation

In [152]:
# Disabled manual hold-out evaluation. When enabled, it classifies each row of
# `test_documents` with `classifier` and `count_vectorizer`, counts the rows whose
# prediction equals the integer label, and prints the resulting accuracy.
# Only the final print('') in this cell actually runs.
# count_correct = 0.0
# total_test_data = len(test_documents)

# for doc in test_documents:
#     classification = classifier.predict(count_vectorizer.transform([doc[0]]))
#     if classification == int(doc[1]):
#         count_correct += 1.0

# print('Total correct: {}'.format(count_correct))        
# accuracy = count_correct / total_test_data
# print('Accuracy: {}'.format(accuracy))
print('')




In [156]:
# Cross validation: obtain a prediction for every sample using 10 folds.
cross_val_results = cross_val_predict(
    classifier,
    train_documents,
    train_labels,
    cv=10,
)

# Average accuracy: share of cross-validated predictions that match the true labels.
accuracy = metrics.accuracy_score(y_true=train_labels, y_pred=cross_val_results)
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.883156482498


In [157]:
# Model validation: per-class precision, recall and F1 of the cross-validated predictions.
# `sentimento` lists the class labels in the order they appear in the report.
# It is passed by keyword because scikit-learn only accepts y_true and y_pred
# positionally in recent versions.
sentimento = ['Positivo', 'Negativo', 'Neutro']
print(metrics.classification_report(train_labels, cross_val_results, labels=sentimento))

             precision    recall  f1-score   support

   Positivo       0.95      0.88      0.91      3300
   Negativo       0.89      0.93      0.91      2446
     Neutro       0.80      0.84      0.82      2453

avg / total       0.89      0.88      0.88      8199



In [125]:
# Confusion matrix: true labels as rows ("Real"), cross-validated predictions as
# columns ("Prediction"), with totals in the "All" margins.
# assert len(train_labels) == len(cross_val_results)
confusion_table = pd.crosstab(
    index=train_labels,
    columns=cross_val_results,
    rownames=['Real'],
    colnames=['Prediction'],
    margins=True,
)
print(confusion_table)

Prediction  Negativo  Neutro  Positivo   All
Real                                        
Negativo        2275     162         9  2446
Neutro           240    2067       146  2453
Positivo          45     356      2899  3300
All             2560    2585      3054  8199


## 4. Demo

In [144]:
def classify_review(review):
    """Predict the sentiment label of a single piece of text.

    Args:
        review: Raw text to classify.

    Returns:
        The array returned by ``classifier.predict`` for the one-element
        batch ``[review]``.
    """
    # The text is vectorized with the fitted module-level `vectorizer` before
    # being handed to the module-level `classifier`.
    # The former if/elif chain after the return was unreachable (and compared
    # the prediction against integer codes), so it has been removed.
    return classifier.predict(vectorizer.transform([review]))

In [148]:
# Demo: a tweet expected to be classified as negative
classify_review('O Neymar está deixando a desejar')

array([u'Negativo'], dtype='<U8')

In [149]:
# Demo: a tweet expected to be classified as positive

classify_review(
    'O Brasil está investindo em educação, construindo mais escolas e preparando melhor os professores'
)

array([u'Positivo'], dtype='<U8')

In [150]:
# Demo: a tweet expected to be classified as neutral
classify_review('Esse governo está no início, vamos ver o que vai dar')

array([u'Neutro'], dtype='<U8')