# Twitter sentiment analysis

In [111]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB


from sklearn import metrics
from sklearn.model_selection import cross_val_predict

## 1. Prepare dataset
Prepare data for training

In [113]:
# Load the labelled tweets from the CSV file, decoding it as UTF-8
dataset = pd.read_csv(
    'data/portuguese_tweets.csv',
    encoding='utf-8',
)

In [112]:
# Number of non-null values in each column
dataset.count(axis=0)

Unnamed: 0                   8199
Created At                   8199
Text                         8199
Geo Coordinates.latitude      104
Geo Coordinates.longitude     104
User Location                5489
Username                     8199
User Screen Name             8199
Retweet Count                8199
Classificacao                8199
Observação                      1
Unnamed: 10                     0
Unnamed: 11                     0
Unnamed: 12                     0
Unnamed: 13                     0
Unnamed: 14                     0
Unnamed: 15                     0
Unnamed: 16                     0
Unnamed: 17                     0
Unnamed: 18                     0
Unnamed: 19                     0
Unnamed: 20                     0
Unnamed: 21                     0
Unnamed: 22                     0
Unnamed: 23                     0
Unnamed: 24                     0
dtype: int64

In [114]:
# Split the frame into the tweet texts and their sentiment labels (NumPy arrays)
train_documents = dataset.loc[:, 'Text'].values
train_labels = dataset.loc[:, 'Classificacao'].values

In [115]:
# Build the bag-of-words model: one binary (present/absent) feature per word.
vectorizer = CountVectorizer(analyzer="word", binary=True)
# Read the raw text straight from `dataset` so that re-running this cell always
# vectorizes the tweets themselves, never a previously vectorized matrix.
# The result keeps the name `train_documents` because later cells read it.
train_documents = vectorizer.fit_transform(dataset['Text'].values)

## 2. Model training

In [126]:
# Train a Bernoulli Naive Bayes classifier on the vectorized tweets.
# Alternative model that was tried: MultinomialNB().fit(train_documents, train_labels)
classifier = BernoulliNB()
classifier.fit(train_documents, train_labels)

## 3. Model evaluation

In [127]:
# 10-fold cross-validation: collect one out-of-fold prediction per tweet
cross_val_results = cross_val_predict(
    classifier, train_documents, train_labels, cv=10
)
# Accuracy over the pooled out-of-fold predictions:
#   accuracy = (TP + TN) / (TP + FP + TN + FN)
accuracy = metrics.accuracy_score(y_true=train_labels, y_pred=cross_val_results)
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.884620075619


In [128]:
# Confusion matrix: rows are the real labels, columns the cross-validated
# predictions; margins=True appends the "All" totals row and column.
# Optional sanity check: assert len(train_labels) == len(cross_val_results)
print(
    pd.crosstab(
        index=train_labels,
        columns=cross_val_results,
        rownames=['Real'],
        colnames=['Prediction'],
        margins=True,
    )
)

Prediction  Negativo  Neutro  Positivo   All
Real                                        
Negativo        2208     224        14  2446
Neutro           133    2145       175  2453
Positivo          38     362      2900  3300
All             2379    2731      3089  8199


In [129]:
# Model validation: per-class precision, recall and F1 on the cross-validated
# predictions, reported in the order given by `sentimento`.
sentimento = ['Positivo', 'Negativo', 'Neutro']
# `labels` is passed by keyword: recent scikit-learn releases accept only
# y_true and y_pred positionally, and the keyword form works on old releases too.
print(metrics.classification_report(train_labels, cross_val_results, labels=sentimento))
# precision = true positive / (true positive + false positive)
# recall    = true positive / (true positive + false negative)
# f1-score  = 2 * (precision * recall) / (precision + recall)

             precision    recall  f1-score   support

   Positivo       0.94      0.88      0.91      3300
   Negativo       0.93      0.90      0.92      2446
     Neutro       0.79      0.87      0.83      2453

avg / total       0.89      0.88      0.89      8199



## 4. Demo

In [122]:
def classify_review(review):
    """Predict the sentiment label for a single piece of text.

    The text is wrapped in a one-element list, transformed with the
    notebook-level ``vectorizer`` and passed to the notebook-level
    ``classifier``.

    Args:
        review: The text to classify.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    features = vectorizer.transform([review])
    predictions = classifier.predict(features)
    return predictions[0]

In [123]:
# Demo: a statement expected to be classified as neutral
classify_review(
    review='Esse governo está no início, vamos ver o que vai dar'
)

u'Neutro'

In [124]:
# Demo: a statement expected to be classified as positive
classify_review(
    review='O Brasil está investindo em educação, construindo mais escolas e preparando melhor os professores'
)

u'Positivo'

In [125]:
# Demo: a statement expected to be classified as negative
classify_review(
    review='O Neymar está deixando a desejar'
)

u'Negativo'

In [133]:
# Demo: a second statement intended as negative.
# NOTE(review): the recorded output for this cell is 'Positivo', i.e. the model
# does not return the intended label here — presumably the text is sarcastic; confirm.
classify_review(
    review='O Brasil voltou, 20 anos em 2'
)

u'Positivo'