In [1]:
from database_utils import DatabaseConnector, build_dataframe

db_connector = DatabaseConnector('localhost', 'root', '12345', 'CORPUS_VIES')
retrieved_data = build_dataframe(db_connector.getDataTextAndLabel())

SELECT PARAGRAPH, POLARITY FROM PARAGRAPHS WHERE POLARITY IS NOT NULL AND trim(POLARITY) <> ""
1042  Paragraphs encountered


# Balanceamento dos dados

In [2]:
%matplotlib notebook

from matplotlib import pyplot as plt
import numpy as np

def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        plt.text(rect.get_x()+rect.get_width()/2., 1.0*height, '%d'%int(height),
                ha='center', va='bottom')

positive_data = retrieved_data.loc[retrieved_data['labels'] == 'PO']
negative_data = retrieved_data.loc[retrieved_data['labels'] == 'NG']
neutral_data = retrieved_data.loc[retrieved_data['labels'] == 'NE']

values = [positive_data.shape[0], negative_data.shape[0], neutral_data.shape[0]]

xlabels = ["Positivos", "Negativos", "Neutros"]
indexes = np.arange(len(xlabels))
barWidth = 0.35

f1 = plt.figure()
ax1 = f1.add_subplot(111)

p = ax1.bar(indexes, values, barWidth, tick_label = values)
plt.ylabel('Quantidade')
plt.xlabel('Classes')
plt.title('Balanceamento do Dataset')
plt.xticks(indexes + barWidth/2., xlabels)

autolabel(p)

plt.show()

<IPython.core.display.Javascript object>

In [2]:
from database_utils import DatabaseConnector, build_dataframe
from machine_learning_utils import MLWrapper
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

input_data = retrieved_data[(retrieved_data['labels'] == "PO") | (retrieved_data['labels'] == "NG")]
input_data['labels'].replace(to_replace = "NG", value = 0, inplace = True)
input_data['labels'].replace(to_replace = "PO", value = 1, inplace = True)

print("Treinando o modelo")
print("Usando somente os unigramas")
pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('classifier', MultinomialNB())
    ])

ml = MLWrapper(pipeline)
pipeline = ml.train(input_data, 2)

Treinando o modelo
Usando somente os unigramas
Total news classified: 701
Score: 0.587035827195
Accuracy: 0.68480449946
Confusion matrix:
[[316  75]
 [146 164]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Tentando outro classificador

In [12]:
from sklearn import svm
from machine_learning_utils import MLWrapper
from sklearn.svm import SVC

print("Treinando o modelo")
input_data = retrieved_data[(retrieved_data['labels'] == "PO") | (retrieved_data['labels'] == "NG")]
input_data['labels'].replace(to_replace = "NG", value = 0, inplace = True)
input_data['labels'].replace(to_replace = "PO", value = 1, inplace = True)

pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('classifier', SVC(C= 316))
    ])

ml = MLWrapper(pipeline)
ml.train(input_data, 2)

Treinando o modelo


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Total news classified: 701
Score: 0.585455197062
Accuracy: 0.650518223794
Confusion matrix:
[[278 113]
 [132 178]]


# Stemming

Utilizar o stemming para o portugues

In [None]:
import nltk

# Exemplo
stemmer = nltk.stem.RSLPStemmer()
stemmer.stem("COPIAR")

Rodando os mesmo algoritmos com o Stemmer

In [None]:
from database_utils import DatabaseConnector, build_dataframe
from machine_learning_utils import MLWrapper
from sklearn.naive_bayes import MultinomialNB

print("Treinando o modelo")
ml_wrapper = MLWrapper(MultinomialNB())
ml_wrapper.train(retrieved_data)

In [None]:
from pickle import load
f = open('bigram_tagger.pkl', 'rb')
tagger = load(f)
f.close()

tagger.tag(['Aécio'])

# Análise de Frequencia dos Adjetivos


In [None]:
import pymysql
from nltk import FreqDist

def get_adjective_by_sentiment(sentiment):
    db = pymysql.connect('localhost', 'root', '12345', 'CORPUS_VIES')
    cursor = db.cursor()

    sql_statement = 'SELECT PARAGRAPH FROM PARAGRAPHS WHERE POLARITY = "%s"' %sentiment
    print(sql_statement)

    cursor.execute(sql_statement)
    print(cursor.rowcount, ' Paragraphs encountered')
    lista = cursor.fetchall()
    db.close()

    result_list = []
    for sentence in lista:
        result = tagger.tag(sentence[0].split())
        result_list += result

    fd = FreqDist([word for (word,tag) in result_list if tag[:3] == 'ADJ'])
    adj_set = set(fd.keys())
    print(len(adj_set), ' Adjectives encountered\n')
    
    return adj_set

pos_adj = get_adjective_by_sentiment("PO")
neg_adj = get_adjective_by_sentiment("NG")
neu_adj = get_adjective_by_sentiment("NE")

In [None]:
pos_excl = pos_adj.difference(neg_adj.union(neu_adj))
neg_excl = neg_adj.difference(pos_adj.union(neu_adj))
neu_excl = neu_adj.difference(neg_adj.union(pos_adj))

print('Quantidade de adjetivos exclusivos')
print('Positivos: ', len(pos_excl))
print('Negativos: ', len(neg_excl))
print('Neutros: ', len(neu_excl))

In [None]:
from pickle import load
from pandas import DataFrame

# retrieved_data[retrieved_data['labels'] == 'NE']

# for l in retrieved_data.values:
#     print(l)

def adj_incidence(textdata_df):
    f = open('bigram_tagger.pkl', 'rb')
    tagger = load(f)
    f.close()
    
    adj_class_counter = {'pos':0, 'neg':0, 'neu':0}
    for value in textdata_df.values:
        tokens = value[1].split()
        tags = tagger.tag(tokens)
        adjs = [token for token, tag in tags if tag == "ADJ" ]
        for adj in adjs:
            if adj in neg_excl: adj_class_counter['neg']+=1
            if adj in pos_excl: adj_class_counter['pos']+=1
            if adj in neu_excl: adj_class_counter['neu']+=1
    return adj_class_counter
        
pos_data = retrieved_data[retrieved_data['labels'] == 'PO']
neg_data = retrieved_data[retrieved_data['labels'] == 'NG']
neu_data = retrieved_data[retrieved_data['labels'] == 'NE']

count_pos = adj_incidence(pos_data)
count_neg = adj_incidence(neg_data)
count_neu = adj_incidence(neu_data)

In [None]:
count_pos, count_neg, count_neu

In [None]:
print(adj_incidence(pos_data.iloc[1]))
print(pos_data.iloc[1].values)

In [None]:
print(pos_excl)