In [1]:
from utils import *
from sklearn.naive_bayes  import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from nltk.corpus import stopwords
from nltk import FreqDist


import numpy as np
import nltk

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib notebook

In [28]:
# Recuperação dos textos - treinar o modelo somente com as classes POS e NEG
all_data = get_data_from_db()
# all_data = all_data[(all_data["labels"] == "PO") | (all_data["labels"] == "NG")]
ne_texts = all_data[(all_data["labels"] == "NE")]
ng_texts = all_data[(all_data["labels"] == "NG")]
po_texts = all_data[(all_data["labels"] == "PO")]
print('Textos carregados')

# lexicon = get_LIWC_lexicon()
lexicon = load_claudia_freitas_lexicon()

print("Lexico carregado")



Textos carregados
Lexico carregado


# Palavras mais e menos frequentes e numerais

In [10]:
num_remover = NumRemover()
all_data = num_remover.fit_transform(all_data)

all_texts = ''.join(all_data["texts"].tolist())
fd = FreqDist(all_texts.split())
stopwords_pt = stopwords.words("portuguese")

for word in stopwords_pt:
    if word in fd.keys():
        fd.pop(word)
    


# Stemming

In [4]:
stemmer = Stemmer()

all_texts = stemmer.fit_transform(all_data["texts"])
all_data["texts"] = all_texts

ne_texts = all_data[(all_data["labels"] == "NE")]
ng_texts = all_data[(all_data["labels"] == "NG")]
po_texts = all_data[(all_data["labels"] == "PO")]

# Aplicação do LSA

In [29]:
# cv = CountVectorizer(ngram_range = (1,2), stop_words=stopwords.words("portuguese"), vocabulary= lexicon)
# cv = TfidfVectorizer(, ngram_range = (1,2), stop_words=stopwords.words("portuguese"))

# Features
cv = FeatureUnion([
                    ("bigram", CountVectorizer(ngram_range=(2,2), stop_words= stopwords.words("portuguese"), binary= True)),
                    ("lexicon_vector", CountVectorizer(vocabulary= lexicon)),
                    ])

bag_of_words = cv.fit_transform(all_data["texts"])

svd = TruncatedSVD(n_components=5)
svd.fit(bag_of_words)

TruncatedSVD(algorithm='randomized', n_components=5, n_iter=5,
       random_state=None, tol=0.0)

# Analise de textos Neutros

In [30]:
X_neut = svd.transform(cv.transform(ne_texts["texts"]))

# from matplotlib import pyplot as plt
# plt.plot(X_pos[:,0], X_pos[:,1], 'bo')

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_neut[:,0], X_neut[:,1], X_neut[:,2], c = 'g')
ax.set_title("Vizualização da classe neutra")

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7f48f05df128>

In [9]:
bag_of_words.shape

(1042, 24676)

# Analise de Textos Negativos



In [31]:
X_neg = svd.transform(cv.transform(ng_texts["texts"]))

%matplotlib notebook

# from matplotlib import pyplot as plt
# plt.plot(X_neg[:,0], X_neg[:,1], 'ro')

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_neg[:,0], X_neg[:,1], X_neg[:,2], c='r')
ax.set_title("Vizualização da classe negativa")

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7f48f05e4f98>

In [22]:
X_neg.shape

(391, 3)

# Analise dos textos positivos

In [32]:
X_pos = svd.transform(cv.transform(po_texts["texts"]))

%matplotlib notebook

# from matplotlib import pyplot as plt
# plt.plot(X_pos[:,0], X_pos[:,1], 'bo')

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_pos[:,0], X_pos[:,1], X_pos[:,2], c = 'b')
ax.set_title("Vizualização da classe positiva")

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7f48f05cba90>

# Vizualização dos dois grupos (PO e NG)

In [33]:
# from matplotlib import pyplot as plt
# plt.plot(X_neg[:,0], X_neg[:,1], 'ro', X_pos[:,0], X_pos[:,1], 'bo')

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_pos[:,0], X_pos[:,1], X_pos[:,2], c = 'b')
ax.scatter(X_neg[:,0], X_neg[:,1], X_neg[:,2], c = 'r')
ax.set_title("Vizualização de ambas as classes")

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7f48f060c4e0>

# Vizualização dos dois grupos (PO e NE)

In [26]:
%matplotlib notebook
# from matplotlib import pyplot as plt
# plt.plot(X_neg[:,0], X_neg[:,1], 'ro', X_pos[:,0], X_pos[:,1], 'bo')

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_pos[:,0], X_pos[:,1], X_pos[:,2], c = 'b', label = "PO")
ax.scatter(X_neut[:,0], X_neut[:,1], X_neut[:,2], c = 'g', label = "NE")

ax.set_title("Vizualização das classes PO e NG")
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7f48f043ee80>

# Vizualização dos dois grupos (NG e NE)


In [62]:
%matplotlib notebook
# from matplotlib import pyplot as plt
# plt.plot(X_neg[:,0], X_neg[:,1], 'ro', X_pos[:,0], X_pos[:,1], 'bo')

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_neg[:,0], X_neg[:,1], X_neg[:,2], c = 'k', label = "NG")
ax.scatter(X_neut[:,0], X_neut[:,1], X_neut[:,2], c = 'y', label = "NE")

ax.set_title("Vizualização das classes NG e NE")
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7fed75e8ab00>

# Visualização dos 3 grupos

In [34]:
%matplotlib notebook
# from matplotlib import pyplot as plt
# plt.plot(X_neg[:,0], X_neg[:,1], 'ro', X_pos[:,0], X_pos[:,1], 'bo')

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_pos[:,0], X_pos[:,1], X_pos[:,2], c = 'b', label = "PO")
ax.scatter(X_neg[:,0], X_neg[:,1], X_neg[:,2], c = 'r', label = "NG")
ax.scatter(X_neut[:,0], X_neut[:,1], X_neut[:,2], c = 'g', label = "NE")

ax.set_title("Vizualização das 3 classes")
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7f48f03b8978>

# Classificar utilizando SVM (Utilizando somente PO e NG)

In [35]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from pandas import DataFrame
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

svm = SVC(C=500)
skfold = StratifiedKFold(n_splits=10, shuffle=True)

X_pos_data = np.append(X_pos, np.ones((X_pos.shape[0],1)), axis = 1)
X_neg_data = np.append(X_neg, np.zeros((X_neg.shape[0],1)) , axis = 1)

data = np.append(X_pos_data, X_neg_data, axis=0)

accuracy_mean = 0
for index, (train, test) in enumerate(skfold.split(data[:,0:5], data[:,5])):
    x_train = data[train,0:5]
    y_train =  data[train,5]
    
    x_test = data[test,0:5]
    y_test =  data[test,5]
    
    svm.fit(x_train, y_train)
    predictions = svm.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_mean += accuracy/10
    
    print("Fold " + str(index) + " : " + str(accuracy) )
    
print("\nAccuracia media: " + str(accuracy_mean))

Fold 0 : 0.605633802817
Fold 1 : 0.6
Fold 2 : 0.614285714286
Fold 3 : 0.571428571429
Fold 4 : 0.7
Fold 5 : 0.614285714286
Fold 6 : 0.528571428571
Fold 7 : 0.585714285714
Fold 8 : 0.571428571429
Fold 9 : 0.557142857143

Accuracia media: 0.594849094567


# Classificar utilizando SVM (Utilizando todas as classes)

In [28]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from pandas import DataFrame
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

svm = SVC(C=316)
skfold = StratifiedKFold(n_splits=10, shuffle=True)

X_pos_data = np.append(X_pos, np.ones((X_pos.shape[0],1)), axis = 1)
X_neg_data = np.append(X_neg, -np.ones((X_neg.shape[0],1)), axis = 1)
X_neu_data = np.append(X_neut, np.zeros((X_neut.shape[0],1)), axis = 1)

data = np.append(X_pos_data, X_neg_data, axis=0)
data = np.append(data, X_neu_data, axis=0)

accuracy_mean = 0
for index, (train, test) in enumerate(skfold.split(data[:,0:5], data[:,5])):
    x_train = data[train,0:5]
    y_train =  data[train,5]
    
    x_test = data[test,0:5]
    y_test =  data[test,5]
    
    svm.fit(x_train, y_train)
    predictions = svm.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_mean += accuracy/10
    
    print("Fold " + str(index) + " : " + str(accuracy) )
    
print("\nAccuracia media: " + str(accuracy_mean))

Fold 0 : 0.5
Fold 1 : 0.5
Fold 2 : 0.519230769231
Fold 3 : 0.471153846154
Fold 4 : 0.509615384615
Fold 5 : 0.548076923077
Fold 6 : 0.596153846154
Fold 7 : 0.461538461538
Fold 8 : 0.509615384615
Fold 9 : 0.490384615385

Accuracia media: 0.510576923077
