In [1]:
from utils import *
import re
from pprint import pprint
from nltk.corpus import stopwords

all_data = get_data_from_db()
all_data = all_data[(all_data["labels"]== "PO") | (all_data["labels"]== "NG")]

num_remover = NumRemover()
all_data = num_remover.fit_transform(all_data)

neg_data = all_data[all_data["labels"] == "NG"]
pos_data = all_data[all_data["labels"] == "PO"]

ratio = 0.7
neg_train = neg_data.iloc[0:round(ratio*neg_data.shape[0]), :]
pos_train = pos_data.iloc[0:round(ratio*pos_data.shape[0]), :]

neg_test = neg_data.iloc[round(ratio*neg_data.shape[0]): , :]
pos_test = pos_data.iloc[round(ratio*neg_data.shape[0]): , :]

stopwords_pt = stopwords.words("portuguese")

# Vocabulários

In [4]:
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words= stopwords_pt)

cv.fit(neg_train["texts"])
vocab_neg = set(cv.vocabulary_.keys())

cv.fit(pos_train["texts"])
vocab_pos = set(cv.vocabulary_.keys())

print("Dimensão vocabulario negativo: " + str(len(vocab_neg)))
print("Dimensão vocabulario positivo: " + str(len(vocab_pos)))

Dimensão vocabulario negativo: 2630
Dimensão vocabulario positivo: 2272


In [5]:
# Verificar interseção dos vocabulários
intersect = vocab_neg.intersection(vocab_pos)
vocab_neg = vocab_neg.difference(intersect)
vocab_pos = vocab_pos.difference(intersect)

lexicon = list(vocab_neg.union(vocab_pos))
print("Dimensão: ", len(lexicon))

Dimensão:  2734


In [6]:
from sklearn.naive_bayes  import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

features = FeatureUnion([
                    ("lexicon_vector", CountVectorizer(strip_accents= "unicode", vocabulary= lexicon, binary = False))
                    ])

print("USING FEATURE FREQUENCY")
evaluate(all_data, features, 10)

features = FeatureUnion([
                    ("lexicon_vector", CountVectorizer(vocabulary= lexicon, binary = True))
                    ])
print("USING FEATURE PRESENCE")
evaluate(all_data, features, 10)

USING FEATURE FREQUENCY
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.811690140845
Desvio padrão:  0.0343164778714

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.803138832998
Desvio padrão:  0.0253772032591

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.767464788732
Desvio padrão:  0.0362195449787
USING FEATURE PRESENCE
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.813118712274
Desvio padrão:  0.01860809145

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.827384305835
Desvio padrão:  0.0281973772125

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.790301810865
Desvio padrão:  0.0478120064414


In [7]:
# Avaliando a incidência dos bigramas
cv = CountVectorizer(strip_accents= "unicode", ngram_range=(1,2), stop_words= stopwords_pt)

neg_counts = cv.fit_transform(neg_train["texts"])
neg_bigrams = cv.vocabulary_

pos_counts = cv.fit_transform(pos_train["texts"])
pos_bigrams = cv.vocabulary_

sp = np.sum(pos_counts.toarray(), axis=0)
sn = np.sum(neg_counts.toarray(), axis=0)

pos_bigrams  = {bigram: sp[index] for bigram, index in pos_bigrams.items()}
neg_bigrams  = {bigram: sn[index] for bigram, index in neg_bigrams.items()}

pos_excl_bigrams = {key:value for key, value in pos_bigrams.items() if key not in neg_bigrams}
neg_excl_bigrams = {key:value for key, value in neg_bigrams.items() if key not in pos_bigrams}

lexicon_bigrams = list(pos_excl_bigrams.keys()) + list(neg_excl_bigrams.keys())
len(lexicon_bigrams)

13034

In [8]:
from pandas import Series
s = Series(neg_excl_bigrams)
s.nlargest(15)

minas energia           15
edison                  12
edison lobao            12
sabia                   12
alves                   11
sabesp                  11
alves pmdb              10
camara                  10
eduardo alves           10
henrique eduardo        10
morto                   10
nao sabia               10
presidente republica    10
vaccari                 10
calheiros                9
dtype: int64

In [9]:
from sklearn.naive_bayes  import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

features = FeatureUnion([
                    ("lexicon_vector", CountVectorizer(strip_accents= "unicode", ngram_range=(1,2), stop_words=stopwords_pt, vocabulary= lexicon_bigrams))
                    ])

print("USING FEATURE FREQUENCY")
evaluate(all_data, features, 10)

print("\nUSING FEATURE PRESENCE")
features = FeatureUnion([
                    ("lexicon_vector", CountVectorizer(ngram_range=(1,2), stop_words=stopwords_pt, vocabulary= lexicon_bigrams, binary= True))
                    ])
evaluate(all_data, features, 10)

USING FEATURE FREQUENCY
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.823138832998
Desvio padrão:  0.0141450266312

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.85444668008
Desvio padrão:  0.034436336099

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.757505030181
Desvio padrão:  0.038212831324

USING FEATURE PRESENCE
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.830221327968
Desvio padrão:  0.0341666014555

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.844486921529
Desvio padrão:  0.0252220241979

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.737444668008
Desvio padrão:  0.0312582365699


In [10]:
from sklearn.naive_bayes  import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

features = FeatureUnion([
                    ("lexicon_vector", TfidfVectorizer(strip_accents= "unicode", ngram_range=(1,2), stop_words=stopwords_pt, vocabulary= lexicon_bigrams))
                    ])

print("USING FEATURE FREQUENCY")
evaluate(all_data, features, 10)

print("\nUSING FEATURE PRESENCE")
features = FeatureUnion([
                    ("lexicon_vector", TfidfVectorizer(strip_accents= "unicode", ngram_range=(1,1), stop_words=stopwords_pt, vocabulary= lexicon_bigrams, binary= True))
                    ])
evaluate(all_data, features, 10)

USING FEATURE FREQUENCY
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.797384305835
Desvio padrão:  0.0410000923733

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.838752515091
Desvio padrão:  0.0462242890494

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.557766599598
Desvio padrão:  0.00187122736419

USING FEATURE PRESENCE
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.776016096579
Desvio padrão:  0.0443630534937

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.814527162978
Desvio padrão:  0.0320598093345

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.557766599598
Desvio padrão:  0.00187122736419


In [11]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest,chi2, mutual_info_classif
from sklearn.naive_bayes  import MultinomialNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

features = Pipeline([
                    ("lexicon_vector", CountVectorizer(strip_accents= "unicode", ngram_range=(1,2), stop_words=stopwords_pt, vocabulary= lexicon_bigrams)),
                    ("feature_selection", TruncatedSVD(n_components = 6000))
                    ])

run_cross_validation(all_data, features, LogisticRegressionCV(fit_intercept=False, penalty= 'l2', dual= False), n_folds = 10, shuffle= True)
print("\nUSING FEATURE PRESENCE")
features = Pipeline([
                    ("lexicon_vector", CountVectorizer(ngram_range=(1,2), stop_words=stopwords_pt, vocabulary= lexicon_bigrams, binary= True)),
                    ("feature_selection", VarianceThreshold(threshold = 0.01))
                    ])
evaluate(all_data, features, 10)

Cross Validation:
Accuracia media:  0.85444668008
Desvio padrão:  0.034436336099

USING FEATURE PRESENCE
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.616257545272
Desvio padrão:  0.0462808851999

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.639094567404
Desvio padrão:  0.0324848518176

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.639114688129
Desvio padrão:  0.0267021343644


In [3]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest,chi2, mutual_info_classif
from sklearn.naive_bayes  import MultinomialNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

features = Pipeline([
                    ("lexicon_vector", CountVectorizer(ngram_range=(1,2), stop_words=stopwords_pt, vocabulary= lexicon_bigrams)),
                    ("feature_selection", TruncatedSVD(n_components = 6000))
                    ])

run_cross_validation2(all_data, features, LogisticRegressionCV(fit_intercept=False, penalty= 'l2', dual= False), n_folds = 10, shuffle= True)

Cross Validation:


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [4]:
all_data

Unnamed: 0,labels,texts
5,NG,o presidente do pt rui falcão acusou na noit...
6,NG,aécio faz coro a denúncias sem provas veicula...
7,NG,segundo falcão é cômico ouvir alguém do ps...
8,NG,ele citou o mensalão mineiro o esquema de cor...
9,NG,para o presidente do pt aécio deveria se preo...
10,NG,a gravidade das acusações do candidato tucano...
12,NG,o brasil acordou hoje perplexo com as mais gr...
13,NG,na campanha de dilma rousseff a primeira reaç...
14,NG,a preocupação é justificável quando a polícia...
16,NG,os dois nomes mais citados são o tesoureiro pe...


In [16]:
type(M)

numpy.ndarray