# Proportional Difference

Teste do método de seleção de features Proportional Difference (PD), proposto em http://crpit.com/confpapers/CRPITV87Simeon.pdf

Este método foi testado contra outros, inclusive o SentiWordNet, em:
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.471.5694&rep=rep1&type=pdf#page=77

In [28]:
# Aqui estou usando todo o corpus para selecionar as melhores features.

from utils import  *
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import defaultdict
from pandas import DataFrame

# Proportional Difference (PD) feature selection computed over the WHOLE corpus,
# followed by an evaluation of the classifiers restricted to the selected terms.

# 1) Load the corpus and split it by sentiment class
numRemover = NumRemover()
all_data = numRemover.fit_transform(get_data_from_db())
pos_data = numRemover.fit_transform(get_data_from_db(sentiment="PO"))["texts"]
neg_data = numRemover.fit_transform(get_data_from_db(sentiment="NG"))["texts"]
neu_data = numRemover.fit_transform(get_data_from_db(sentiment="NE"))["texts"]

# 2) Build a vectorizer over unigrams AND bigrams (ngram_range=(1, 2))
# NOTE(review): strip_accents="unicode" is applied to the documents; if the NLTK
# stop-word list contains accented words they would presumably no longer match
# the accent-stripped tokens -- confirm against scikit-learn's stop-word notes.
pt_stop_words = stopwords.words("portuguese")
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=pt_stop_words,
    strip_accents="unicode",
)

# 3) Fit on the whole corpus
vectorizer.fit(all_data["texts"])

# 4) Collect the corpus vocabulary (term list and column-index -> term map)
vocab = list(vectorizer.vocabulary_.keys())
vocab_indexed = {value: key for key, value in vectorizer.vocabulary_.items()}

# 5) Document-term matrix of each class
pos_matrix = vectorizer.transform(pos_data)
neg_matrix = vectorizer.transform(neg_data)
neu_matrix = vectorizer.transform(neu_data)

# 6) Per-term frequency totals inside each class (one row, one column per term)
pos_sum = pos_matrix.sum(axis=0)
neg_sum = neg_matrix.sum(axis=0)
neu_sum = neu_matrix.sum(axis=0)


# 7) Build the per-term frequency table and compute the PD of every term
def pd(c1, c2, c3):
    """Return the three-class proportional difference of a term.

    The score is the largest of (c_i - sum of the other two) / (c1 + c2 + c3),
    which equals (2 * max(c1, c2, c3) - total) / total. It is 1 when the term
    occurs in exactly one class.

    A term with no occurrence in any of the three classes has no defined
    score; NaN is returned so that no threshold comparison ever selects it
    (and no division-by-zero warning is raised).
    """
    total = c1 + c2 + c3
    if total == 0:
        return float("nan")
    return (2 * max(c1, c2, c3) - total) / total


def _freq_row(index):
    """Return the PO/NG/NE counts and the PD score for vocabulary column `index`."""
    po, ng, ne = pos_sum[0, index], neg_sum[0, index], neu_sum[0, index]
    return {"PO": po, "NG": ng, "NE": ne, "PD": pd(po, ng, ne)}


# Rows follow the iteration order of vocab_indexed, so they line up with the index below.
freq_dict = [_freq_row(index) for index in vocab_indexed]
freq_df = DataFrame(data=freq_dict, index=vocab_indexed.values())
freq_df = freq_df[["PO", "NG", "NE", "PD"]]  # fix the column order

# 8) Keep the terms whose PD reaches the threshold
# `>=` (rather than `==`) keeps the filter meaningful if the threshold is lowered;
# with threshold = 1 both select exactly the terms exclusive to one class.
threshold = 1
selected = freq_df[freq_df["PD"] >= threshold].index.tolist()
print("Dimensionalidade: ", len(selected))

# 9) Evaluate the classifiers using only the selected features
# (the name says "unigram", but the vocabulary holds unigrams and bigrams)
unigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=pt_stop_words,
    strip_accents="unicode",
    vocabulary=selected,
)
evaluate(all_data, unigram_vectorizer, 10)

Dimensionalidade:  23444
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.742761248186
Desvio padrão:  0.0402812737308

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.897296806967
Desvio padrão:  0.0220098213888

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.494375907112
Desvio padrão:  0.0349591032269


In [57]:
# Aqui usamos somente o conjunto de treinamento para selecionar as melhores features com PD

from utils import  *
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import defaultdict
from pandas import DataFrame

from sklearn.model_selection import train_test_split
import random 

# Proportional Difference (PD) feature selection computed on a stratified
# training split only, followed by an evaluation of the classifiers.

# 1) Load the corpus, split it into train/test, then split train by class
numRemover = NumRemover()
all_data = numRemover.fit_transform(get_data_from_db())
# The seed is itself drawn at random, so the split differs between runs.
seed = int(random.uniform(0, 100))
train_ratio = 0.7
train, test, ytrain, ytest = train_test_split(
    all_data,
    all_data["labels"],
    train_size=train_ratio,
    stratify=all_data["labels"],
    random_state=seed,
)

pos_data = train[train["labels"] == "PO"]["texts"]
neg_data = train[train["labels"] == "NG"]["texts"]
neu_data = train[train["labels"] == "NE"]["texts"]

# 2) Build a vectorizer over unigrams AND bigrams (ngram_range=(1, 2))
# NOTE(review): strip_accents="unicode" is applied to the documents; if the NLTK
# stop-word list contains accented words they would presumably no longer match
# the accent-stripped tokens -- confirm against scikit-learn's stop-word notes.
pt_stop_words = stopwords.words("portuguese")
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=pt_stop_words,
    strip_accents="unicode",
)

# 3) Fit on the training split only (not on the whole corpus)
vectorizer.fit(train["texts"])

# 4) Collect the training vocabulary (term list and column-index -> term map)
vocab = list(vectorizer.vocabulary_.keys())
vocab_indexed = {value: key for key, value in vectorizer.vocabulary_.items()}

# 5) Document-term matrix of each class
pos_matrix = vectorizer.transform(pos_data)
neg_matrix = vectorizer.transform(neg_data)
neu_matrix = vectorizer.transform(neu_data)

# 6) Per-term frequency totals inside each class (one row, one column per term)
pos_sum = pos_matrix.sum(axis=0)
neg_sum = neg_matrix.sum(axis=0)
neu_sum = neu_matrix.sum(axis=0)


# 7) Build the per-term frequency table and compute the PD of every term
def pd(c1, c2, c3):
    """Return the three-class proportional difference of a term.

    The score is the largest of (c_i - sum of the other two) / (c1 + c2 + c3),
    which equals (2 * max(c1, c2, c3) - total) / total. It is 1 when the term
    occurs in exactly one class.

    A term with no occurrence in any of the three classes has no defined
    score; NaN is returned so that no threshold comparison ever selects it
    (and no division-by-zero warning is raised).
    """
    total = c1 + c2 + c3
    if total == 0:
        return float("nan")
    return (2 * max(c1, c2, c3) - total) / total


def _freq_row(index):
    """Return the PO/NG/NE counts and the PD score for vocabulary column `index`."""
    po, ng, ne = pos_sum[0, index], neg_sum[0, index], neu_sum[0, index]
    return {"PO": po, "NG": ng, "NE": ne, "PD": pd(po, ng, ne)}


# Rows follow the iteration order of vocab_indexed, so they line up with the index below.
freq_dict = [_freq_row(index) for index in vocab_indexed]
freq_df = DataFrame(data=freq_dict, index=vocab_indexed.values())
freq_df = freq_df[["PO", "NG", "NE", "PD"]]  # fix the column order

# 8) Keep the terms whose PD reaches the threshold
threshold = 1
selected = freq_df[freq_df["PD"] >= threshold].index.tolist()
print("Dimensionalidade: ", len(selected))

# 9) Evaluate the classifiers using only the selected features
# (the name says "unigram", but the vocabulary holds unigrams and bigrams)
# NOTE(review): evaluate is given all_data; the `test`, `ytrain` and `ytest`
# results of the split are not used in this cell -- confirm this is the
# intended evaluation protocol.
unigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=pt_stop_words,
    strip_accents="unicode",
    vocabulary=selected,
)
evaluate(all_data, unigram_vectorizer, 10)

Dimensionalidade:  17728
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.635359216255
Desvio padrão:  0.0341172871526

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.732220609579
Desvio padrão:  0.0476956566556

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.593196661829
Desvio padrão:  0.0314476741683


In [64]:
# Aqui usamos somente o conjunto de treinamento para selecionar as melhores features com PD

from utils import  *
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import defaultdict
from pandas import DataFrame

from sklearn.model_selection import train_test_split
import random 

# Proportional Difference (PD) feature selection computed on a stratified
# training split only (90% of the corpus), followed by an evaluation of the
# classifiers.

# 1) Load the corpus, split it into train/test, then split train by class
numRemover = NumRemover()
all_data = numRemover.fit_transform(get_data_from_db())
# The seed is itself drawn at random, so the split differs between runs.
seed = int(random.uniform(0, 100))
train_ratio = 0.9
train, test, ytrain, ytest = train_test_split(
    all_data,
    all_data["labels"],
    train_size=train_ratio,
    stratify=all_data["labels"],
    random_state=seed,
)

pos_data = train[train["labels"] == "PO"]["texts"]
neg_data = train[train["labels"] == "NG"]["texts"]
neu_data = train[train["labels"] == "NE"]["texts"]

# 2) Build a vectorizer over unigrams AND bigrams (ngram_range=(1, 2))
# NOTE(review): strip_accents="unicode" is applied to the documents; if the NLTK
# stop-word list contains accented words they would presumably no longer match
# the accent-stripped tokens -- confirm against scikit-learn's stop-word notes.
pt_stop_words = stopwords.words("portuguese")
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=pt_stop_words,
    strip_accents="unicode",
)

# 3) Fit on the training split only (not on the whole corpus)
vectorizer.fit(train["texts"])

# 4) Collect the training vocabulary (term list and column-index -> term map)
vocab = list(vectorizer.vocabulary_.keys())
vocab_indexed = {value: key for key, value in vectorizer.vocabulary_.items()}

# 5) Document-term matrix of each class
pos_matrix = vectorizer.transform(pos_data)
neg_matrix = vectorizer.transform(neg_data)
neu_matrix = vectorizer.transform(neu_data)

# 6) Per-term frequency totals inside each class (one row, one column per term)
pos_sum = pos_matrix.sum(axis=0)
neg_sum = neg_matrix.sum(axis=0)
neu_sum = neu_matrix.sum(axis=0)


# 7) Build the per-term frequency table and compute the PD of every term
def pd(c1, c2, c3):
    """Return the three-class proportional difference of a term.

    The score is the largest of (c_i - sum of the other two) / (c1 + c2 + c3),
    which equals (2 * max(c1, c2, c3) - total) / total. It is 1 when the term
    occurs in exactly one class.

    A term with no occurrence in any of the three classes has no defined
    score; NaN is returned so that no threshold comparison ever selects it
    (and no division-by-zero warning is raised).
    """
    total = c1 + c2 + c3
    if total == 0:
        return float("nan")
    return (2 * max(c1, c2, c3) - total) / total


def _freq_row(index):
    """Return the PO/NG/NE counts and the PD score for vocabulary column `index`."""
    po, ng, ne = pos_sum[0, index], neg_sum[0, index], neu_sum[0, index]
    return {"PO": po, "NG": ng, "NE": ne, "PD": pd(po, ng, ne)}


# Rows follow the iteration order of vocab_indexed, so they line up with the index below.
freq_dict = [_freq_row(index) for index in vocab_indexed]
freq_df = DataFrame(data=freq_dict, index=vocab_indexed.values())
freq_df = freq_df[["PO", "NG", "NE", "PD"]]  # fix the column order

# 8) Keep the terms whose PD reaches the threshold
threshold = 1
selected = freq_df[freq_df["PD"] >= threshold].index.tolist()
print("Dimensionalidade: ", len(selected))

# 9) Evaluate the classifiers using only the selected features
# (the name says "unigram", but the vocabulary holds unigrams and bigrams)
# NOTE(review): evaluate is given all_data; the `test`, `ytrain` and `ytest`
# results of the split are not used in this cell -- confirm this is the
# intended evaluation protocol.
unigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=pt_stop_words,
    strip_accents="unicode",
    vocabulary=selected,
)
evaluate(all_data, unigram_vectorizer, 10)

Dimensionalidade:  21455
Naive Bayes---------------------------------
Cross Validation:
Accuracia media:  0.721661828737
Desvio padrão:  0.0390942898891

MaxEnt--------------------------------------
Cross Validation:
Accuracia media:  0.835885341074
Desvio padrão:  0.0252679329833

SVM-----------------------------------------
Cross Validation:
Accuracia media:  0.522224238026
Desvio padrão:  0.0434539102682
