# Sentiment Analysis: Sentiment Lexicon

La idea es incorporar información externa acerca de la presencia de palabras positivas y negativas.

Opciones:
1. sustituir las palabras por marcadores especiales POS y NEG.
2. agregar nuevos features numéricos que indiquen la presencia/cantidad de POS y NEG.

Vamos por la 2.


Usamos:
- https://mpqa.cs.pitt.edu/lexicons/subj_lexicon/


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from util import load_datasets
train, dev, test = load_datasets()
X_train, y_train = train
X_dev, y_dev = dev
X_test, y_test = test

## Estado del Arte Actual

In [114]:
from model import build_pipeline
from util import print_eval

pipeline = build_pipeline()
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.89

             precision    recall  f1-score   support

        neg       0.87      0.93      0.90       162
        pos       0.91      0.84      0.87       138

avg / total       0.89      0.89      0.89       300

[[150  12]
 [ 22 116]]


## Carga del Lexicón

In [12]:
filename = 'subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff'
f = open(filename)
lines = f.readlines()
f.close()

In [91]:
words = []
for line in lines:
    sline = line.split()
    dline = dict([token.split('=') for token in sline if '=' in token])
    word = dline['word1']
    pol = dline['priorpolarity']
    if pol not in {'both', 'neutral'}:
        if pol in {'negative', 'weakneg'}:
            pol = 'NEG'
        else:
            pol = 'POS'
        words.append((word, pol))

word_dict = dict(words)

## Nuevo Tokenizer

In [94]:
tkn = CountVectorizer().build_tokenizer()
def my_tkn(s):
    tokens = tkn(s)
    return [word_dict.get(token, token) for token in tokens]

In [69]:
my_tkn('creaky bastard')
#print(X_dev[0].decode('utf-8')[:200])
#my_tkn(X_dev[0].decode('utf-8'))

['creaky', 'NEG']

In [110]:
#vect = CountVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)
vect.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tkn at 0x7eff3ed09b70>, use_idf=True,
        vocabulary=['POS', 'NEG'])

In [74]:
vect.transform(X_train[:10]).toarray()

array([[106, 107],
       [ 59,  30],
       [ 43,  22],
       [ 48,  31],
       [ 30,  30],
       [ 12,  22],
       [ 28,  22],
       [ 36,  30],
       [ 40,  35],
       [ 24,  41]])

## Feature Union

In [116]:
from sklearn.pipeline import FeatureUnion
vect = FeatureUnion([
    ('bow', CountVectorizer(binary=True)),
    ('pol', TfidfVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)),
])
vect.fit(X_train)

FeatureUnion(n_jobs=1,
       transformer_list=[('bow', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
    ...   tokenizer=<function my_tkn at 0x7eff3ed09b70>, use_idf=True,
        vocabulary=['POS', 'NEG']))],
       transformer_weights=None)

In [117]:
vect.transform(X_train[:1])

<1x32424 sparse matrix of type '<class 'numpy.float64'>'
	with 644 stored elements in Compressed Sparse Row format>

## Experimentos

In [113]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from util import print_eval

pipeline = Pipeline([
    ('vect', FeatureUnion([
        ('bow', CountVectorizer(binary=True)),
        ('pol', TfidfVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)),
    ])),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.85      0.88      0.87       162
        pos       0.86      0.82      0.84       138

avg / total       0.85      0.85      0.85       300

[[143  19]
 [ 25 113]]


In [99]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from util import print_eval

pipeline = Pipeline([
    ('vect', FeatureUnion([
        ('bow', CountVectorizer(
            binary=True,
            min_df=3,
            max_df=0.90,
            ngram_range=(1, 5),
        )),
        ('pol', CountVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)),
    ])),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.86

             precision    recall  f1-score   support

        neg       0.85      0.91      0.88       162
        pos       0.89      0.80      0.84       138

avg / total       0.87      0.86      0.86       300

[[148  14]
 [ 27 111]]
