# Sentiment Analysis: Sentiment Lexicon

La idea es incorporar información externa acerca de la presencia de palabras positivas y negativas.

Opciones:
1. sustituir las palabras por marcadores especiales POS y NEG.
2. agregar nuevos features numéricos que indiquen la presencia/cantidad de POS y NEG.

Vamos por la 2.


Usamos:
- https://mpqa.cs.pitt.edu/lexicons/subj_lexicon/


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from util import load_datasets
# load_datasets() yields three splits; each one unpacks into an (inputs, labels) pair.
train, dev, test = load_datasets()
X_train, y_train = train
X_dev, y_dev = dev
X_test, y_test = test

## Estado del Arte Actual

In [3]:
from model import build_pipeline
from util import print_eval

# Baseline: fit the project's existing pipeline on train and report metrics on dev.
pipeline = build_pipeline()
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.89

             precision    recall  f1-score   support

        neg       0.87      0.93      0.90       162
        pos       0.91      0.84      0.87       138

avg / total       0.89      0.89      0.89       300

[[150  12]
 [ 22 116]]


## Carga del Lexicón

In [4]:
# Path to the subjectivity lexicon file (relative to the notebook's working directory).
filename = 'subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff'
# Read every line up front. The context manager guarantees the handle is
# closed even if reading raises.
with open(filename) as f:
    lines = f.readlines()

In [5]:
# Build a word -> 'POS'/'NEG' lookup from the raw lexicon lines.
words = []
for line in lines:
    sline = line.split()
    # Each entry is a run of whitespace-separated `key=value` fields; tokens
    # without '=' are ignored. Split only on the first '=' so a value that
    # itself contains '=' cannot break the dict construction.
    dline = dict(token.split('=', 1) for token in sline if '=' in token)
    if not dline:
        # Blank line (or a line with no fields at all): nothing to record.
        continue
    word = dline['word1']
    pol = dline['priorpolarity']
    if pol in {'both', 'neutral'}:
        # Entries without a single clear polarity are left out of the lookup.
        continue
    # 'negative'/'weakneg' collapse to NEG; every other remaining value maps to POS.
    pol = 'NEG' if pol in {'negative', 'weakneg'} else 'POS'
    words.append((word, pol))

# If a word appears in several entries, the last one wins.
word_dict = dict(words)

In [6]:
word_dict['abandon']

'NEG'

## Nuevo Tokenizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Word tokenizer built from a default CountVectorizer.
tkn = CountVectorizer().build_tokenizer()


def my_tkn(s):
    """Tokenize `s` and swap every token found in `word_dict` for its polarity tag.

    Tokens that are not keys of `word_dict` are returned unchanged, in their
    original order.
    """
    tagged = []
    for raw_token in tkn(s):
        tagged.append(word_dict.get(raw_token, raw_token))
    return tagged

In [8]:
my_tkn('creaky bastard')
#print(X_dev[0].decode('utf-8')[:200])
#my_tkn(X_dev[0].decode('utf-8'))

['creaky', 'NEG']

In [9]:
# Fixed two-term vocabulary: only the POS/NEG markers emitted by my_tkn are
# counted, so the output has one column per polarity.
vect = CountVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tkn at 0x7f3b3fa7f7b8>,
        vocabulary=['POS', 'NEG'])

In [10]:
vect.transform(X_train[:10]).toarray()
#vect.get_feature_names()

array([[106, 107],
       [ 59,  30],
       [ 43,  22],
       [ 48,  31],
       [ 30,  30],
       [ 12,  22],
       [ 28,  22],
       [ 36,  30],
       [ 40,  35],
       [ 24,  41]])

## Scaler

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Polarity counts followed by rescaling; with_mean=False disables centering
# in the scaler.
vect = Pipeline([
    ('pol', CountVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)),
    ('scl', StandardScaler(with_mean=False)),
])
vect.fit(X_train)



Pipeline(memory=None,
     steps=[('pol', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...    vocabulary=['POS', 'NEG'])), ('scl', StandardScaler(copy=True, with_mean=False, with_std=True))])

In [12]:
vect.transform(X_train[:10]).toarray()



array([[5.11584988, 6.67733308],
       [2.84750135, 1.87214946],
       [2.07529759, 1.3729096 ],
       [2.31661127, 1.93455444],
       [1.44788204, 1.87214946],
       [0.57915282, 1.3729096 ],
       [1.35135657, 1.3729096 ],
       [1.73745845, 1.87214946],
       [1.93050939, 2.18417437],
       [1.15830563, 2.55860426]])

## Feature Union

In [13]:
from sklearn.pipeline import FeatureUnion
# Lexicon branch: POS/NEG counts produced through my_tkn, then rescaled
# (no centering).
polarity_features = Pipeline([
    ('pol', CountVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)),
    ('scl', StandardScaler(with_mean=False)),
])
# Bag-of-words branch: binary presence indicators.
bow_features = CountVectorizer(binary=True)

# Combine both branches into a single feature matrix.
vect = FeatureUnion([
    ('bow', bow_features),
    ('pol', polarity_features),
])
vect.fit(X_train)



FeatureUnion(n_jobs=1,
       transformer_list=[('bow', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
    ... vocabulary=['POS', 'NEG'])), ('scl', StandardScaler(copy=True, with_mean=False, with_std=True))]))],
       transformer_weights=None)

In [14]:
vect.transform(X_train[:1])



<1x32424 sparse matrix of type '<class 'numpy.float64'>'
	with 644 stored elements in Compressed Sparse Row format>

## Experimentos

In [15]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from util import print_eval

# Lexicon branch: POS/NEG counts produced through my_tkn, then rescaled
# (no centering).
polarity_features = Pipeline([
    ('pol', CountVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)),
    ('scl', StandardScaler(with_mean=False)),
])

# Binary bag-of-words with default settings, joined with the lexicon branch.
features = FeatureUnion([
    ('bow', CountVectorizer(binary=True)),
    ('pol', polarity_features),
])

# Full model: combined features feeding a logistic regression classifier.
pipeline = Pipeline([
    ('vect', features),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)



accuracy	0.85

             precision    recall  f1-score   support

        neg       0.85      0.89      0.87       162
        pos       0.86      0.81      0.84       138

avg / total       0.85      0.85      0.85       300

[[144  18]
 [ 26 112]]


In [16]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from util import print_eval

# Bag-of-words branch: binary indicators over 1- to 5-grams, dropping terms
# seen in fewer than 3 documents or in more than 90% of them.
bow_features = CountVectorizer(
    binary=True,
    min_df=3,
    max_df=0.90,
    ngram_range=(1, 5),
)

# Lexicon branch: POS/NEG counts produced through my_tkn, then rescaled
# (no centering).
polarity_features = Pipeline([
    ('pol', CountVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)),
    ('scl', StandardScaler(with_mean=False)),
])

features = FeatureUnion([
    ('bow', bow_features),
    ('pol', polarity_features),
])

# Full model: combined features feeding a logistic regression classifier.
pipeline = Pipeline([
    ('vect', features),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)



accuracy	0.89

             precision    recall  f1-score   support

        neg       0.87      0.93      0.90       162
        pos       0.91      0.84      0.87       138

avg / total       0.89      0.89      0.89       300

[[150  12]
 [ 22 116]]
