# Sentiment Lexicon

In [None]:
import pickle
import numpy as np
import pandas as pd

from afinn import Afinn
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# NOTE(review): both files carry a .csv extension but are read here as pickles.
# pickle.load can execute arbitrary code while loading, so only point these
# paths at files from a trusted source.
with open('./data/lemma_text_train_set.csv', 'rb') as train_file:
    train_set = pickle.load(train_file)

with open('./data/lemma_text_val_set.csv', 'rb') as val_file:
    val_set = pickle.load(val_file)

In [None]:
def pos_tag_converter(tag):
    """
    Map a spaCy coarse POS tag onto the single-letter POS code passed to
    the SentiWordNet lookup.

    'NOUN' and 'DET' both become 'n', 'ADJ' becomes 'a', 'VERB' becomes 'v'
    and 'ADV' becomes 'r'.  Every other tag yields None.
    """
    # NOTE(review): determiners ('DET') are looked up as nouns -- confirm
    # this is intended rather than a stand-in for another tag.
    spacy_to_wordnet = {
        'NOUN': 'n',
        'DET': 'n',
        'ADJ': 'a',
        'VERB': 'v',
        'ADV': 'r',
    }
    return spacy_to_wordnet.get(tag)


def sentiwn_score(docs):
    """
    Score documents with SentiWordNet and derive a polarity class per document.

    For every token, ``token.pos_`` is converted with ``pos_tag_converter`` and
    the first SentiWordNet synset returned for ``token.text`` under that POS
    supplies two candidates: its positive score and its negated negative
    score.  Whichever has the larger magnitude is kept (the positive score
    wins ties).  Tokens whose tag converts to None, tokens with no synset and
    tokens whose lookup fails contribute 0.  A document's score is the plain
    sum of its token scores; it is NOT normalised by document length.

    Parameters
    ----------
    docs : iterable
        Documents, each an iterable of tokens exposing ``.pos_`` and ``.text``
        (presumably spaCy docs -- confirm against the data loader).

    Returns
    -------
    class_target : numpy.ndarray
        ``np.sign`` of every document score (1, 0 or -1).
    scores : list
        The raw summed score of every document, in input order.
    """
    scores = []
    for doc in docs:
        doc_total = 0

        for token in doc:
            pos = pos_tag_converter(token.pos_)
            if pos is None:
                # No WordNet equivalent for this tag: the token counts as 0.
                continue

            try:
                # Only the first synset is used; None when there is none.
                synset = next(iter(swn.senti_synsets(token.text, pos=pos)), None)
            except Exception:
                # Best-effort scoring: a failed lookup counts as neutral.
                # Catching Exception (not a bare except) lets KeyboardInterrupt
                # and SystemExit stop a long run.
                # NOTE(review): this may also hide a missing-corpus error and
                # silently score everything as 0 -- confirm the downloads ran.
                synset = None

            if synset is None:
                continue

            # Keep the stronger of the positive and (negated) negative scores.
            doc_total += max(synset.pos_score(), -synset.neg_score(), key=abs)

        scores.append(doc_total)

    class_target = np.sign(scores)
    return class_target, scores



# Single AFINN scorer, built with the library defaults, that afinn_score reads as a global.
afinn = Afinn()

def afinn_score(docs):
    """
    Score documents with the AFINN lexicon and derive a polarity class per document.

    Each token's ``text`` is scored with the module-level ``afinn`` scorer and
    the token scores are summed per document.

    Returns a tuple ``(class_target, scores)``: ``class_target`` is the
    ``np.sign`` of every document score and ``scores`` is the list of raw
    per-document sums, in input order.
    """
    scores = [sum(afinn.score(token.text) for token in doc) for doc in docs]
    return np.sign(scores), scores

In [None]:
# Score both splits with each lexicon.  The *_target arrays hold the sign of
# the lexicon score and serve as the class label for model classification:
# positive (1), neutral (0) or negative (-1).  The *_score lists keep the raw sums.

# SentiWordNet
train_swn_target, train_swn_score = sentiwn_score(train_set)
val_swn_target, val_swn_score = sentiwn_score(val_set)

# AFINN
train_afn_target, train_afn_score = afinn_score(train_set)
val_afn_target, val_afn_score = afinn_score(val_set)

In [None]:
# Write every target array to its own CSV: one value per row, no index, no header.
target_exports = [
    (train_swn_target, './data/train_swn_target.csv'),
    (val_swn_target, './data/val_swn_target.csv'),
    (train_afn_target, './data/train_afn_target.csv'),
    (val_afn_target, './data/val_afn_target.csv'),
]
for target, csv_path in target_exports:
    pd.DataFrame(target).to_csv(csv_path, index = False, header = False)

In [None]:
# Class balance of the SentiWordNet-derived training targets: counts, then shares.
train_swn_counts = pd.Series(train_swn_target).value_counts()
print("Train Set's Values per Class based on SentiWordNet Lexicon:\n", train_swn_counts, '\n',
        "Train Set's Percents per class based on SentiWordNet Lexicon:\n", train_swn_counts / len(train_swn_target))

Train Set's Values per Class based on SentiWordNet Lexicon:
  1.0    14691
-1.0    12042
 0.0     9632
dtype: int64 
 Train Set's Percents per class based on SentiWordNet Lexicon:
  1.0    0.403987
-1.0    0.331143
 0.0    0.264870
dtype: float64


In [None]:
# Class balance of the AFINN-derived training targets: counts, then shares.
train_afn_counts = pd.Series(train_afn_target).value_counts()
print("Train Set's Values per based on Afinn Lexicon:\n", train_afn_counts, '\n',
        "Train Set's Percents per class based on Afinn Lexicon:\n", train_afn_counts / len(train_afn_target))

Train Set's Values per based on Afinn Lexicon:
 -1.0    12698
 0.0    12086
 1.0    11581
dtype: int64 
 Train Set's Percents per class based on Afinn Lexicon:
 -1.0    0.349182
 0.0    0.332353
 1.0    0.318466
dtype: float64


In [None]:
# Class balance of the SentiWordNet-derived validation targets: counts, then shares.
val_swn_counts = pd.Series(val_swn_target).value_counts()
print('Val Sets Values per Class based on SentiWordNet Lexicon:\n', val_swn_counts, '\n',
        'Val Sets Percents per class based on SentiWordNet Lexicon:\n', val_swn_counts / len(val_swn_target))

Val Sets Values per Class based on SentiWordNet Lexicon:
  1.0    2005
-1.0    1682
 0.0    1313
dtype: int64 
 Val Sets Percents per class based on SentiWordNet Lexicon:
  1.0    0.4010
-1.0    0.3364
 0.0    0.2626
dtype: float64


In [None]:
# Class balance of the AFINN-derived validation targets: counts, then shares.
val_afn_counts = pd.Series(val_afn_target).value_counts()
print('Val Sets Values per based on Afinn Lexicon:\n', val_afn_counts, '\n',
        'Val Sets Percents per class based on Afinn Lexicon:\n', val_afn_counts / len(val_afn_target))

Val Sets Values per based on Afinn Lexicon:
 -1.0    1759
 0.0    1712
 1.0    1529
dtype: int64 
 Val Sets Percents per class based on Afinn Lexicon:
 -1.0    0.3518
 0.0    0.3424
 1.0    0.3058
dtype: float64


In [None]:
# Check the container type of the targets produced by np.sign in sentiwn_score.
type(train_swn_target)

numpy.ndarray