In [1]:
# importing libraries
import nltk
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# import nltk
# nltk.download('sentiwordnet')

from src.nlp_preprocessing import pos_tag
from src.nlp_preprocessing import penn_to_wn

In [2]:
# Load the CSV into a DataFrame; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='../data/preprocessed_small_sample.csv')

In [3]:
def get_sentiment_score(tokens):
    """
    Calculate sentiment score for a list of tokens.

    Each token is POS-tagged, its Penn Treebank tag is converted to a WordNet
    POS, the first WordNet synset for (word, POS) is looked up, and that
    synset's SentiWordNet (positive - negative) score is added to the total.
    A token is silently skipped when any of these steps yields nothing.

    Parameters:
    - tokens (list): List of tokens representing words.

    Returns:
    - score (float): Sum of (pos_score - neg_score) over all tokens that
      could be scored; 0.0 when no token could be scored.
    """

    # Start from a float so the return type is a float even when every
    # token is skipped.
    score = 0.0

    # Tag individual tokens with their part of speech; the loop below
    # expects an iterable of (word, tag) pairs.
    tags = pos_tag(tokens)

    # Iterate through each token and its corresponding tag
    for word, tag in tags:

        # Convert Penn Treebank tag to WordNet tag
        wordnet_tag = penn_to_wn(tag)
        # If WordNet tag is not found, skip to the next token
        if not wordnet_tag:
            continue

        # Get synsets (sets of synonyms) for the word with the specified POS tag
        synsets = wn.synsets(word, pos=wordnet_tag)
        # If no synsets are found, skip to the next token
        if not synsets:
            continue

        # Use the first synset returned by WordNet for this word/POS
        synset = synsets[0]
        # Get sentiment score for the synset using SentiWordNet
        sentiwordnet_synset = swn.senti_synset(synset.name())
        # senti_synset() returns None when the synset has no SentiWordNet
        # entry; treat that like any other unscorable token instead of
        # failing with AttributeError on the score accessors below.
        if sentiwordnet_synset is None:
            continue

        # Update the score by adding the difference between positive and negative scores
        score += (sentiwordnet_synset.pos_score() - sentiwordnet_synset.neg_score())

    return score

In [6]:
# testing: positive SentiWordNet score of the first adjective synset
# that WordNet returns for "amazing"
swn.senti_synset(
    wn.synsets("amazing", pos=wn.ADJ)[0].name()
).pos_score()

0.5

In [12]:
# testing: print the SentiWordNet scores of every sense of 'abandon'
synsets = swn.senti_synsets('abandon')

for i in synsets:
    # A single print per sense: sep="\n" plus the trailing "" yields the
    # four labelled lines followed by one blank separator line.
    # NOTE: the "POS OBJ" label is attached to obj_score().
    print(
        f"POS score: {i.pos_score()}",
        f"NEG score: {i.neg_score()}",
        f"POS OBJ: {i.obj_score()}",
        f"Overall Score: {i.pos_score() - i.neg_score()}",
        "",
        sep="\n",
    )

POS score: 0.0
NEG score: 0.375
POS OBJ: 0.625
Overall Score: -0.375

POS score: 0.125
NEG score: 0.375
POS OBJ: 0.5
Overall Score: -0.25

POS score: 0.0
NEG score: 0.125
POS OBJ: 0.875
Overall Score: -0.125

POS score: 0.0
NEG score: 0.125
POS OBJ: 0.875
Overall Score: -0.125

POS score: 0.0
NEG score: 0.0
POS OBJ: 1.0
Overall Score: 0.0

POS score: 0.0
NEG score: 0.125
POS OBJ: 0.875
Overall Score: -0.125

POS score: 0.0
NEG score: 0.375
POS OBJ: 0.625
Overall Score: -0.375



In [14]:
# testing: print the SentiWordNet scores of every sense of 'love'
synsets = swn.senti_synsets('love')

for i in synsets:
    # A single print per sense: sep="\n" plus the trailing "" yields the
    # four labelled lines followed by one blank separator line.
    # NOTE: the "POS OBJ" label is attached to obj_score().
    print(
        f"POS score: {i.pos_score()}",
        f"NEG score: {i.neg_score()}",
        f"POS OBJ: {i.obj_score()}",
        f"Overall Score: {i.pos_score() - i.neg_score()}",
        "",
        sep="\n",
    )

POS score: 0.625
NEG score: 0.0
POS OBJ: 0.375
Overall Score: 0.625

POS score: 0.375
NEG score: 0.0
POS OBJ: 0.625
Overall Score: 0.375

POS score: 0.125
NEG score: 0.0
POS OBJ: 0.875
Overall Score: 0.125

POS score: 0.25
NEG score: 0.0
POS OBJ: 0.75
Overall Score: 0.25

POS score: 0.0
NEG score: 0.0
POS OBJ: 1.0
Overall Score: 0.0

POS score: 0.0
NEG score: 0.0
POS OBJ: 1.0
Overall Score: 0.0

POS score: 0.5
NEG score: 0.0
POS OBJ: 0.5
Overall Score: 0.5

POS score: 1.0
NEG score: 0.0
POS OBJ: 0.0
Overall Score: 1.0

POS score: 0.625
NEG score: 0.0
POS OBJ: 0.375
Overall Score: 0.625

POS score: 0.375
NEG score: 0.125
POS OBJ: 0.5
Overall Score: 0.25

