In [1]:
import pandas as pd
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as tokenizer

nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import string
import re

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\towhi\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\towhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\towhi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\towhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\towhi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
stop_words = set(stopwords.words('english'))

In [3]:
def senti_lookup(word):
    synsets = list(swn.senti_synsets(word))
    avg_pos, avg_neg, avg_obj = 0.0, 0.0, 0.0
    size = len(synsets)
    if size == 0:
        raise Error(f"'{word}' not found.")
    
    for synset in synsets:
        pos, neg, obj = synset.pos_score(), synset.neg_score(), synset.obj_score()
        avg_pos += pos/size
        avg_neg += neg/size
        avg_obj += obj/size
    
    return round(avg_pos, 5), round(avg_obj, 5), round(avg_neg, 5)

In [4]:
def tokenize(sentence):
    INVALID_POS = ["CC", "CD", "DT", "EX", "IN", "LS", "PDT", "POS", "PRP", "PRP$", "RP", "TO", "WDT", "WP", "WRB"]
    sentence = re.sub(f"[{re.escape(string.punctuation)}\…]+", " ", sentence)
    # Filter common words
    tokens = nltk.pos_tag(sentence.split())
    tokens_ = tokens
    tokens = [tok for (tok, pos) in tokens if tok.lower() not in stop_words and pos not in INVALID_POS]
    return tokens

In [5]:
def calc_metrics_1(text):
    def eval_neutrality(pos, obj, neg):
        THRESHOLD = 0.5
        if obj >= THRESHOLD:
            return 1
        return 0
    
    def eval_objectivity(pos, obj, neg):
        GAP_THRESHOLD = 0.65
        OBJ_THRESHOLD = 0.65
        gap = abs(pos - neg) / max(pos, neg)
        if obj <= OBJ_THRESHOLD and gap >= GAP_THRESHOLD:
            return 1
        return 0

    tokens = tokenize(text)
    size = len(tokens)
    n_neutral = 0
    n_biased = 0
    for token in tokens:
        try:
            pos, obj, neg = senti_lookup(token)
            n_neutral += eval_neutrality(pos, obj, neg)
            n_biased += eval_objectivity(pos, obj, neg)
        except:
            pass
    
    neu_score = n_neutral / size
    obj_score = 1.0 - (n_biased / size)
    
    return round(neu_score, 5), round(obj_score, 5)

In [6]:
def calc_metrics_2(text):
    neu_score = 0.0
    obj_score = 0.0
    sentences = re.split(f"[{re.escape(string.punctuation)}\…]+", text)
    for sent in sentences:
        tokens = tokenize(text)
        size = max(len(tokens), 1)
        avg_pos, avg_obj, avg_neg = 0.0, 0.0, 0.0
    
        for token in tokens:
            try:
                pos, obj, neg = senti_lookup(token)
                avg_pos += pos
                avg_obj += obj
                avg_neg += neg
            except:
                pass
    
        avg_pos = avg_pos / size
        avg_obj = avg_obj / size
        avg_neg = avg_neg / size
    
        # Neutrality test
        if avg_obj >= avg_pos and avg_obj >= avg_neg:
            neu_score += 1.0
    
        # Objectivity test
        THRESHOLD = 0.1
        if abs(avg_pos - avg_neg) <= THRESHOLD:
            obj_score += 1.0
    
    neu_score = neu_score / len(sentences)
    obj_score = obj_score / len(sentences)
    
    return round(neu_score, 5), round(obj_score, 5)

In [7]:
def calc_scores(corpus, method):
    '''
    corpus: list of summaries
    method: m-1 or m-2
    '''
    avg_neu_score = 0
    avg_obj_score = 0
    corpus_size = corpus.shape[0]
    for text in corpus:
        if method == "m-1":
            neu_score, obj_score = calc_metrics_1(text)
        elif method == "m-2":
            neu_score, obj_score = calc_metrics_2(text)
        else:
            raise Error(f"{method} is not supported. Try either 'm-1' or 'm-2'")
            
        avg_neu_score += neu_score / corpus_size
        avg_obj_score += obj_score / corpus_size
    
    return round(avg_neu_score, 5), round(avg_obj_score, 5)

In [8]:
def evaluate(path, method, is_ref=False):
    data = pd.read_csv(path)
    avg_neu_score = 0
    avg_obj_score = 0
    if is_ref:
        for col in ["summ_1", "summ_2", "summ_3"]:
            corpus = data[col].values
            neu_score, obj_score = calc_scores(corpus, method)
            avg_neu_score += neu_score/3
            avg_obj_score += obj_score/3
    else:
        corpus = data["summary"].values
        avg_neu_score, avg_obj_score = calc_scores(corpus, method)
    
    return round(avg_neu_score, 5), round(avg_obj_score, 5)

In [9]:
evaluate(path="./outputs/train_300epochs.baseHtilt_cosineHhatversHhat_HtiltmeanHtiltcontext_FULL.csv", method="m-1")

FileNotFoundError: [Errno 2] No such file or directory: './outputs/train_300epochs.baseHtilt_cosineHhatversHhat_HtiltmeanHtiltcontext_FULL.csv'

In [None]:
evaluate(path="./outputs/train_300epochs.baseHtilt_cosineHhatversHhat_HtiltmeanHtiltcontext_FULL.csv", method="m-2")

(1.0, 0.59903)