<a href="https://www.kaggle.com/code/dascient/dascient-vader-scu-uap-lexicon?scriptVersionId=144273619" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Natural Language Toolkit: vader - TAMPERED w/ SCU Lexicon
#
# Copyright (C) 2001-2023 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
#         George Berry <geb97@cornell.edu> (modifications)
#         Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.

"""
If you use the VADER sentiment analysis tools, please cite:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""

import math
import re
import string
from itertools import product
import warnings
warnings.filterwarnings("ignore")
from IPython.display import clear_output
import nltk.data
clear_output()

class VaderConstants:
    """
    A class to keep the Vader lists and constants.
    """

    ##Constants##
    # (empirically derived mean sentiment intensity rating increase for booster words)
    B_INCR = 0.293
    B_DECR = -0.293

    # (empirically derived mean sentiment intensity rating increase for using
    # ALLCAPs to emphasize a word)
    C_INCR = 0.733

    N_SCALAR = -0.74

    NEGATE = {'UFO',
             'again',
             'alien',
             'another',
             'attack',
             'beam',
             'beautiful',
             'bed',
             'bedroom',
             'being',
             'believe',
             'capture',
             'caught',
             'chase',
             'chemtrail',
             'cloud',
             'clouds',
             'coming',
             'confuse',
             'contact',
             'daily',
             "didn't",
             'discover',
             'dream',
             'earth',
             'every',
             'excited',
             'experience',
             'figure',
             'fireball',
             'firework',
             'fireworks',
             'frequent',
             'happy',
             'help',
             'hope',
             'less',
             'me',
             'mufon',
             'needle',
             'nightly',
             'often',
             'orange',
             'pain',
             'paranormal',
             'please',
             'presence',
             'psychic',
             'review',
             'room',
             'scared',
             'seen',
             'shock',
             'shocked',
             'skin',
             'sleep',
             'star-like',
             'starlike',
             'starship',
             'storm',
             'surgery',
             'taken',
             'their',
             'thigh',
             'tragedy',
             'trance',
             'ufo',
             "ufo's",
             'ufos',
             'visit',
             'visitation',
             'want',
             'zoom',
             'zoomed'
             }


    # booster/dampener 'intensifiers' or 'degree adverbs'
    # https://en.wiktionary.org/wiki/Category:English_degree_adverbs

    BOOSTER_DICT = {
        'northeastern'  : B_INCR,
        'law'  : B_INCR,
        'duty'  : B_INCR,
        'appearing'  : B_INCR,
        'background'  : B_INCR,
        'clockwise'  : B_INCR,
        'momentarily'  : B_INCR,
        'degrees'  : B_INCR,
        'appearance'  : B_INCR,
        'approx'  : B_INCR,
        'approximate'  : B_INCR,
        'within'  : B_INCR,
        'NORAD'  : B_INCR,
        'plasma'  : B_INCR,
        'astronomical'  : B_INCR,
        'Lockheed'  : B_INCR,
        'retire'  : B_INCR,
        'locomotion'  : B_INCR,
        'feather'  : B_INCR,
        'seemed'  : B_INCR,
        'witnesses'  : B_INCR,
        'aviation'  : B_INCR,
        'nuclear'  : B_INCR,
        'air force'  : B_INCR,
        'north'  : B_INCR,
        'seem'  : B_INCR,
        'fort'  : B_INCR,
        'register'  : B_INCR,
        'translucent'  : B_INCR,
        'hexagon'  : B_INCR,
        'normal'  : B_INCR,
        'target'  : B_INCR,
        'morning'  : B_INCR,
        'rotar'  : B_INCR,
        'southwest'  : B_INCR,
        'AF'  : B_INCR,
        'south'  : B_INCR,
        'approach'  : B_INCR,
        'opaque'  : B_INCR,
        'hexagonal'  : B_INCR,
        'guard'  : B_INCR,
        'briefly'  : B_INCR,
        'rectangular'  : B_INCR,
        'upward'  : B_INCR,
        'officer'  : B_INCR,
        'telescope'  : B_INCR,
        'propulsion'  : B_INCR,
        'perimeter'  : B_INCR,
        'traverse'  : B_INCR,
        'army'  : B_INCR,
        'eastern'  : B_INCR,
        'elongate'  : B_INCR,
        'customer'  : B_INCR,
        'binoculars'  : B_INCR,
        'pulled'  : B_INCR,
        'emanate'  : B_INCR,
        'scope'  : B_INCR,
        'southern'  : B_INCR,
        'assume'  : B_INCR,
        'capability'  : B_INCR,
        'radar'  : B_INCR,
        'structure'  : B_INCR,
        'tree-line'  : B_INCR,
        'appear'  : B_INCR,
        'western'  : B_INCR,
        'defense'  : B_INCR,
        'irregular'  : B_INCR,
        'call'  : B_INCR,
        'analyze'  : B_INCR,
        'coast  '  : B_INCR,
        'vertex'  : B_INCR,
        'altitude'  : B_INCR,
        'grid'  : B_INCR,
        'blend'  : B_INCR,
        'public'  : B_INCR,
        'northern'  : B_INCR,
        'simultaneous'  : B_INCR,
        'perspective'  : B_INCR,
        'capable'  : B_INCR,
        'navy'  : B_INCR,
        'degree'  : B_INCR,
        'family'  : B_INCR,
        'east'  : B_INCR,
        'enforcement'  : B_INCR,
        'position'  : B_INCR,
        'F14'  : B_INCR,
        'astronomy'  : B_INCR,
        'police'  : B_INCR,
        'base'  : B_INCR,
        'security'  : B_INCR,
        'velocity'  : B_INCR,
        'similar'  : B_INCR,
        'civilian'  : B_INCR,
        'pilot'  : B_INCR,
        'F15'  : B_INCR,
        'southeastern'  : B_INCR,
        'Boeing'  : B_INCR,
        'counterclockwise'  : B_INCR,
        'estimate'  : B_INCR,
        'sentry'  : B_INCR,
        'sheriff'  : B_INCR,
        'assign'  : B_INCR,
        'B2'  : B_INCR,
        'cloak'  : B_INCR,
        'west'  : B_INCR,
        'northwest'  : B_INCR,
        'military'  : B_INCR,
        'marine'  : B_INCR,
        'engine'  : B_INCR,
        'reception'  : B_INCR,
        'treeline'  : B_INCR,
        '911'  : B_INCR,
        'radio'  : B_INCR,
        'southeast'  : B_INCR,
        'height'  : B_INCR,
        'rotating'  : B_INCR,
        'rotate'  : B_INCR,
        'octagonal'  : B_INCR,
        'day'  : B_INCR,
        'physics'  : B_INCR,
        'border'  : B_INCR,
        'northwestern'  : B_INCR,
        'cluster'  : B_INCR,
        'appeared'  : B_INCR,
        'solid'  : B_INCR,
        'rotary'  : B_INCR,
        'horizontal'  : B_INCR,
        'approximately'  : B_INCR,
        'equidistant'  : B_INCR,
        'independent'  : B_INCR,
        'naval'  : B_INCR,
        'policeman'  : B_INCR,
        'resemble'  : B_INCR,
        'northeast'  : B_INCR,
        'commercial'  : B_INCR,
        'southwestern'  : B_INCR,
        'octagon'  : B_INCR,
        'motion'  : B_INCR,
        'deputy'  : B_INCR,
        'daytime'  : B_INCR,
        'flight'  : B_INCR,
        'biologist'  : B_INCR,
        'silo'  : B_INCR,
        'patrol'  : B_INCR,
        'chemist'  : B_INCR,
        'perfect'  : B_INCR,
        'astronomer'  : B_INCR,
        'azimuth'  : B_INCR,
        'radioed'  : B_INCR,
        'elevation'  : B_INCR,
        'coast guard'  : B_INCR,
        'engineer'  : B_INCR,
        'reports'  : B_DECR,
        'towards'  : B_DECR,
        'directions'  : B_DECR,
        'attention'  : B_DECR,
        'directly'  : B_DECR,
        'friends'  : B_DECR,
        'triangle'  : B_DECR,
        'ft'  : B_DECR,
        'pictures'  : B_DECR,
        'event'  : B_DECR,
        'father'  : B_DECR,
        'observe '  : B_DECR,
        'observing'  : B_DECR,
        'extremely'  : B_DECR,
        'standing'  : B_DECR,
        'visible'  : B_DECR,
        'spherical'  : B_DECR,
        'angle'  : B_DECR,
        'hover'  : B_DECR,
        'triangular'  : B_DECR,
        'location'  : B_DECR,
        'diameter'  : B_DECR,
        'direction'  : B_DECR,
        'close'  : B_DECR,
        'speed'  : B_DECR,
        'hovered'  : B_DECR,
        'between'  : B_DECR,
        'object'  : B_DECR,
        'glow'  : B_DECR,
        'sister'  : B_DECR,
        'clearly'  : B_DECR,
        'never'  : B_DECR,
        'trees'  : B_DECR,
        'son'  : B_DECR,
        'near'  : B_DECR,
        'facing'  : B_DECR,
        'movement'  : B_DECR,
        'realized'  : B_DECR,
        'photos'  : B_DECR,
        'behind'  : B_DECR,
        'witnessed'  : B_DECR,
        'cloudy'  : B_DECR,
        'immediate'  : B_DECR,
        'notice'  : B_DECR,
        'camera'  : B_DECR,
        'large'  : B_DECR,
        'brother'  : B_DECR,
        'underneath'  : B_DECR,
        'asked'  : B_DECR,
        'closer'  : B_DECR,
        'called'  : B_DECR,
        'photo'  : B_DECR,
        'yards'  : B_DECR,
        'highway'  : B_DECR,
        'down'  : B_DECR,
        'objects'  : B_DECR,
        'video'  : B_DECR,
        'assumed'  : B_DECR,
        'horizon'  : B_DECR,
        'tree'  : B_DECR,
        'station'  : B_DECR,
        'mile'  : B_DECR,
        'shape'  : B_DECR,
        'together'  : B_DECR,
        'ground'  : B_DECR,
        'mountain'  : B_DECR,
        'mother'  : B_DECR,
        'observed'  : B_DECR,
        'curious'  : B_DECR,
        'fairly'  : B_DECR,
        'feet'  : B_DECR,
        'miles'  : B_DECR,
        'ocean'  : B_DECR,
        'below'  : B_DECR,
        'brightness'  : B_DECR,
        'bottom'  : B_DECR,
        'hill'  : B_DECR,
        'pattern'  : B_DECR,
        'remember'  : B_DECR,
        'daughter'  : B_DECR,
        'hovering'  : B_DECR,
        'joke'  : B_DECR,
        'stationary'  : B_DECR,
        'metallic'  : B_DECR,
        'probably'  : B_DECR,
        'top'  : B_DECR,
        'might'  : B_DECR,
        'approaching'  : B_DECR,
        'slightly'  : B_DECR,
        'smaller'  : B_DECR,
        'toward'  : B_DECR,
        'above'  : B_DECR,
        'different'  : B_DECR,
        'husband'  : B_DECR,
        'glowing'  : B_DECR,
        'mom'  : B_DECR,
        'water'  : B_DECR,
        'wasnt'  : B_DECR,
        'path'  : B_DECR,
        'friend'  : B_DECR,
        'dad'  : B_DECR,
        'observation'  : B_DECR,
        'center'  : B_DECR,
        'began'  : B_DECR,
        'course'  : B_DECR,
        'smoke'  : B_DECR,
        'cigar'  : B_DECR,
        'located'  : B_DECR,
        'overhead'  : B_DECR,
        'minutes'  : B_DECR,
        'dog'  : B_DECR,
        'witness'  : B_DECR,
        'cold'  : B_DECR,
        'report'  : B_DECR,
        'wife'  : B_DECR,
        'appears'  : B_DECR,
        'bright'  : B_DECR,
        'moon'  : B_DECR,
        'moving'  : B_DECR,
        'craft'  : B_DECR,
        'brighter'  : B_DECR,
    }

    # check for special case idioms using a sentiment-laden keyword known to SAGE
    SPECIAL_CASE_IDIOMS = {
        "the shit": 3,
        "the bomb": 3,
        "bad ass": 1.5,
        "yeah right": -2,
        "cut the mustard": 2,
        "kiss of death": -1.5,
        "hand to mouth": -2,
    }

    # for removing punctuation
    REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]")

    PUNC_LIST = [
        ".",
        "!",
        "?",
        "&",
        "&#44",
        "&#39",
        ",",
        ";",
        ":",
        "-",
        "'",
        '"',
        "!!",
        "!!!",
        "??",
        "???",
        "?!?",
        "!?!",
        "?!?!",
        "!?!?",
    ]
    
    def __init__(self):
        pass

    
    def negated(self, input_words, include_nt=True):
        """
        Determine if input contains negation words
        """
        neg_words = self.NEGATE
        if any(word.lower() in neg_words for word in input_words):
            return True
        if include_nt:
            if any("n't" in word.lower() for word in input_words):
                return True
        #for first, second in pairwise(input_words):
        #    if second.lower() == "least" and first.lower() != "at":
        #        return True
        return False


    def normalize(self, score, alpha=15):
        """
        Normalize the score to be between -1 and 1 using an alpha that
        approximates the max expected value
        """
        norm_score = score / math.sqrt((score * score) + alpha)
        return norm_score


    def scalar_inc_dec(self, word, valence, is_cap_diff):
        """
        Check if the preceding words increase, decrease, or negate/nullify the
        valence
        """
        scalar = 0.0
        word_lower = word.lower()
        if word_lower in self.BOOSTER_DICT:
            scalar = self.BOOSTER_DICT[word_lower]
            if valence < 0:
                scalar *= -1
            # check if booster/dampener word is in ALLCAPS (while others aren't)
            if word.isupper() and is_cap_diff:
                if valence > 0:
                    scalar += self.C_INCR
                else:
                    scalar -= self.C_INCR
        return scalar



class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.

    After construction the instance exposes:

    * ``text`` - the input coerced to ``str``
    * ``words_and_emoticons`` - whitespace tokens longer than one character,
      with leading/trailing punctuation from ``punc_list`` removed
    * ``is_cap_diff`` - ``True`` when some, but not all, tokens are ALL CAPS
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        """
        :param text: the text to analyse; ``bytes`` are decoded as UTF-8 and
            any other non-string value (e.g. a number or NaN cell coming from
            a DataFrame) is converted with ``str()``
        :param punc_list: punctuation strings that may be glued to a word
        :param regex_remove_punctuation: compiled pattern matching every
            character to delete when extracting bare words
        """
        # The previous `str(text.encode("utf-8"))` raised AttributeError for
        # bytes and for numbers, and would have produced a "b'...'" repr.
        if isinstance(text, (bytes, bytearray)):
            text = bytes(text).decode("utf-8", errors="replace")
        elif not isinstance(text, str):
            text = str(text)
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        # doesn't separate words from
        # adjacent punctuation (keeps emoticons & contractions)
        self.words_and_emoticons = self._words_and_emoticons()
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Map every "punctuation + word" and "word + punctuation" spelling to
        the bare word, for each word of the text, e.g.::

            {
                'cat,': 'cat',
                ',cat': 'cat',
            }
        """
        # removes punctuation (but loses emoticons & contractions)
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # single-character leftovers are dropped
        words_only = {w for w in no_punc_text.split() if len(w) > 1}

        # the product gives (',', 'cat') and ('cat', ',')
        words_punc_dict = {
            punc + word: word for punc, word in product(self.PUNC_LIST, words_only)
        }
        # "word + punctuation" spellings overwrite any identical key above
        words_punc_dict.update(
            {word + punc: word for word, punc in product(words_only, self.PUNC_LIST)}
        )
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Split the text on whitespace and strip leading/trailing punctuation.

        Leaves contractions and most emoticons intact, but does not preserve
        punc-plus-letter emoticons (e.g. :D).  Tokens of length one are
        discarded.
        """
        words_punc_dict = self._words_plus_punc()
        return [
            words_punc_dict.get(token, token)
            for token in self.text.split()
            if len(token) > 1
        ]

    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        allcap_words = sum(1 for word in words if word.isupper())
        return 0 < allcap_words < len(words)



class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt"):
        """
        Load the lexicon and set up the rule constants.

        :param lexicon_file: resource name handed to ``nltk.data.load``.  The
            loaded value is stored on ``self.lexicon_file`` and is split on
            newlines by ``make_lex_dict``, so it is treated as text.
        """
        self.constants = VaderConstants()
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()


    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            (word, measure) = line.strip().split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict


    def polarity_scores(self, text):
        """
        Score the sentiment of ``text``.

        Each token gets a valence (0 for booster words and for "kind" when
        followed by "of", otherwise whatever ``sentiment_valence`` computes),
        the list is adjusted around the word "but", and the result is
        summarised by ``score_valence``.

        :param text: the text to analyse
        :returns: dict with keys ``neg``, ``neu``, ``pos`` and ``compound``

        :note: Hashtags are not taken into consideration (e.g. #BAD is neutral). If you
            are interested in processing the text in the hashtags too, then we recommend
            preprocessing your data to remove the #, after which the hashtag text may be
            matched as if it was a normal word in the sentence.
        """
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        last_index = len(words_and_emoticons) - 1
        # enumerate() supplies the true position of every token.  Looking the
        # position up with list.index() returned the FIRST occurrence, so a
        # repeated word was scored against the context of its first use.
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            item_lower = item.lower()
            is_kind_of = (
                i < last_index
                and item_lower == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            )
            if is_kind_of or item_lower in self.constants.BOOSTER_DICT:
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        return self.score_valence(sentiments, text)


    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        """
        Compute the valence of one token and append it to ``sentiments``.

        If the lower-cased token is not in ``self.lexicon`` the incoming
        ``valence`` is appended unchanged.  Otherwise the lexicon value is
        adjusted, in this order: ALL-CAPS emphasis, then for each of the up to
        three preceding tokens that are not themselves lexicon words the
        booster scalar, the negation/"never" rules and (for the third
        preceding token only) the idiom rules, and finally the "least" rule.

        :param valence: starting valence for the token
        :param sentitext: object providing ``is_cap_diff`` and
            ``words_and_emoticons``
        :param item: the token in its original casing
        :param i: position of ``item`` within ``words_and_emoticons``
        :param sentiments: list of valences computed so far; mutated in place
        :returns: the same ``sentiments`` list with one value appended
        """
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            # start_i = 0, 1, 2 looks at the token 1, 2, 3 places before `item`
            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    # idioms are only checked once three preceding tokens exist
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2,"under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments


    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * self.constants.N_SCALAR
        return valence

    def _but_check(self, words_and_emoticons, sentiments):
        words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons]
        but = {"but"} & set(words_and_emoticons)
        if but:
            bi = words_and_emoticons.index(next(iter(but)))
            for sidx, sentiment in enumerate(sentiments):
                if sidx < bi:
                    sentiments[sidx] = sentiment * 0.5
                elif sidx > bi:
                    sentiments[sidx] = sentiment * 1.5
        return sentiments

    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}"

        twoonezero = "{} {} {}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}"

        threetwoone = "{} {} {}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{} {}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}"
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{} {} {}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (
                    float(sentiment_score) + 1
                )  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (
                    float(sentiment_score) - 1
                )  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict


In [None]:
%%time
import pandas as pd
!pip install xlrd
# Pull in data
mufon = pd.read_excel('/kaggle/input/scu-nlp-uap-lexicon/Powell with Comments.xls',sheet_name='mufon_cms_2017-04-09',header=0)
witnesses = pd.read_excel('/kaggle/input/scu-nlp-uap-lexicon/Powell with Comments.xls',sheet_name='Sheet1',header=0)
mufon = mufon.dropna(how='all').copy()
mufon.head()

In [None]:
# diminish the mufon dataset
import numpy as np
# Keep reports with a known, positive witness count and a description length
# under 3000.  NaN never compares equal to anything, so the previous
# `Witnesses != np.nan` test was always True and filtered nothing; `notna()`
# states the intent explicitly.
mufon1 = mufon[
    mufon.Witnesses.notna()
    & (mufon.Witnesses > 0)
    & (mufon.Length < 3000)
]

In [None]:
#############################################
# Load the UFO lexicon sheet; the column headers sit on spreadsheet row 8
# (header=7).  Remove fully empty rows and the unnamed leading column.
lex = pd.read_excel(
    '/kaggle/input/scu-nlp-uap-lexicon/UFO lexicon rev2.xls',
    sheet_name='Sheet1',
    header=7,
)
lex = lex.dropna(how='all')
lex = lex.drop(columns='Unnamed: 0').copy()

# Keep only the words with a non-zero rating.
# (Alternatives explored earlier: filtering on 'Previous Rating' != 0,
#  'Previous Rating' >= 3, or 'RATING' >= 3.)
lex_nonzero = lex.loc[lex['RATING'] != 0]
lex_nonzero
#############################################

In [None]:
%%time
import time
from nltk.tokenize import word_tokenize

df_sample1 = mufon1.sample(1000)

# hash through each comment to find only those that include non-zero lexicon words
lexicon_favored1 = df_sample1
lexicon_favored1['rating'] = pd.Series()
lexicon_favored1['lexicon_word'] = pd.Series()
lexicon_favored1['word_count'] = pd.Series()

for i,word in lex_nonzero.WORD.items():
    for i2,piece in df_sample1['Detailed Description'].items():
        if word in word_tokenize(piece.lower()):
            #print('index',i2,'\nword',word, '\npiece',piece.lower(), '\nrating', lex_nonzero.RATING[i],'\n')

            # add rating from lexicon
            lexicon_favored1['rating'][i2] = lex_nonzero.RATING[i]

            # add up every word usage in comments
            lexicon_favored1['lexicon_word'][i2] = lex_nonzero.WORD[i]
            
            # word count
            lexicon_favored1['word_count'][i2] = len(word_tokenize(piece.lower()))
        else:
            # word count
            lexicon_favored1['word_count'][i2] = len(word_tokenize(piece.lower()))
#clear_output()

In [None]:
# Notebook display of the sampled reports, including the `rating`,
# `lexicon_word` and `word_count` columns added in the previous cell.
lexicon_favored1

In [None]:
# https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/
# https://github.com/cjhutto/vaderSentiment
#
# Natural Language Toolkit: vader - TAMPERED w/ SCU Lexicon
def sentiment_scores(sentence):
    """
    Score one sentence with the SCU-modified VADER analyzer.

    The analyzer is created on first use and cached on the function object;
    constructing it loads and parses the whole lexicon, which previously
    happened again on every call.

    :param sentence: text to score
    :returns: one-row DataFrame with columns "% Positive", "% Negative",
        "% Neutral" and "% Compound"
    """
    analyzer = getattr(sentiment_scores, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_scores._analyzer = analyzer

    sentiment_dict = analyzer.polarity_scores(sentence)

    return pd.DataFrame([{"% Positive":sentiment_dict['pos'],
                          "% Negative":sentiment_dict['neg'],
                          "% Neutral":sentiment_dict['neu'],
                          "% Compound":sentiment_dict['compound']
                         }])

# Apply to df['comments'] column.
def NLP_PowellScore(commentsColumns):
    """
    Score every comment and return the four score lists.

    Each comment is scored exactly once (it used to be scored four times,
    once per returned list).

    :param commentsColumns: object with ``.items()`` yielding
        ``(label, comment)`` pairs, e.g. a pandas Series of comments
    :returns: tuple ``(PowellPositive, PowellNegative, PowellNeutral,
        PowellCompound)`` of lists, in the iteration order of the input
    """
    PowellPositive, PowellNegative, PowellNeutral, PowellCompound = [], [], [], []

    for _, eachComment in commentsColumns.items():
        scores = sentiment_scores(eachComment)
        PowellPositive.append(scores["% Positive"][0])
        PowellNegative.append(scores["% Negative"][0])
        PowellNeutral.append(scores["% Neutral"][0])
        PowellCompound.append(scores["% Compound"][0])

    return PowellPositive,PowellNegative,PowellNeutral,PowellCompound

In [None]:
%%time
#mufon = mufon.sample(30)
# defining Powell Scores by sentiment outputs: Positive, Negative, & Neutral
lexicon_favored1["PowellPositive"] = NLP_PowellScore(lexicon_favored1['Detailed Description'])[0]
lexicon_favored1["PowellNegative"] = NLP_PowellScore(lexicon_favored1['Detailed Description'])[1]
lexicon_favored1["PowellNeutral"] = NLP_PowellScore(lexicon_favored1['Detailed Description'])[2]
lexicon_favored1["PowellCompound"] = NLP_PowellScore(lexicon_favored1['Detailed Description'])[3]

# PowellScore 
lexicon_favored1["PowellScore"] = (lexicon_favored1["PowellPositive"]-lexicon_favored1["PowellNegative"])/lexicon_favored1["PowellNeutral"]

# veracity
lexicon_favored1["veracity"] = lexicon_favored1["PowellScore"]*lexicon_favored1["Length"]*lexicon_favored1["Score"]

df3 = lexicon_favored1.sort_values('veracity',ascending=False).reset_index(drop=True)
df3.head(100)\
        .style.background_gradient(cmap ='seismic').set_properties(**{'font-size': '11px'})

In [None]:
# Write the scored sample to CSV, keeping the DataFrame index as a column.
# NOTE(review): the frame is `lexicon_favored1` but the file is named
# 'lexicon_favored2.csv' - confirm the name is intentional.
lexicon_favored1.to_csv('lexicon_favored2.csv',index=True)

In [None]:
# en fin