In [1]:
# Import packages
import nltk
from pprint import pprint
import yaml
import sys
import os
import re
import pandas as pd

# Module-level default; `run_analysis` builds its own local value of the same name.
dict_tagged_sentences = ''
# Below indicates the relative path to
# positive/negative/inverter/incrementer/decrementer files
DICTIONARY_DIR_PREFIX = '../dicts/'

# Fetch every NLTK resource the pipeline needs so a fresh kernel can run top to bottom:
#   - 'punkt' backs Splitter's nltk.data.load('tokenizers/punkt/english.pickle')
#   - 'averaged_perceptron_tagger' backs nltk.pos_tag in POSTagger
# NOTE(review): recent NLTK releases may name these resources differently
# (e.g. 'punkt_tab') — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ZY\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Read the scraped LinkedIn export, then keep only the text of each post.
dell = pd.read_csv('../../../scrapy-hackathon/webscrape/linkedin/credit_suisse_hackathon_selenium.csv')
sentiment_analysis_creditS = pd.DataFrame(dell)['post']
# Preview one post (label 1 of the index produced by read_csv).
sentiment_analysis_creditS[1]

'FIS in India is proud to announce the successful completion of CODE FEST 2022.  The event was hosted virtually over a period of two months and 1500 participants worked on specific challenges and leveraged skills such as Java and .Net to develop innovative solutions for the challenges given to them. We had 100+ coding challenge qualifiers from companies like Google, Amazon, Adobe, Morgan Stanley, Walmart, KPMG, Credit Suisse, Deloitte, and Oracle, and the top 3 winners were announced in a virtual award ceremony in presence of our leaders. Many congratulations to all the winners and participants and special thanks to our event hosting partner Mercer Mettl and all our colleagues for their support. Meet the winners:'

In [3]:
class Splitter(object):
    """Breaks raw text into sentences and each sentence into word tokens."""

    def __init__(self):
        # Sentence boundary detector (punkt model) and Treebank word tokenizer.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        Tokenize a paragraph.

        input format: a paragraph of text
        output format: a list with one entry per sentence, each entry being
        the list of that sentence's words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        word_lists = []
        for sentence in self.nltk_splitter.tokenize(text):
            word_lists.append(self.nltk_tokenizer.tokenize(sentence))
        return word_lists


class POSTagger(object):
    """Attaches a part-of-speech tag to every token of every sentence."""

    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """
        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token is a
        (form, lemma, tags) triple. No lemmatization is performed here: the
        lemma slot simply repeats the word form, and tags holds the single
        tag returned by nltk.pos_tag.
            e.g: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])]]
        """
        tagged_sentences = []
        for sentence in sentences:
            # Reshape nltk's (word, tag) pairs into (form, lemma, [tag]) triples.
            triples = [(form, form, [label]) for (form, label) in nltk.pos_tag(sentence)]
            tagged_sentences.append(triples)
        return tagged_sentences

class DictionaryTagger(object):
    """
    Tags tokens (or multi-token expressions) with the labels found in one or
    more YAML lexicons. Each lexicon maps an expression to a list of tags;
    lexicons that share an expression have their tag lists concatenated.
    """

    def __init__(self, dictionary_paths):
        """
        Load and merge the lexicon files.

        :param dictionary_paths: iterable of paths to YAML lexicon files.
        :raises OSError: if one of the files cannot be opened.
        :rtype : object
        """
        dictionaries = []
        for path in dictionary_paths:
            # `with` closes every handle deterministically. The previous
            # `map(lambda x: x.close(), files)` built a lazy iterator that was
            # never consumed on Python 3, so no file was ever closed.
            with open(path, 'r') as dict_file:
                # NOTE(review): the lexicons look like plain mappings, for which
                # yaml.safe_load would be sufficient — consider switching.
                dictionaries.append(yaml.full_load(dict_file))

        self.dictionary = {}
        # Length in characters of the longest key. tag_sentence uses it as an
        # upper bound for the number of tokens in a lookup window.
        self.max_key_size = 0
        for curr_dict in dictionaries:
            if not curr_dict:
                # An empty YAML document loads as None; nothing to merge.
                continue
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                    self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        """Apply tag_sentence to every sentence and return the list of results."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        :param sentence: list of (form, lemma, tags) triples.
        :param tag_with_lemmas: look expressions up by lemma instead of form.
        :return: list of (form, lemma, tags) triples in which matched
            expressions are merged into one token carrying the lexicon tags;
            a matched single token also keeps its previous tags.
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while i < N:
            j = min(i + self.max_key_size, N)  # avoid overflow
            tagged = False
            # Shrink the window [i, j) from the right until a lexicon entry matches.
            while j > i:
                window = sentence[i:j]
                expression_form = ' '.join([word[0] for word in window]).lower()
                expression_lemma = ' '.join([word[1] for word in window]).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    # Jump past the match; this also ends the inner loop (j == i).
                    i = j
                    # Copy so that extending below never mutates the lexicon.
                    taggings = list(self.dictionary[literal])
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:  # if the tagged literal is a single token, conserve its previous taggings:
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence

def value_of(sentiment):
    """Map a tag to its polarity: +1 for 'positive', -1 for 'negative', 0 for anything else."""
    if sentiment == 'positive':
        polarity = 1
    elif sentiment == 'negative':
        polarity = -1
    else:
        polarity = 0
    return polarity

def sentiment_score(review):
    """
    Baseline score: add up the polarity of every tag of every token.

    :param review: list of sentences, each a list of (form, lemma, tags) triples.
    :return: integer sum of value_of(tag) over all tags.

    The previous body ignored `review` and read the module-level
    `dict_tagged_sentences` instead, so the argument had no effect.

    NOTE: a second `sentiment_score` defined further down in this cell
    rebinds the name, so this baseline version is not the one callers reach.
    """
    return sum(value_of(tag) for sentence in review for token in sentence for tag in token[2])

def sentence_score(sentence_tokens, previous_token, acum_score):
    """
    Score one sentence, letting each token's tags modify the token after it.

    :param sentence_tokens: list of (form, lemma, tags) triples.
    :param previous_token: the triple preceding the first token, or None.
    :param acum_score: score accumulated so far; returned unchanged when
        sentence_tokens is empty.
    :return: acum_score plus the (possibly modified) score of every token.

    A token's own score is the sum of value_of over its tags. If the previous
    token carries 'inc' the score is doubled, else if it carries 'dec' it is
    halved, else if it carries 'inv' its sign is flipped (only the first
    matching modifier applies).

    Implemented as a loop: the earlier recursive form used one stack frame and
    one list slice per token, so a very long sentence could exceed the
    interpreter's recursion limit.
    """
    if not sentence_tokens:
        return acum_score
    for current_token in sentence_tokens:
        token_score = sum([value_of(tag) for tag in current_token[2]])
        if previous_token is not None:
            previous_tags = previous_token[2]
            if 'inc' in previous_tags:
                token_score *= 2.0
            elif 'dec' in previous_tags:
                token_score /= 2.0
            elif 'inv' in previous_tags:
                token_score *= -1.0
        acum_score = acum_score + token_score
        previous_token = current_token
    return acum_score

def sentiment_score(sentences):
    """
    Total score of a text: the sum of sentence_score over its sentences,
    each sentence starting with no previous token and a 0.0 accumulator.
    """
    per_sentence = (sentence_score(tokens, None, 0.0) for tokens in sentences)
    return sum(per_sentence)


def run_analysis(text, dicttagger=None, verbose=True):
    """
    Score the sentiment of one piece of text with the lexicon pipeline:
    sentence/word splitting -> POS tagging -> lexicon tagging -> scoring.

    :param text: the text to analyse. None or NaN (what pandas yields for an
        empty CSV cell) is treated as "no text" and scores 0.
    :param dicttagger: optional, already constructed DictionaryTagger. Pass one
        to avoid re-reading the five lexicon files on every call. When omitted,
        a tagger is built from positive/negative/inc/dec/inv under
        DICTIONARY_DIR_PREFIX, as before.
    :param verbose: when True (the default, matching the previous behaviour)
        pretty-print the tagged sentences; pass False to keep output quiet.
    :return: the value of sentiment_score for the tagged sentences.
    """
    # A missing value cannot be tokenized; `text != text` is the NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return 0

    splitter = Splitter()    # Splits a long single string into tokenized sentences.
    postagger = POSTagger()  # Part-Of-Speech tagger.

    # If text contains multiple sentences, this line splits it into individual sentences.
    splitted_sentences = splitter.split(text)

    # This performs Part-Of-Speech tagging.
    pos_tagged_sentences = postagger.pos_tag(splitted_sentences)

    # Positive/Negative lexicon + incrementer/decrementer/inverter lexicon.
    if dicttagger is None:
        lexicon_files = ('positive.yml', 'negative.yml', 'inc.yml', 'dec.yml', 'inv.yml')
        dicttagger = DictionaryTagger([DICTIONARY_DIR_PREFIX + name for name in lexicon_files])
    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
    if verbose:
        pprint(dict_tagged_sentences)
    score = sentiment_score(dict_tagged_sentences)
    return score

In [4]:
################### This is the MAIN section ###################
if __name__ == "__main__":

    # Scores at or above this value are labelled "Very Positive".
    VERY_POSITIVE_THRESHOLD = 6

    sentiment_array = []
    total_score = 0
    length = len(sentiment_analysis_creditS)
    for post in sentiment_analysis_creditS:
        # Run sentiment scoring
        score = run_analysis(post)
        if (score == 0):
            sentiment_array.append("Neutral")
        elif (score > 0 and score < VERY_POSITIVE_THRESHOLD):
            sentiment_array.append("Positive")
        elif (score >= VERY_POSITIVE_THRESHOLD):
            sentiment_array.append("Very Positive")
        elif (score < 0):
            sentiment_array.append("Negative")
        total_score += score
    # An empty dataset has no meaningful average; avoid ZeroDivisionError.
    average_score = total_score / length if length else float('nan')

[[('Last', 'Last', ['JJ']),
  ('weekend', 'weekend', ['NN']),
  (',', ',', [',']),
  ('Perlyn', 'Perlyn', ['NNP']),
  ('Chew', 'Chew', ['NNP']),
  (',', ',', [',']),
  ('Ryan', 'Ryan', ['NNP']),
  ('Wong', 'Wong', ['NNP']),
  ('and', 'and', ['CC']),
  ('I', 'I', ['PRP']),
  ('took', 'took', ['VBD']),
  ('part', 'part', ['NN']),
  ('in', 'in', ['IN']),
  ('the', 'the', ['DT']),
  ('Credit', 'Credit', ['NNP']),
  ('Suisse', 'Suisse', ['NNP']),
  ('CodeIT', 'CodeIT', ['NNP']),
  ('Challenge', 'Challenge', ['NNP']),
  ('2022', '2022', ['CD']),
  ('.', '.', ['.'])],
 [('We', 'We', ['PRP']),
  ('came', 'came', ['VBD']),
  ('in', 'in', ['IN']),
  ('1st', '1st', ['CD']),
  (',', ',', [',']),
  ('and', 'and', ['CC']),
  ('had', 'had', ['VBD']),
  ('lots', 'lots', ['NNS']),
  ('of', 'of', ['IN']),
  ('fun', 'fun', ['positive', 'NN']),
  ('working', 'working', ['VBG']),
  ('on', 'on', ['IN']),
  ('the', 'the', ['DT']),
  ('coding', 'coding', ['NN']),
  ('challenges', 'challenges', ['NNS']),
  ('.

In [7]:
# One row per post holding its sentiment label; also written to disk as CSV.
sentiment_df = pd.DataFrame({'Sentiment Score': sentiment_array})
sentiment_df.to_csv('creditS_scores.csv')

In [8]:
# Display the per-post sentiment labels built in the previous cell.
sentiment_df

Unnamed: 0,Sentiment Score
0,Positive
1,Very Positive
2,Neutral
3,Positive
4,Very Positive
...,...
64,Neutral
65,Negative
66,Negative
67,Neutral
