# Oxford Man Institute NLP Tutorial 

## 1. Introduction 

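This tutorial compares two approaches to sentiment analysis of financial text. Section 2 implements a traditional dictionary-based method that counts positive and negative words from the Loughran-McDonald dictionary, with a simple negation check. Sections 3 and 4 turn to BERT-based classifiers and use the `transformers_interpret` package to visualise which words drive each prediction.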

## 2. Traditional sentiment analysis

### Import packages and load dictionaries

In [8]:
import numpy as np
import re

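Load the Loughran-McDonald sentiment dictionary from `data/LoughranMcDonald_dict.npy`. The code below uses two of its entries, `'Positive'` and `'Negative'`, each holding the words of that sentiment class.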

In [9]:
# The dictionary is stored as a pickled Python object inside the .npy file, so pickling has to be
# enabled explicitly; .item() then extracts the single stored object (used as a dict below).
# NOTE(security): unpickling can execute arbitrary code -- only load this file from a trusted source.
lmdict = np.load('data/LoughranMcDonald_dict.npy', allow_pickle=True).item()

Negation words: a positive word that closely follows one of these words is counted as negative.

In [9]:
# Words treated as negation cues by `negated`. Contractions are listed both with and without
# the apostrophe so that either spelling in the input text is recognised.
negate = [
    "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
    "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
    "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt",
    "neither",
    "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
    "neednt", "needn't",
    "never", "none", "nope", "nor", "not", "nothing", "nowhere",
    "oughtnt", "shant", "shouldnt", "wasnt", "werent",
    "oughtn't", "shan't", "shouldn't", "wasn't", "weren't",
    "without",
    "wont", "wouldnt", "won't", "wouldn't",
    "rarely", "seldom", "despite", "no", "nobody",
]

In [44]:
def negated(word):
    """
    Return True if `word`, compared case-insensitively, is one of the negation words in `negate`.
    """
    return word.lower() in negate

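Count the positive and negative words in a text, treating a positive word that closely follows a negation word as negative.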

In [44]:
def tone_count_with_negation_check(dict, article, window=3, negation_words=None):
    """
    Count positive and negative dictionary words in a text, with a simple negation check.

    Negation is checked for positive words only: a positive word is counted as negative
    (and recorded with the suffix ' (with negation)') when a negation word occurs among the
    `window` words immediately preceding it. Negative words are always counted as negative.
    A summary of the counts and the matched words is printed before returning.

    Parameters
    ----------
    dict : mapping
        Sentiment dictionary with 'Positive' and 'Negative' entries, each a collection of
        words. Words are matched against the lower-cased text, so the entries are presumably
        lower-case (TODO confirm against the dictionary file). The parameter name shadows the
        builtin but is kept so that existing keyword callers keep working.
    article : str
        Text to analyse. It is lower-cased and split into alphabetic tokens; "n't" and "'s"
        suffixes are kept attached to their word.
    window : int, optional
        Number of preceding words inspected for a negation word (default 3).
    negation_words : iterable of str, optional
        Custom negation words. When omitted, the module-level `negated` helper is used.

    Returns
    -------
    list
        [word_count, pos_count, neg_count, pos_words, neg_words]
    """
    positive_lexicon = dict['Positive']
    negative_lexicon = dict['Negative']

    if negation_words is None:
        is_negation = negated
    else:
        custom_negations = frozenset(w.lower() for w in negation_words)
        is_negation = custom_negations.__contains__

    pos_count = 0
    neg_count = 0

    pos_words = []
    neg_words = []

    input_words = re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', article.lower())

    word_count = len(input_words)

    for i, word in enumerate(input_words):
        if word in negative_lexicon:
            neg_count += 1
            neg_words.append(word)
        if word in positive_lexicon:
            # The slice is clamped at the start of the text, so words near the beginning
            # are checked against however many predecessors actually exist (possibly none).
            preceding = input_words[max(0, i - window):i]
            if any(is_negation(previous) for previous in preceding):
                neg_count += 1
                neg_words.append(word + ' (with negation)')
            else:
                pos_count += 1
                pos_words.append(word)

    print('The results with negation check:', end='\n\n')
    print('The # of positive words:', pos_count)
    print('The # of negative words:', neg_count)
    print('The list of found positive words:', pos_words)
    print('The list of found negative words:', neg_words)
    print('\n', end='')

    results = [word_count, pos_count, neg_count, pos_words, neg_words]

    return results
 
    
# Example: score a single news sentence against the loaded dictionary
article = (
    'Pharmaceuticals group Orion Corp reported a fall in its third-quarter earnings '
    'that were hit by larger expenditures on R&D and marketing'
)

tone_count_with_negation_check(lmdict, article)

The results with negation check:

The # of positive words: 0
The # of negative words: 0
The list of found positive words: []
The list of found negative words: []



[23, 0, 0, [], []]

## 3. BERT classification

In [81]:
from datasets import load_dataset

# Fetch the 'sentences_allagree' configuration of the Financial PhraseBank dataset.
# The data is downloaded on first use, so this cell needs network access.
fin_dataset = load_dataset(path='financial_phrasebank', name='sentences_allagree')

Downloading and preparing dataset financial_phrasebank/sentences_allagree (download: 665.91 KiB, generated: 296.26 KiB, post-processed: Unknown size, total: 962.17 KiB) to C:\Users\felix\.cache\huggingface\datasets\financial_phrasebank\sentences_allagree\1.0.0\a6d468761d4e0c8ae215c77367e1092bead39deb08fbf4bffd7c0a6991febbf0...


ConnectionError: Couldn't reach https://www.researchgate.net/profile/Pekka_Malo/publication/251231364_FinancialPhraseBank-v10/data/0c96051eee4fb1d56e000000/FinancialPhraseBank-v10.zip (error 429)

## 4. Visualisation

In [36]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers_interpret import SequenceClassificationExplainer

To visualise which words in each phrase are most important for the prediction, we will use the Python package `transformers_interpret`.

In [75]:
# Hub identifiers of the two sequence-classification checkpoints to compare
fin_model_name, model_name = "ProsusAI/finbert", "textattack/bert-base-uncased-SST-2"

# Load each model together with its matching tokenizer (finance-specific checkpoint first)
fin_model, fin_tokenizer = (
    AutoModelForSequenceClassification.from_pretrained(fin_model_name),
    AutoTokenizer.from_pretrained(fin_model_name),
)
model, tokenizer = (
    AutoModelForSequenceClassification.from_pretrained(model_name),
    AutoTokenizer.from_pretrained(model_name),
)

Downloading:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [76]:
# Wrap each (model, tokenizer) pair in an explainer; calling an explainer on a text
# then yields word attributions for that text.
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
fin_cls_explainer = SequenceClassificationExplainer(fin_model, fin_tokenizer)

In [77]:
# Explain the same sentence with both models. Each result gets its own name so that the
# first one is no longer overwritten by the second.
explained_text = "Pharmaceuticals group Orion Corp reported a fall in its third-quarter earnings that were hit by larger expenditures on R&D and marketing"
bert_word_attributions = cls_explainer(explained_text)
fin_word_attributions = fin_cls_explainer(explained_text)

# Kept for backward compatibility: this name previously ended up holding the FinBERT result.
word_attributions = fin_word_attributions

In [78]:
# Name of the class cls_explainer predicted for the text it was last called on
cls_explainer.predicted_class_name

'LABEL_0'

In [79]:
# Render the word-importance visualisation for cls_explainer's latest explanation
bert_vis = cls_explainer.visualize()

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,LABEL_0 (0.99),LABEL_0,1.37,[CLS] pharmaceuticals group orion corp reported a fall in its third - quarter earnings that were hit by larger expenditures on r & d and marketing [SEP]
,,,,


In [80]:
# Render the word-importance visualisation for fin_cls_explainer's latest explanation
fin_bert_vis = fin_cls_explainer.visualize()

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,negative (0.98),negative,2.61,[CLS] pharmaceuticals group orion corp reported a fall in its third - quarter earnings that were hit by larger expenditures on r & d and marketing [SEP]
,,,,
