In [1]:
!pip install transformers
!pip install sentence-transformers



In [7]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
     |████████████████████████████████| 5.9 MB 2.9 MB/s            
[?25hCollecting jinja2
  Using cached Jinja2-3.0.2-py3-none-any.whl (133 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Using cached murmurhash-1.0.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (452 kB)
     |████████████████████████████████| 452 kB 3.2 MB/s            
Collecting thinc<8.1.0,>=8.0.9
  Downloading thinc-8.0.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (635 kB)
     |████████████████████████████████| 635 kB 4.5 MB/s            
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
     |████████████████████████████████| 42 kB 1.7 MB/s            
[?25hCollecting wasabi<1.1.0,>=0.8.1
  Using c

In [1]:
# Transformer libraries
from transformers import pipeline # For sentiment analysis
from sentence_transformers import SentenceTransformer # For estimating the distance between (sub)sequences
from sentence_transformers import util

In [2]:
# Other libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer # For extracting n-grams from a sentence
import spacy

**Section 1**

Sentiment Analysis using FinBERT 

FinBERT is fine-tuned on financial text datasets and classifies each input into one of three classes: positive, neutral, or negative.

In [3]:
classifier = pipeline('sentiment-analysis', model='ProsusAI/finbert')

# FinBERT distinguishes 3 classes - positive, neutral, and negative. However, the output
# of the pipeline corresponds only to the class with the highest probability
# (see the example call on `input_list` in the next cell).

In [5]:
# Example sentences to classify; the pipeline returns one {'label', 'score'} dict per sentence.
input_list = [
    'Biden comes to Europe with a goodwill gesture, a planned announcement that the United States will donate 500 million dollars to Pfizer Inc/BioNTech (PFE.N)',
    'US will provide coronavirus vaccine doses to about 100 countries over the next two years, three sources familiar with the matter told Reuters',
    'The Canadian minister for trade stated that the business sentiment was down 30% in Canada',
    'For your second question we would launch around F4 to F5 products next year and in the FY24 we would have similar plans',
]

classifier(input_list)

[{'label': 'positive', 'score': 0.7007909417152405},
 {'label': 'neutral', 'score': 0.8183404207229614},
 {'label': 'negative', 'score': 0.9747421145439148},
 {'label': 'neutral', 'score': 0.8846039175987244}]

In [5]:
# Tabulate the FinBERT predictions for `input_list` (defined in the previous cell; the
# identical copy that used to live here was removed to avoid the two drifting apart).
num_sentences = len(input_list)
out_list = classifier(input_list)

# Each pipeline result is a dict with 'label' and 'score' keys, so the frame is built in
# one call. This avoids pre-filling an integer frame with 0 and then overwriting it with
# strings/floats row by row, which relies on implicit dtype upcasting.
out_matrix = (
    pd.DataFrame(out_list, columns=['label', 'score'])
    .rename(columns={'label': 'Label', 'score': 'Score'})
)
assert len(out_matrix) == num_sentences, "expected one prediction per input sentence"

out_matrix

Unnamed: 0,Label,Score
0,positive,0.700791
1,neutral,0.81834
2,negative,0.974742
3,neutral,0.884604


**Section 2**

Identify the subsequence that has the highest semantic similarity to the original sequence



In [7]:
# Load the spaCy "en_core_web_sm" pipeline; it is used below for POS tags
# (token.pos_) and stop-word flags (token.is_stop).
nlp = spacy.load("en_core_web_sm")

In [8]:
# Instantiate SBERT (sentence-transformers) with the 'all-mpnet-base-v2' checkpoint.
# The model files are downloaded on first use (see the progress bars below).
sentence_model = SentenceTransformer('all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Sentence analysed throughout Section 2, kept as a one-element list (accessed as orig_corpus[0]).
orig_corpus = ['For your second question we would launch around 4 to 5 products next year and in the FY24 we would have similar plans']

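Section 2.1 -- Prefix numeric tokens with a letter ("F", e.g. "4" becomes "F4") so that they are not dropped during the generation of n-grams (CountVectorizer's default tokenizer ignores single-character tokens)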

In [14]:
# Preview the part-of-speech tag that spaCy assigns to every token of the original sentence.
# A named loop variable is used instead of `_`: IPython reserves `_` for the previous
# cell's output and stops updating it once user code assigns to it.
pos_preview = nlp(orig_corpus[0])
for token in pos_preview:
    print('TEXT', token.text, ' TOKEN', token.pos_)

TEXT For  TOKEN ADP
TEXT your  TOKEN PRON
TEXT second  TOKEN ADJ
TEXT question  TOKEN NOUN
TEXT we  TOKEN PRON
TEXT would  TOKEN AUX
TEXT launch  TOKEN VERB
TEXT around  TOKEN ADV
TEXT 4  TOKEN NUM
TEXT to  TOKEN PART
TEXT 5  TOKEN NUM
TEXT products  TOKEN NOUN
TEXT next  TOKEN ADJ
TEXT year  TOKEN NOUN
TEXT and  TOKEN CCONJ
TEXT in  TOKEN ADP
TEXT the  TOKEN DET
TEXT FY24  TOKEN NOUN
TEXT we  TOKEN PRON
TEXT would  TOKEN AUX
TEXT have  TOKEN VERB
TEXT similar  TOKEN ADJ
TEXT plans  TOKEN NOUN


In [11]:
# Infer numbers using POS tags for conversion to character strings
corpus_tokens = nlp(orig_corpus[0])
print('tokens', corpus_tokens)
col_names = ['Text', 'POS']

# Build the frame once from a list of [text, POS] rows. Growing a DataFrame with
# pd.concat inside a loop copies the whole frame on every iteration (quadratic) and
# concatenating with an empty frame is deprecated in recent pandas versions.
output_df = pd.DataFrame(
    [[token.text, token.pos_] for token in corpus_tokens],
    columns=col_names,
)

output_df

tokens For your second question we would launch around 4 to 5 products next year and in the FY24 we would have similar plans


Unnamed: 0,Text,POS
0,For,ADP
1,your,PRON
2,second,ADJ
3,question,NOUN
4,we,PRON
5,would,AUX
6,launch,VERB
7,around,ADV
8,4,NUM
9,to,PART


In [15]:
# Locate the tokens tagged as numbers.
is_number = output_df['POS'] == 'NUM'
num_idx = np.where(is_number)[0]
print(num_idx)

# Prefix the numeric tokens with 'F' on a copy, leaving `output_df` untouched.
# Mutating `output_df` in place made this cell non-idempotent: running it a second
# time would prefix the same tokens again ('FF4', 'FF5').
revised_df = output_df.copy()
revised_df.loc[is_number, 'Text'] = 'F' + revised_df.loc[is_number, 'Text']
print(revised_df)

# Re-assemble the sentence from the (partly rewritten) tokens.
revised_corpus = revised_df['Text'].str.cat(sep=' ')
revised_corpus

[ 8 10]
        Text    POS
0        For    ADP
1       your   PRON
2     second    ADJ
3   question   NOUN
4         we   PRON
5      would    AUX
6     launch   VERB
7     around    ADV
8         F4    NUM
9         to   PART
10        F5    NUM
11  products   NOUN
12      next    ADJ
13      year   NOUN
14       and  CCONJ
15        in    ADP
16       the    DET
17      FY24   NOUN
18        we   PRON
19     would    AUX
20      have   VERB
21   similar    ADJ
22     plans   NOUN


'For your second question we would launch around F4 to F5 products next year and in the FY24 we would have similar plans'

Section 2.2 -- Generate n-grams and estimate the similarity between the n-grams and the original sentence

In [29]:
# Exploratory preview of the n-gram extraction on the revised sentence,
# using n-grams of 11 to 13 tokens (ngram_range=(11,13)).
a = CountVectorizer(ngram_range=(11,13))
b = a.fit_transform([revised_corpus])
c = a.get_feature_names_out()
# Only a single document is fitted; the output below shows a count of 1 for every n-gram.
print(b.toarray())
print(c)

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
['around f4 to f5 products next year and in the fy24'
 'around f4 to f5 products next year and in the fy24 we'
 'around f4 to f5 products next year and in the fy24 we would'
 'f4 to f5 products next year and in the fy24 we'
 'f4 to f5 products next year and in the fy24 we would'
 'f4 to f5 products next year and in the fy24 we would have'
 'f5 products next year and in the fy24 we would have'
 'f5 products next year and in the fy24 we would have similar'
 'f5 products next year and in the fy24 we would have similar plans'
 'for your second question we would launch around f4 to f5'
 'for your second question we would launch around f4 to f5 products'
 'for your second question we would launch around f4 to f5 products next'
 'launch around f4 to f5 products next year and in the'
 'launch around f4 to f5 products next year and in the fy24'
 'launch around f4 to f5 products next year and in the fy24 we'
 'next year 

In [17]:
# Generate n-grams (candidate subsequences) of a predefined size from a single sentence.
# `revised_corpus` is the sentence whose numbers were prefixed with "F" (e.g. "F4"),
# so that they survive the vectorizer's tokenisation.
corpus = [revised_corpus]
vectorizer = CountVectorizer(ngram_range=(11,13))
vectorizer.fit(corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
# replacement. It returns a NumPy array, so convert to a plain list because later
# cells call .copy() / .pop() on `sub_sequence_list` and concatenate it with lists.
sub_sequence_list = vectorizer.get_feature_names_out().tolist()

len(sub_sequence_list)



36

In [18]:
# Display the candidate n-grams (lower-cased by the vectorizer, as the output shows).
sub_sequence_list

['around f4 to f5 products next year and in the fy24',
 'around f4 to f5 products next year and in the fy24 we',
 'around f4 to f5 products next year and in the fy24 we would',
 'f4 to f5 products next year and in the fy24 we',
 'f4 to f5 products next year and in the fy24 we would',
 'f4 to f5 products next year and in the fy24 we would have',
 'f5 products next year and in the fy24 we would have',
 'f5 products next year and in the fy24 we would have similar',
 'f5 products next year and in the fy24 we would have similar plans',
 'for your second question we would launch around f4 to f5',
 'for your second question we would launch around f4 to f5 products',
 'for your second question we would launch around f4 to f5 products next',
 'launch around f4 to f5 products next year and in the',
 'launch around f4 to f5 products next year and in the fy24',
 'launch around f4 to f5 products next year and in the fy24 we',
 'next year and in the fy24 we would have similar plans',
 'products next

In [30]:
# Embed the full sentence (the query) and every candidate n-gram (the passages) with SBERT.
query_embedding = sentence_model.encode(corpus)
passage_embedding = sentence_model.encode(sub_sequence_list)

# Pairwise cosine similarity: entry [i][j] compares query i with passage j.
# Only one query is encoded here, so row 0 holds all the scores of interest.
pairwise_scores = util.cos_sim(query_embedding, passage_embedding)
similarity_list = pairwise_scores.numpy()

# argsort is ascending, so the last three positions of row 0 index the three
# best-matching subsequences (the best one comes last).
top_3_idx = similarity_list.argsort()[0, -3:]

In [55]:
# Positions (into sub_sequence_list) of the three most similar n-grams, in ascending order of similarity.
top_3_idx

array([22, 23, 35])

In [31]:
# Pair every candidate subsequence with its cosine similarity to the full sentence.
imp_array = pd.DataFrame(
    {
        'sequence': sub_sequence_list,
        'cosine_similarity': similarity_list[0],
    }
)
imp_array

Unnamed: 0,sequence,cosine_similarity
0,around f4 to f5 products next year and in the ...,0.779239
1,around f4 to f5 products next year and in the ...,0.808505
2,around f4 to f5 products next year and in the ...,0.827823
3,f4 to f5 products next year and in the fy24 we,0.7892
4,f4 to f5 products next year and in the fy24 we...,0.777601
5,f4 to f5 products next year and in the fy24 we...,0.750036
6,f5 products next year and in the fy24 we would...,0.726326
7,f5 products next year and in the fy24 we would...,0.768127
8,f5 products next year and in the fy24 we would...,0.851965
9,for your second question we would launch aroun...,0.501838


In [35]:
# Use POS tag to determine the start of the best subsequence.
# top_3_idx is sorted by ascending similarity, so its last entry is the best match.
best_sentence = imp_array.iloc[top_3_idx[-1], 0]
print(best_sentence)

# Obtain POS and stop-word flag per token. The frame is built once from a list of rows
# instead of being grown with pd.concat inside a loop (quadratic copying, and
# concatenation with an empty frame is deprecated in recent pandas versions).
best_sent_tokens = nlp(best_sentence)
col_names = ['Text', 'POS', 'STOPWORD']
output_df = pd.DataFrame(
    [[token.text, token.pos_, token.is_stop] for token in best_sent_tokens],
    columns=col_names,
)

output_df

your second question we would launch around f4 to f5 products next year


Unnamed: 0,Text,POS,STOPWORD
0,your,PRON,True
1,second,ADJ,False
2,question,NOUN,False
3,we,PRON,True
4,would,AUX,True
5,launch,VERB,False
6,around,ADP,True
7,f4,PROPN,False
8,to,PART,True
9,f5,VERB,False


Section 2.3 -- Potentially use the locations of "verbs" and "stopwords" to generate bullet points

In [54]:
# Positions of the tokens tagged as verbs.
verb_pos = np.where(output_df['POS'] == 'VERB')[0]
print(set(verb_pos))
# Positions of the tokens flagged as stop words.
stop_pos = np.where(output_df['STOPWORD'] == True)[0]
print(stop_pos)

# Verbs that are not stop words, in sentence order. Sets have no guaranteed iteration
# order, so the positions are sorted explicitly instead of reversing whatever order the
# set difference happens to produce.
first_verb = sorted(set(verb_pos.tolist()) - set(stop_pos.tolist()))
print(first_verb)

# Keep everything from the first such verb onwards; if the sentence has no non-stop-word
# verb, fall back to the whole sentence instead of raising an IndexError.
start_pos = first_verb[0] if first_verb else 0
output_df.iloc[start_pos:, 0].str.cat(sep=' ')

{9, 5}
[ 0  3  4  6  8 11]
[5, 9]


'launch around f4 to f5 products next year'

**Section 3**

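Use the Maximal Marginal Relevance (MMR) criterion to identify the most distinct subsequences from the same sequence/sentence. Each candidate subsequence $D_i$ is scored as $\alpha \cdot \mathrm{sim}(D_i, Q) - (1 - \alpha) \cdot \max_j \mathrm{sim}(D_i, D_j)$, where $Q$ is the original sentence, $D_j$ ranges over the subsequences already selected, and $\mathrm{sim}$ is the cosine similarity of the sentence embeddings; the highest-scoring candidate is added at each step.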

In [56]:
def mmr(in_sequence_list, in_best_sequence, in_query, in_alpha, in_count_elements, in_model=None):
    """Greedily extend a list of sequences using Maximal Marginal Relevance (MMR).

    At every step the remaining candidate D_i that maximises

        in_alpha * sim(D_i, Q) - (1 - in_alpha) * max_j sim(D_i, D_j)

    is appended to the result, where Q is the query, D_j ranges over the sequences
    selected so far and sim is the cosine similarity between sentence embeddings.

    Parameters
    ----------
    in_sequence_list : sequence of str
        Candidate sequences to choose from. Not modified.
    in_best_sequence : sequence of str
        Sequences that are already selected (the seed of the result). Not modified.
        May be empty, in which case the first pick has no redundancy penalty.
    in_query : str or sequence of str
        The query to compare against. If several queries are given only the first
        one is used.
    in_alpha : float
        Trade-off between relevance to the query (1) and diversity with respect to
        the sequences already selected (0).
    in_count_elements : int
        Maximum number of candidates to add.
    in_model : object with an ``encode`` method, optional
        Embedding model to use. Defaults to the notebook-level ``sentence_model``.

    Returns
    -------
    list of str
        A new list: ``in_best_sequence`` followed by the selected candidates in the
        order in which they were picked.
    """
    model = sentence_model if in_model is None else in_model

    # Work on copies so that the caller's lists are never modified.
    candidates = list(in_sequence_list)
    selected = list(in_best_sequence)

    num_elements = min(in_count_elements, len(candidates))
    if num_elements <= 0:
        return selected

    def _unit_rows(texts):
        # Encode `texts` and scale every embedding to unit length, so that a plain dot
        # product between two rows equals their cosine similarity.
        vectors = np.atleast_2d(np.asarray(model.encode(texts), dtype=np.float64))
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / np.clip(norms, 1e-12, None)

    # Every text is embedded exactly once; the loop below only combines similarities.
    query_unit = _unit_rows(in_query)[0]
    candidate_unit = _unit_rows(candidates)

    # sim(D_i, Q)
    relevance = candidate_unit @ query_unit

    # max_j sim(D_i, D_j) over the sequences selected so far
    if selected:
        redundancy = (candidate_unit @ _unit_rows(selected).T).max(axis=1)
    else:
        redundancy = np.zeros(len(candidates))

    available = np.ones(len(candidates), dtype=bool)
    for _step in range(num_elements):
        # Identify the remaining sequence with the highest MMR score; candidates that
        # were already picked are masked out.
        per_seq_val = in_alpha * relevance - (1 - in_alpha) * redundancy
        per_seq_val = np.where(available, per_seq_val, -np.inf)
        max_idx = int(np.argmax(per_seq_val))

        selected.append(candidates[max_idx])
        available[max_idx] = False

        # The newly selected sequence can only raise each candidate's redundancy.
        redundancy = np.maximum(redundancy, candidate_unit @ candidate_unit[max_idx])

    return selected

Section 3.1 -- Test MMR function

In [20]:
# Separate the best-matching subsequence from the remaining candidates.
best_pos = top_3_idx[2]
best_sequence = [sub_sequence_list[best_pos]]

# Everything before and after the best position, i.e. the list without that one entry.
sequence_list = sub_sequence_list[:best_pos] + sub_sequence_list[best_pos + 1:]
len(sequence_list)

35

(1) If alpha=1, then subsequences with the highest match to the original query will be selected

In [57]:
# With in_alpha=1 the redundancy term is multiplied by 0, so only similarity to the query matters.
# Requires `sequence_list` and `best_sequence` from the "Split the sequences" cell above
# to exist in the kernel (the NameError below comes from running this cell without it).
mmr(sequence_list, best_sequence, revised_corpus, 1, 3)

NameError: name 'sequence_list' is not defined

In [22]:
# Rank all subsequences by similarity to the full sentence and print the four best,
# for comparison with the alpha=1 MMR selection.
zz = imp_array.sort_values(by='cosine_similarity', ascending=False, ignore_index=True)
for rank in (0, 1, 2, 3):
    print(zz.loc[rank, 'sequence'])

your second question we would launch around f4 to f5 products next year
second question we would launch around f4 to f5 products next year and
second question we would launch around f4 to f5 products next year
question we would launch around f4 to f5 products next year and in


(2) If alpha=0, then the subsequences that are least similar to the already-selected subsequences will be selected

In [23]:
# Manual check for alpha=0: cosine similarity between the best sequence (row 0 of the
# result) and every remaining candidate sequence.
temp_1 = sentence_model.encode(best_sequence)
temp_2 = sentence_model.encode(sequence_list)
temp_3 = util.cos_sim(temp_1, temp_2).numpy()

In [24]:
# Position of the candidate that is least similar to the best sequence.
np.argmin(temp_3[0])

15

In [25]:
# Look up the least-similar candidate via its computed position rather than a number
# copied from the previous cell's output, which goes stale when the inputs change.
sequence_list[int(np.argmax(-temp_3[0, :]))]

'next year and in the fy24 we would have similar plans'

In [26]:
# With in_alpha=0 the query term is multiplied by 0, so each pick is the candidate least
# similar to the sequences selected so far; the first pick matches the manual check above.
mmr(sequence_list, best_sequence, revised_corpus, 0, 3)

0
1
2


['your second question we would launch around f4 to f5 products next year',
 'next year and in the fy24 we would have similar plans',
 'for your second question we would launch around f4 to f5',
 'products next year and in the fy24 we would have similar']