In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import spacy
from copy import deepcopy

from pdfminer.high_level import extract_text
# import pdfplumber
import re
import pandas as pd
import itertools
from sentence_transformers import SentenceTransformer, util

In [2]:
classifier = pipeline('sentiment-analysis', model='ProsusAI/finbert')

In [3]:
# For BERT based NER
# tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
# model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

In [4]:
# For spaCy based NER

# !python -m spacy download en_core_web_trf
import spacy_transformers
nlp_spacy_trf = spacy.load("en_core_web_trf")
nlp_spacy_trf.add_pipe("merge_noun_chunks")
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')

<function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [3]:
import en_core_web_trf
nlp_spacy_trf = en_core_web_trf.load()
nlp_spacy_trf.add_pipe("merge_noun_chunks")

<function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [6]:
# sample text 
text = "If you could talk us through, how is the truck profitability looking like, purely from a gross margin perspective, I understand there will be operating effects there." 

# create a spaCy object 
doc = nlp_spacy_trf(text)
a = []
for phrase in doc:
    if phrase.pos_ == "NOUN":
        print("------------")
        print(phrase.text)
        print(phrase.tag_)
        print(phrase.pos_)
        print("------------")
        a.append(phrase.text)
print(", ".join(a))

------------
the truck profitability
NN
NOUN
------------
------------
a gross margin perspective
NN
NOUN
------------
------------
operating effects
NNS
NOUN
------------
the truck profitability, a gross margin perspective, operating effects




#### pdfminer.six

In [4]:
text = extract_text('../data/ashok_leyland_concall_transcript.pdf')
# text = extract_text('../data/AGM_Transcript_Revised.pdf')

#### pdfplumber: Better results in pdf extraction -> Check later

with pdfplumber.open("../data/AGM_Transcript_Revised.pdf") as file:
    text = file.pages[10].extract_text()

In [5]:
# Idea -> Creating tokens as Points of Interest (POI) in the text extracted from the PDF
# and using these tokens to extract Entities of Interest (EOI)
string = 'I now invite Mr. A.V. Mani Sundaram, CLID IN30163741521740.<mask>A.V. Mani Sundaram:'
rege = r'(<mask>(.*)<mask>(.*?):)|(<mask>(.*?):)'

re.findall(rege, string)[0][-1]

'A.V. Mani Sundaram'

In [9]:
sentences = [
    " Thank you very much. We will now begin the question and answer session. We will wait for a moment while the question can be assembled. ",
    " Dear Shareholders, good morning and a very warm welcome to the Annual General Meeting of City   Union   Bank   Limited   for   FY'2021   through   video   conferencing   or   other   audio-visual facility. As a reminder, for the smooth conduct of the meeting, the members will be in the mute mode and audio and video will be opened when they will speak at the AGM as per the pre-registration. Please note that as per the requirements, the proceedings of the annual general meeting will be recorded  and  available on the website  of the Bank.  I now hand over the proceedings to Shri R. Mohan, the Chairman of City Union Bank Limited. Thank you and over ",
    "ask, question and answer",
    ]
sentence_embeddings = model.encode(sentences)

for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

# doc1 = nlp_spacy_trf(u)
# doc2 = nlp_spacy_trf(u'ask, question and answer')
print(util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1]))
print(util.pytorch_cos_sim(sentence_embeddings[1], sentence_embeddings[2]))
print(util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[2]))

Sentence:  Thank you very much. We will now begin the question and answer session. We will wait for a moment while the question can be assembled. 
Embedding: [ 0.06344783  0.11630968  0.576895   -0.05625724 -0.38268468 -0.48887476
  0.01866261  0.22074395  1.7286305  -1.1689817   0.23807026  0.46779093
  0.6454454   0.04822326  0.6396784  -0.4240498  -0.37041032  0.05249115
 -0.37773827 -0.68010116  0.27323407 -0.3261881  -0.31360036  1.0595748
  0.5199989  -0.02574236  0.06939525  0.38551262 -0.05191303  0.8558748
  0.02803756 -0.2245875  -0.35413888  0.756421    0.1604854   0.04266265
  0.7750803  -0.16950706 -0.53763425 -0.20274737 -0.11885174 -0.61751074
  0.58929455 -0.3541158  -0.34435958 -0.22837614 -0.14803869 -0.04544735
 -1.3836113  -0.3095633  -0.8094896   0.27334154  0.16742463 -0.2938665
 -0.53182346 -0.38980478  0.33786342  0.7349423   0.71322507 -0.11272969
 -0.16684145  0.25240716 -0.1544134  -0.90846854  1.1012332  -0.2501768
  0.27942017 -0.02505153 -0.05847352 -0.914

#### NOTE: The processed data's accuracy will depend on the quality of pdf extraction 

In [6]:
# Step 1: Creating POI
text = text.replace('\n\n', '<mask>').replace('\n', '<m>').replace('\x0c', '')

In [7]:
# Step 2: Extracting EOI

all_eoi = re.findall(r'<mask>(.*?):', text)
all_eoi = [str(_.strip()) + ':' for _ in all_eoi]

list_of_entities = []
for entity in all_eoi:
    match_group = re.findall(r'(<mask>(.*)<mask>(.*?):)|(<mask>(.*?):)', entity)
    non_empty_group = [ele for inner_group in match_group for ele in inner_group if ele]
    # print(non_empty_group)
    if match_group:
        if len(non_empty_group[0]) < 2:
            pass
        else:
#             print(match_group[0][1])
            list_of_entities.append(non_empty_group[-1])
#     list_of_entities.extend(re.findall(r'<mask><mask>(.*)<mask><mask>(.*?):', entity)[-1])

# Approach: Preliminary weeding out entities based on length of the entity
# Assumption: A typical name would not exceed 40 characters
filtered_entity_list = [filtered_element for filtered_element in list_of_entities if len(filtered_element) <= 40]
print(filtered_entity_list)


['MANAGEMENT', 'MODERATOR', 'Moderator', 'Jinesh Gandhi', 'Gopal Mahadevan', 'But coming to the Quarterly Results now', 'seeing happening', 'Moderator', 'Kapil Singh', 'Gopal Mahadevan', 'Kapil Singh', 'Gopal Mahadevan', 'Kapil Singh', 'Gopal Mahadevan', 'Moderator', 'Gunjan Pritiyani', 'Gopal Mahadevan', 'Gunjan Pritiyani', 'Gopal Mahadevan', 'Gunjan Pritiyani', 'Gopal Mahadevan', 'Moderator', 'Hitesh Goel', 'Gopal Mahadevan', 'Hitesh Goel', 'Gopal Mahadevan', 'K. M. Balaji', 'Hitesh Goel', 'Gopal Mahadevan', 'Moderator', 'Sachin Trivedi', 'Gopal Mahadevan', 'Sachin Trivedi', 'Gopal Mahadevan', 'Moderator', 'Shyam Sundar Sriram', 'Gopal Mahadevan', 'Shyam Sundar Sriram', 'Gopal Mahadevan', 'K. M. Balaji', 'Gopal Mahadevan', 'K. M. Balaji', 'Gopal Mahadevan', 'Shyam Sundar Sriram', 'Gopal Mahadevan', 'Moderator']


In [10]:
# Tentative Approach: Trying to filter out PER entities  (discarded)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

for each_element in filtered_entity_list:
    ner = nlp(each_element.replace('.', ''))
#     print(ner)
# Cons: Does not work as expected; Moderator, Management does not fit under any entity; breaks down single words
# and provides entities (is this expected?) -> It is expected

NameError: name 'model' is not defined

In [8]:
# Tentative Approach: Trying to filter out PER entities  (using spaCy based NEW approach)
# Selected approach

ner_list = []
for word_entity in nlp_spacy_trf(" | ".join(filtered_entity_list)).ents:
#     print(word_entity.text)
    name_property = word_entity.text, word_entity.label_
    ner_list.append(name_property)
print(ner_list)



[('Jinesh Gandhi', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Kapil Singh', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Kapil Singh', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Kapil Singh', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Gunjan Pritiyani', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Gunjan Pritiyani', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Gunjan Pritiyani', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Hitesh Goel', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Hitesh Goel', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('K. M. Balaji', 'PERSON'), ('Hitesh Goel', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Sachin Trivedi', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Sachin Trivedi', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Shyam Sundar Sriram', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('Shyam Sundar Sriram', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('K. M. Balaji', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('K. M. Balaji', 'PERSON'), ('Gopal Mahadevan', 'PERSON'), ('

In [9]:
# Step 3: Extracting location of all EOI

entity_span = []
for element in filtered_entity_list:
    for find_result in re.finditer(str(element) + ':', text):
        span = (element, int(find_result.start()), int(find_result.end()))
        entity_span.append(span)
entity_span = list(set(entity_span))
entity_span = sorted(entity_span, key=lambda span_list: span_list[1])
print(entity_span)

[('MANAGEMENT', 85, 96), ('MODERATOR', 237, 247), ('Moderator', 337, 347), ('Jinesh Gandhi', 511, 525), ('Gopal Mahadevan', 629, 645), ('But coming to the Quarterly Results now', 2188, 2228), ('seeing happening', 4774, 4791), ('Moderator', 9805, 9815), ('Kapil Singh', 10047, 10059), ('Gopal Mahadevan', 10491, 10507), ('Kapil Singh', 14650, 14662), ('Gopal Mahadevan', 14912, 14928), ('Kapil Singh', 15668, 15680), ('Gopal Mahadevan', 15764, 15780), ('Moderator', 16514, 16524), ('Gunjan Pritiyani', 16645, 16662), ('Gopal Mahadevan', 17089, 17105), ('Gunjan Pritiyani', 21856, 21873), ('Gopal Mahadevan', 22363, 22379), ('Gunjan Pritiyani', 24746, 24763), ('Gopal Mahadevan', 24934, 24950), ('Moderator', 26092, 26102), ('Hitesh Goel', 26203, 26215), ('Gopal Mahadevan', 26456, 26472), ('Hitesh Goel', 27557, 27569), ('Gopal Mahadevan', 27676, 27692), ('K. M. Balaji', 27855, 27868), ('Hitesh Goel', 27932, 27944), ('Gopal Mahadevan', 28229, 28245), ('Moderator', 33334, 33344), ('Sachin Trivedi', 

In [10]:
# Step 4: Filtering out irrelevant EOI

filtered_entity_span = []
for entity_of_interest in entity_span:
    name = entity_of_interest[0].lower()
    for ner_result in ner_list:
        if ("management" in name) or (
            "moderator" in name) or (
            "invites" in name) or (
            "special" in name) or (
            "board" in name) or (
            "director" in name):
            filtered_entity_span.append(entity_of_interest)
            break
        elif (name in ner_result[0].lower()) or (ner_result[0].lower() in name):
            filtered_entity_span.append(entity_of_interest)
            break
        else:
            pass
print(filtered_entity_span)

[('MANAGEMENT', 85, 96), ('MODERATOR', 237, 247), ('Moderator', 337, 347), ('Jinesh Gandhi', 511, 525), ('Gopal Mahadevan', 629, 645), ('Moderator', 9805, 9815), ('Kapil Singh', 10047, 10059), ('Gopal Mahadevan', 10491, 10507), ('Kapil Singh', 14650, 14662), ('Gopal Mahadevan', 14912, 14928), ('Kapil Singh', 15668, 15680), ('Gopal Mahadevan', 15764, 15780), ('Moderator', 16514, 16524), ('Gunjan Pritiyani', 16645, 16662), ('Gopal Mahadevan', 17089, 17105), ('Gunjan Pritiyani', 21856, 21873), ('Gopal Mahadevan', 22363, 22379), ('Gunjan Pritiyani', 24746, 24763), ('Gopal Mahadevan', 24934, 24950), ('Moderator', 26092, 26102), ('Hitesh Goel', 26203, 26215), ('Gopal Mahadevan', 26456, 26472), ('Hitesh Goel', 27557, 27569), ('Gopal Mahadevan', 27676, 27692), ('K. M. Balaji', 27855, 27868), ('Hitesh Goel', 27932, 27944), ('Gopal Mahadevan', 28229, 28245), ('Moderator', 33334, 33344), ('Sachin Trivedi', 33446, 33461), ('Gopal Mahadevan', 33851, 33867), ('Sachin Trivedi', 35743, 35758), ('Gopal

In [11]:
# Step 5: Extracting relevant information based on the EOI

desired_columns = ["Sr.No.", "Name", "GroupOfSentences"]
information_frame = pd.DataFrame(None, columns=desired_columns)
for sequence, entity_information in enumerate(filtered_entity_span):
    start_index = entity_information[1]
    end_index = entity_information[2]
    if entity_information[0].lower() == "management":
        management_string = text[
            end_index: filtered_entity_span[sequence+1][1]
        ].replace("<mask>", "|").replace("<m>", "|")
        management_list = management_string.split("|")
        
        frame = pd.DataFrame([[sequence, entity_information[0], management_string]], columns=desired_columns)

    elif ("directors" in entity_information[0].lower()) and ("board" in entity_information[0].lower()):
        director_string = text[
            end_index: filtered_entity_span[sequence+1][1]
        ].replace("<mask>", "|").replace("<m>", "|")
        director_list = director_string.split("|")

        frame = pd.DataFrame([[sequence, entity_information[0], director_string]], columns=desired_columns)

    else:
        try:
            relevant_string = text[
                end_index: filtered_entity_span[sequence+1][1]
            ].replace("<mask>", " ").replace("<m>", " ")
        except IndexError:
            relevant_string = text[
                end_index:
            ].replace("<mask>", " ").replace("<m>", " ")
    
        frame = pd.DataFrame([[sequence, entity_information[0], relevant_string]], columns=desired_columns)
    
    information_frame = pd.concat([information_frame, frame], axis=0)
    information_frame.reset_index(drop=True, inplace=True)

In [12]:
# information_frame.to_excel("../data/ashok_leyland_concall_transcript.xlsx", index=False)

In [13]:
information_frame

Unnamed: 0,Sr.No.,Name,GroupOfSentences
0,0,MANAGEMENT,MR. GOPAL MAHADEVAN – WHOLE TIME DIRECTOR |& ...
1,1,MODERATOR,MR. JINESH GANDHI– MOTILAL OSWAL FINANCIAL S...
2,2,Moderator,"Ladies and gentlemen, Good day, and..."
3,3,Jinesh Gandhi,"Thank you, Mallika. Good morning, everyone. O..."
4,4,Gopal Mahadevan,"Thank you Jinesh and a very warm, good mornin..."
5,5,Moderator,Thank you very much. We will now begin the qu...
6,6,Kapil Singh,"Yes. So, firstly, I wanted to check we have s..."
7,7,Gopal Mahadevan,"Okay, you have almost covered, on ..."
8,8,Kapil Singh,"Sure sir. So, sir overall, how much is the co..."
9,9,Gopal Mahadevan,"No, I will, see the other expenses there ..."


In [14]:
management_string

' MR. GOPAL MAHADEVAN – WHOLE TIME DIRECTOR |& CHIEF FINANCIAL OFFICER.|MR. K. M. BALAJI – VICE PRESIDENT (CORPORATE |FINANCE).|'

In [15]:
# Step 6: Identify conversations initiated by the Management

management_corpus = management_string.lower()

information_frame["type_of_conversation"] = None

question_count = 0
outlook_count = 0
for i, conversation_initiator in enumerate(information_frame["Name"].values):
    if conversation_initiator.strip().lower() in management_corpus:
        if question_count != 0:
            if outlook_count == 0:
                information_frame.loc[i, "type_of_conversation"] = "Outlook"
                outlook_count += 1
            else:
                information_frame.loc[i, "type_of_conversation"] = "Answer"
        else:
            information_frame.loc[i, "type_of_conversation"] = "Outlook"
            outlook_count += 1
            
    elif (conversation_initiator.strip().lower() != "management") and (
          conversation_initiator.strip().lower() != "moderator") and (
          "special invites" not in conversation_initiator.strip().lower()) and (
          "board" not in conversation_initiator.strip().lower()) and (
          "directors" not in conversation_initiator.strip().lower()):
        information_frame.loc[i, "type_of_conversation"] = "Question"
        question_count += 1

information_frame.loc[:information_frame[information_frame['type_of_conversation'] == "Outlook"].index.values.tolist()[0]-1, "type_of_conversation"] = None        

information_frame

Unnamed: 0,Sr.No.,Name,GroupOfSentences,type_of_conversation
0,0,MANAGEMENT,MR. GOPAL MAHADEVAN – WHOLE TIME DIRECTOR |& ...,
1,1,MODERATOR,MR. JINESH GANDHI– MOTILAL OSWAL FINANCIAL S...,
2,2,Moderator,"Ladies and gentlemen, Good day, and...",
3,3,Jinesh Gandhi,"Thank you, Mallika. Good morning, everyone. O...",
4,4,Gopal Mahadevan,"Thank you Jinesh and a very warm, good mornin...",Outlook
5,5,Moderator,Thank you very much. We will now begin the qu...,
6,6,Kapil Singh,"Yes. So, firstly, I wanted to check we have s...",Question
7,7,Gopal Mahadevan,"Okay, you have almost covered, on ...",Answer
8,8,Kapil Singh,"Sure sir. So, sir overall, how much is the co...",Question
9,9,Gopal Mahadevan,"No, I will, see the other expenses there ...",Answer


In [16]:
# Step 7: Filtering Answers and constructing a corpus

answer_corpus = [''.join(
    information_frame[
        information_frame["type_of_conversation"] == "Answer"
    ]["GroupOfSentences"].values.tolist()
)]

# Splitting the statements on a full stop (.)
answer_list = answer_corpus[0].split(".")
print(len(answer_list))

275


In [17]:
answer_list

[' Okay,   you   have   almost   covered,   on   a   lighter   note   almost   80%   to   82%   of   the   P&L',
 '   Now, basically, I will tell you, why, see the demand has not come in full-fledged',
 ' But on the positive side, we have seen that in the month of April, when things were opened up, we did see a sudden spurt   in   demand',
 '   Pricing   continues   to   be   a   challenge,   because   you   know,   the   industry unfortunately seems to be using pricing as a method of acquiring customers',
 " We Ashok Leyland, will continue to grow its share of business, and that's what is one of the most important, I would say target that it has, but at the same time, we just need to ensure that we are doing it profitably",
 ' We have taken price increases both in the 4 th Quarter and in the 1st Quarter, I think approximately of about 2% each',
 " And the second one is, that's not sufficient for the material cost increase that has happened",
 ' But at the same time, we must understand

In [19]:
# Step 8: Conducting sentiment analysis for each sentence in the answer corpus

sentiment_matrix = pd.DataFrame(None, columns=['Sentence', 'Label', 'Score'])
sentiment_analysis_results = classifier(answer_list)

for idx, per_sentence_sentiment in enumerate(zip(answer_list, sentiment_analysis_results)):
    sentiment_matrix.loc[idx, ['Sentence', 'Label', 'Score']] = (per_sentence_sentiment[0],
                                                                 per_sentence_sentiment[1]['label'], 
                                                                 per_sentence_sentiment[1]['score'])

In [20]:
sentiment_matrix

Unnamed: 0,Sentence,Label,Score
0,"Okay, you have almost covered, on ...",neutral,0.631744
1,"Now, basically, I will tell you, why, see t...",neutral,0.799171
2,"But on the positive side, we have seen that i...",positive,0.899589
3,Pricing continues to be a challen...,negative,0.877691
4,"We Ashok Leyland, will continue to grow its s...",positive,0.768902
...,...,...,...
270,"Again, thank you very much for the interest i...",positive,0.527386
271,I think there is a lot of interest that we ha...,positive,0.679218
272,And all we can say at the moment from managem...,positive,0.906389
273,And with those words I will now hand it over ...,neutral,0.932606


In [27]:
text = " We Ashok Leyland, will continue to grow its share of business, and that's what is one of the most important, I would say target that it has, but at the same time, we just need to ensure that we are doing it profitably"
doc = nlp_spacy_trf(text)

for token in doc:
    if token.pos_ == "NUM":
        print(f"{token.text} -> {token.pos_}")

one -> NUM


In [21]:
def number_present(sentence):
    doc = nlp_spacy_trf(sentence)
    for token in doc:
        if (token.pos_ == "NUM") or (token.pos_ == "DATE"):
            return 1
        else:
            continue
    return 0

In [23]:
sentiment_matrix["Num_present"] = None
sentiment_matrix["Relevance"] = None

for _index in sentiment_matrix.index.tolist():
    num_present = number_present(sentiment_matrix.loc[_index, "Sentence"])
    sentiment_matrix.loc[_index, "Num_present"] = num_present

sentiment_matrix.loc[(sentiment_matrix["Score"] >= 0.8) | 
                     (sentiment_matrix["Num_present"] == 1), "Relevance"] = "Yes"

# temp_sentence_list = []
# corpus_list = []
# index_list = []
# for _index in sentiment_matrix.index.tolist():
#     if sentiment_matrix.loc[_index, "Relevance"] == "Yes":
#         if sentiment_matrix.loc[_index, "Label"] == "positive":
#             temp_sentence_list.append(answer_list[_index])
#             if index_list[-1] 
#             index_list.append(1)
#         elif sentiment_matrix.loc[_index, "Label"] == "negative":
#     elif sentiment_matrix.loc[_index, "Label"] == "negative":
#         negative_corpus.append(answer_list[_index])
#     else:
#         neutral_corpus.append(answer_list[_index])

In [25]:
sentiment_matrix[sentiment_matrix["Relevance"] == "Yes"].loc[4, 'Sentence']

" We Ashok Leyland, will continue to grow its share of business, and that's what is one of the most important, I would say target that it has, but at the same time, we just need to ensure that we are doing it profitably"

In [39]:
# Step 9: Segregating statements by their sentiment >= 0.8

positive_corpus = []
neutral_corpus = []
negative_corpus = []

filtered_sentiment_matrix = sentiment_matrix[sentiment_matrix["Score"] >= 0.8]

for _index in filtered_sentiment_matrix.index.tolist():
    if sentiment_matrix.loc[_index, "Label"] == "positive":
        positive_corpus.append(answer_list[_index])
    elif sentiment_matrix.loc[_index, "Label"] == "negative":
        negative_corpus.append(answer_list[_index])
    else:
        neutral_corpus.append(answer_list[_index])

relevant_sentence_pool = [positive_corpus, neutral_corpus, negative_corpus]
strong_sentences = pd.DataFrame(
    (each_corpus for each_corpus in itertools.zip_longest(*relevant_sentence_pool)),
    columns=['Positive', 'Neutral', 'Negative'])

strong_sentences

Unnamed: 0,Positive,Neutral,Negative
0,865 crores for the financial year '19-20 which...,"Esteemed shareholders of our Bank, ...",But whenever there is a decreasing interest r...
1,And also congratulated on 4% net interest ...,Answering the questions from the shareholders...,"Yes, it was a very difficult year"
2,And congratulated for performance i...,Each and every shareholders and every stakeho...,"As chairman also mentioned, since you had mor..."
3,And also congratulated for the first quar...,And I would like to give my answers and expla...,Saraf spoke about RBI penalty on cash deposits
4,"So, thanks for your concern for our ...",So we had Mr,"He also asked, “Any salary cut or..."
...,...,...,...
150,,D,
151,,and CEO for his response,
152,,D,
153,,and CEO for his replies,


# -------------------------------------------------------------------------------------------

In [15]:
# Idea: Try to retain context.
# 1. Parse through the content and bin Questions and their supposed answers (Questions and all answers 
# following the questions.)
# 2. For each question -> Perform sentiment analysis on the answer corpus.
# 3. Select the top sentiment sentences > 0.8
# 4. Cluster sentences based on the sentiment (positive - positive - neutral - positive)
# 5. Cluster sentences based on the sentiment (negative - negative - neutral - negative)

In [20]:
outlook_indexes = information_frame[information_frame['type_of_conversation'] == "Outlook"].index.tolist()
question_indexes = information_frame[information_frame['type_of_conversation'] == "Question"].index.tolist()
answer_indexes = information_frame[information_frame['type_of_conversation'] == "Answer"].index.tolist()
indexes_to_exclude = information_frame[information_frame['type_of_conversation'].isnull()].index.tolist()

In [21]:
question_indexes

[9, 11, 13, 15, 17, 19, 23]

In [22]:
# Create groups of Question and answers
qna_groups = []
answer_added = 0  # Counter for answer added into each section
grouping = []
for i, question_index in enumerate(question_indexes):
    # For concall summaries having a format of Questions and then the answers, 
    # we group all the questions and answers in two groups, respectively.
    try:
        if len(grouping) >= 1:  # Will only be non-empty if answers are not yet added
            grouping.append({"type": "question",
                             "index": question_index,
                             "paragraph": information_frame.loc[question_index]['GroupOfSentences'].strip()
                            })
        else:
            grouping = [{"type": "question",
                         "index": question_index,
                         "paragraph": information_frame.loc[question_index]['GroupOfSentences'].strip()
                        }]
    except (KeyError, NameError):
        grouping = [{"type": "question",
                     "index": question_index,
                     "paragraph": information_frame.loc[question_index]['GroupOfSentences'].strip()
                    }]
    for j, answer_index in enumerate(answer_indexes):
        try:
            if (answer_index > question_index) and (answer_index < question_indexes[i+1]):
                grouping.append({"type": "answer", 
                                 "index": answer_index,
                                 "paragraph": information_frame.loc[answer_index]['GroupOfSentences'].strip()
                                })
                answer_added += 1
        except IndexError:
            if answer_index > question_index:
                grouping.append({"type": "answer", 
                                 "index": answer_index,
                                 "paragraph": information_frame.loc[answer_index]['GroupOfSentences'].strip()
                                })
                answer_added += 1
    if (len(grouping) > 1) and (answer_added > 0):
        qna_groups.append(grouping)
        grouping = []

#         print(qna_groups)

In [22]:
a = list(filter(lambda group: group["type"] == "question", qna_groups[0]))
aa = []
for each_q in a:
    aa.append(each_q.get("paragraph"))
aa
for para in aa:
    doc = nlp_spacy_trf(para)
    for token in doc:
        
#         if token.pos_ == "NOUN":
        print(f"{token.text} -> {token.pos_} -> {token.ent_type_}")



Chairman -> PROPN -> 
Sir -> PROPN -> 
, -> PUNCT -> 
I -> PRON -> 
'm -> VERB -> 
from -> ADP -> 
Bombay -> PROPN -> GPE
. -> PUNCT -> 
Sir Resolution No.8 -> NOUN -> LAW
is -> AUX -> 
the QIP -> NOUN -> 
of -> ADP -> 
Rs.500 crores -> NOUN -> MONEY
, -> PUNCT -> 
I -> PRON -> 
request -> VERB -> 
you -> PRON -> 
to -> PART -> 
come -> VERB -> 
with -> ADP -> 
the rights issue -> NOUN -> 
rather -> ADV -> 
than -> SCONJ -> 
a QIP -> NOUN -> 
. -> PUNCT -> 
The -> DET -> 
dividend -> NOUN -> 
of -> ADP -> 
50 paisa -> NOUN -> MONEY
, -> PUNCT -> 
that -> ADV -> 
is -> ADV -> 
50 -> NUM -> PERCENT
% -> PUNCT -> PERCENT
, -> PUNCT -> 
I -> PRON -> 
would -> AUX -> 
like -> VERB -> 
to -> PART -> 
know -> VERB -> 
whether -> SCONJ -> 
you -> PRON -> 
prefer -> VERB -> 
YES Bank model -> NOUN -> 
where -> ADV -> 
to -> PART -> 
give -> VERB -> 
high dividend -> NOUN -> 
and -> CCONJ -> 
QIP -> NOUN -> 
regularly -> ADV -> 
and -> CCONJ -> 
one -> NUM -> DATE
fine -> ADJ -> DATE
day -> NOUN

I -> PRON -> 
am -> AUX -> 
Abhishek -> PROPN -> PERSON
, -> PUNCT -> 
shareholder -> NOUN -> 
of -> ADP -> 
the company -> NOUN -> 
. -> PUNCT -> 
My DP ID -> PROPN -> 
: -> NOUN -> 
IN30163741359155 -> NUM -> 
. -> PUNCT -> 
Congratulations -> NOUN -> 
management -> NOUN -> 
on -> ADP -> 
the eve -> NOUN -> 
of -> ADP -> 
annual general body meeting -> NOUN -> 
. -> PUNCT -> 
  -> SPACE -> 
Sir -> NOUN -> 
, -> PUNCT -> 
I -> PRON -> 
would -> AUX -> 
like -> VERB -> 
to -> PART -> 
know -> VERB -> 
how -> ADV -> 
is -> AUX -> 
our Bank -> PROPN -> 
   -> SPACE -> 
impacted -> VERB -> 
   -> SPACE -> 
in -> ADP -> 
   -> SPACE -> 
these   past   two   years -> NOUN -> DATE
   -> SPACE -> 
of -> ADP -> 
  COVID   time -> NOUN -> 
. -> PUNCT -> 
   -> SPACE -> 
After -> ADP -> 
   -> SPACE -> 
this -> DET -> 
   -> SPACE -> 
Corona -> PROPN -> 
   -> SPACE -> 
Virus -> PROPN -> 
   -> SPACE -> 
and -> CCONJ -> 
subsequent -> ADJ -> 
lock -> NOUN -> 
down -> NOUN -> 
there -> PRON -> 
i

I -> PRON -> 
congratulate -> VERB -> 
the management -> NOUN -> 
for -> ADP -> 
the satisfactory performance -> NOUN -> 
for -> ADP -> 
the year -> NOUN -> DATE
2021 -> NUM -> DATE
. -> PUNCT -> 
All the City Union Bank Limited -> PROPN -> ORG
August -> PROPN -> DATE
19 -> NUM -> DATE
, -> PUNCT -> DATE
2021 -> NUM -> DATE
them -> PRON -> 
come -> VERB -> 
to -> ADP -> 
office -> NOUN -> 
and -> CCONJ -> 
work -> VERB -> 
. -> PUNCT -> 
Because -> SCONJ -> 
in -> ADP -> 
Kerala -> PROPN -> GPE
, -> PUNCT -> 
the situation -> NOUN -> 
is -> AUX -> 
very -> ADV -> 
bad -> ADJ -> 
. -> PUNCT -> 
In -> ADP -> 
India -> PROPN -> GPE
, -> PUNCT -> 
50% patients -> NOUN -> 
come -> VERB -> 
from -> ADP -> 
Kerala -> PROPN -> GPE
. -> PUNCT -> 
I -> PRON -> 
hope -> VERB -> 
if -> SCONJ -> 
they -> PRON -> 
come -> VERB -> 
without -> ADP -> 
vaccine -> NOUN -> 
, -> PUNCT -> 
the others -> NOUN -> 
will -> AUX -> 
be -> AUX -> 
affected -> VERB -> 
. -> PUNCT -> 
I -> PRON -> 
request -> VER

In [36]:
%run document_summary.ipynb

In [37]:
for each_group in qna_groups:
    ExtractSummary(paragraph_meta=each_group, 
                   nlp_lang_model=nlp_spacy_trf, 
                   sentiment_model=classifier, 
                   question_answer=True)

  COVID   time
the notice
channels
my team
net worth
various places
shares
the year
the   expectation
fresh water consumption
more slippages
Operational efficiency
either business organization
education
today
our chairman
today’s meeting
5,040 employees
the shareholders money
the early recovery
the cost cutting initiatives
8% growth
our offices
25% growth
a special session
oxygen concentrators
each and every account
prevailing market rate
the ECLGS scheme
non
the   family members
the interest rate
the residential outs
account
colleagues
return
net NPA
the provision coverage ratio
day
this opportunity
the need-based manner
4%
micro, small and medium enterprises
50% patients
all the accounts
fear
solar panel
our Bank
shareholder
the repayment
critical circumstances
you long life
the donation
the long-term
the   Bank   loan
MSME sector
the coming quarters
essentials
15%
asset
agriculture sector
1 crore
my name
the impact
physical
 One Mrs
equity
any employee
corona period
retailing   busi

In [29]:
qna_display = deepcopy(qna_groups)
for each_dict in qna_display:
    print(each_dict)
    each_dict.pop("paragraph")

print(qna_display)

[{'type': 'question', 'index': 9, 'paragraph': "Chairman Sir, I'm from Bombay. Sir Resolution No.8 is the QIP of Rs.500 crores, I request you to come with the rights issue rather than a QIP. The dividend of 50 paisa, that is 50%, I would like to know whether you prefer YES Bank model where to give high dividend and QIP regularly and one fine day everybody knows what happened. Another model is the Kotak Bank which gives less than one rupee dividend on Rs.1,700 share price. Rarely does a QIP and grow with its own funds. Pages 4 of annual report, in each of the five years, deposits are more than advances. If more deposits are coming, reduce the interest rate. Deposit grew by 9% where the advances grew by 7%. During the year Rs.41.7 lakhs of dividend and 14,709 shares were transferred   to   IEPF.   It's   the   shareholders   money.   You   should   try   to   identify   the   genuine shareholder. Even if you have to send someone from the nearest branch on 28th September next month Rs.64 

TypeError: 'str' object cannot be interpreted as an integer

In [20]:
list(filter(lambda group: group["type"] == "question", qna_groups[0]))

[{'type': 'question',
  'index': 9,
  'paragraph': "Chairman Sir, I'm from Bombay. Sir Resolution No.8 is the QIP of Rs.500 crores, I request you to come with the rights issue rather than a QIP. The dividend of 50 paisa, that is 50%, I would like to know whether you prefer YES Bank model where to give high dividend and QIP regularly and one fine day everybody knows what happened. Another model is the Kotak Bank which gives less than one rupee dividend on Rs.1,700 share price. Rarely does a QIP and grow with its own funds. Pages 4 of annual report, in each of the five years, deposits are more than advances. If more deposits are coming, reduce the interest rate. Deposit grew by 9% where the advances grew by 7%. During the year Rs.41.7 lakhs of dividend and 14,709 shares were transferred   to   IEPF.   It's   the   shareholders   money.   You   should   try   to   identify   the   genuine shareholder. Even if you have to send someone from the nearest branch on 28th September next month Rs

In [None]:
summary_matrix = pd.DataFrame(None, columns=['Question', 'Answer'])
summaries = []
for i, each_group in enumerate(groups):
    summary_matrix.loc[i, 'Question'] = question_indexes[i]
    summary_matrix.loc[i, 'Answer'] = answer_indexes[]
    