In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import spacy

from pdfminer.high_level import extract_text
# import pdfplumber
import re
import pandas as pd
import itertools

In [2]:
# For BERT based NER
# The tokenizer and the token-classification model come from the same
# pretrained checkpoint, so the checkpoint name is spelled out only once.
bert_ner_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(bert_ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(bert_ner_checkpoint)

In [3]:
# For spaCy based NER
# Loads the "en_core_web_trf" spaCy pipeline into `nlp_spacy_trf`; a later cell
# runs it over the candidate speaker names to obtain named entities.

# One-time setup (run manually if the model is not installed):
# !python -m spacy download en_core_web_trf
# NOTE(review): spacy_transformers is never referenced by name in this cell —
# presumably it is imported for its side effects before spacy.load(); confirm.
import spacy_transformers
nlp_spacy_trf = spacy.load("en_core_web_trf")

#### pdfminer.six

In [4]:
# Extract the text of the concall transcript PDF with pdfminer's high-level
# extract_text helper. `text` is the input that the later steps transform
# and search.
text = extract_text('../data/ashok_leyland_concall_transcript.pdf')
# text1 = extract_text('../data/AGM_Transcript_Revised.pdf')

#### pdfplumber: Better results in pdf extraction -> Check later

# NOTE: this snippet needs `import pdfplumber`, which is currently commented out
# in the import cell. When run, it overwrites `text` with the text of a single
# page (index 10) of the AGM transcript.
with pdfplumber.open("../data/AGM_Transcript_Revised.pdf") as file:
    text = file.pages[10].extract_text()

In [5]:
# Proof of concept: breaks in the text extracted from the PDF are marked with
# "<mask>" tokens (Points of Interest, POI). The text between a "<mask>" and
# the following ":" is then read off as an Entity of Interest (EOI).
string = 'I now invite Mr. A.V. Mani Sundaram, CLID IN30163741521740.<mask>A.V. Mani Sundaram:'
rege = r'(<mask>(.*)<mask>(.*?):)|(<mask>(.*?):)'

# findall yields one tuple of capture groups per match; the last group of the
# first match holds the speaker name.
first_match_groups = re.findall(rege, string)[0]
first_match_groups[-1]

'A.V. Mani Sundaram'

#### NOTE: The processed data's accuracy will depend on the quality of pdf extraction 

In [6]:
# Step 1: Creating POI
# Replacements are applied in order: double newlines first (so they are not
# consumed by the single-newline rule), then the remaining single newlines,
# and finally form-feed characters are dropped.
for raw_token, poi_marker in (('\n\n', '<mask>'), ('\n', '<m>'), ('\x0c', '')):
    text = text.replace(raw_token, poi_marker)

In [7]:
# Step 2: Extracting EOI
# 1. Grab every lazily-matched "<mask>...:" stretch of the text.
# 2. Within each stretch, keep the last non-empty capture group of the
#    two-alternative pattern as the candidate entity.
# 3. Drop candidates that are too long to be a name.

eoi_pattern = re.compile(r'(<mask>(.*)<mask>(.*?):)|(<mask>(.*?):)')

all_eoi = [
    str(raw_candidate.strip()) + ':'
    for raw_candidate in re.findall(r'<mask>(.*?):', text)
]

list_of_entities = []
for entity in all_eoi:
    match_group = eoi_pattern.findall(entity)
    # Flatten the per-match group tuples and discard the empty groups.
    non_empty_group = [
        captured
        for group_tuple in match_group
        for captured in group_tuple
        if captured
    ]
    if not match_group:
        continue
    if len(non_empty_group[0]) >= 2:
        list_of_entities.append(non_empty_group[-1])

# Approach: Preliminary weeding out entities based on length of the entity
# Assumption: A typical name would not exceed 40 characters
filtered_entity_list = [
    candidate for candidate in list_of_entities if len(candidate) <= 40
]
print(filtered_entity_list)

['MANAGEMENT', 'MODERATOR', 'Moderator', 'Jinesh Gandhi', 'Gopal Mahadevan', 'But coming to the Quarterly Results now', 'seeing happening', 'Moderator', 'Kapil Singh', 'Gopal Mahadevan', 'Kapil Singh', 'Gopal Mahadevan', 'Kapil Singh', 'Gopal Mahadevan', 'Moderator', 'Gunjan Pritiyani', 'Gopal Mahadevan', 'Gunjan Pritiyani', 'Gopal Mahadevan', 'Gunjan Pritiyani', 'Gopal Mahadevan', 'Moderator', 'Hitesh Goel', 'Gopal Mahadevan', 'Hitesh Goel', 'Gopal Mahadevan', 'K. M. Balaji', 'Hitesh Goel', 'Gopal Mahadevan', 'Moderator', 'Sachin Trivedi', 'Gopal Mahadevan', 'Sachin Trivedi', 'Gopal Mahadevan', 'Moderator', 'Shyam Sundar Sriram', 'Gopal Mahadevan', 'Shyam Sundar Sriram', 'Gopal Mahadevan', 'K. M. Balaji', 'Gopal Mahadevan', 'K. M. Balaji', 'Gopal Mahadevan', 'Shyam Sundar Sriram', 'Gopal Mahadevan', 'Moderator']


In [11]:
# Tentative approach (discarded): run the BERT NER pipeline over each candidate
# in order to keep only PER entities.
# Cons: does not work as expected — "Moderator" / "Management" do not fit under
# any entity, and single words get broken down and tagged piecewise
# (is this expected? -> it is expected).

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

for candidate in filtered_entity_list:
    # Full stops are removed before tagging (e.g. initials such as "K. M.").
    dotless_candidate = candidate.replace('.', '')
    ner = nlp(dotless_candidate)

In [8]:
# Selected approach: tag the candidates with the spaCy transformer pipeline.
# All candidates are joined into one " | "-separated string and tagged in a
# single pass; every recognised entity is stored as a (text, label) pair.

joined_candidates = " | ".join(filtered_entity_list)
ner_list = [
    (recognised.text, recognised.label_)
    for recognised in nlp_spacy_trf(joined_candidates).ents
]



In [9]:
# Step 3: Extracting location of all EOI
# For every candidate name, record each "<name>:" occurrence in `text` as a
# (name, start, end) tuple, then order the occurrences by start offset.

entity_span = []
# dict.fromkeys de-duplicates while keeping first-seen order, so a name that
# occurs many times in filtered_entity_list is searched for only once and no
# duplicate tuples are produced.
for element in dict.fromkeys(filtered_entity_list):
    # re.escape: the name is literal text. Without it, characters such as "."
    # (e.g. in "K. M. Balaji") act as regex metacharacters and can match
    # unrelated text, and characters like "(" or "?" can raise re.error.
    speaker_pattern = re.compile(re.escape(str(element)) + ':')
    for find_result in speaker_pattern.finditer(text):
        entity_span.append((element, find_result.start(), find_result.end()))
entity_span = sorted(entity_span, key=lambda span_list: span_list[1])
# print(entity_span)

In [10]:
# Step 4: Filtering out irrelevant EOI
# A span is kept when its (lower-cased) name either
#   * contains one of the role/heading keywords below, or
#   * is a substring of the text of some entity recognised by the NER step.

role_keywords = ("management", "moderator", "invites", "special", "board", "director")
recognised_entity_texts = [ner_result[0].lower() for ner_result in ner_list]

filtered_entity_span = []
for entity_of_interest in entity_span:
    name = entity_of_interest[0].lower()
    # The keyword test does not depend on the NER results, so it is evaluated
    # on its own; previously it lived inside the loop over ner_list and was
    # silently skipped whenever ner_list was empty.
    is_role_heading = any(keyword in name for keyword in role_keywords)
    is_recognised = any(name in entity_text for entity_text in recognised_entity_texts)
    if is_role_heading or is_recognised:
        filtered_entity_span.append(entity_of_interest)
print(filtered_entity_span)

[('MANAGEMENT', 85, 96), ('MODERATOR', 237, 247), ('Moderator', 337, 347), ('Jinesh Gandhi', 511, 525), ('Gopal Mahadevan', 629, 645), ('Moderator', 9805, 9815), ('Kapil Singh', 10047, 10059), ('Gopal Mahadevan', 10491, 10507), ('Kapil Singh', 14650, 14662), ('Gopal Mahadevan', 14912, 14928), ('Kapil Singh', 15668, 15680), ('Gopal Mahadevan', 15764, 15780), ('Moderator', 16514, 16524), ('Gunjan Pritiyani', 16645, 16662), ('Gopal Mahadevan', 17089, 17105), ('Gunjan Pritiyani', 21856, 21873), ('Gopal Mahadevan', 22363, 22379), ('Gunjan Pritiyani', 24746, 24763), ('Gopal Mahadevan', 24934, 24950), ('Moderator', 26092, 26102), ('Hitesh Goel', 26203, 26215), ('Gopal Mahadevan', 26456, 26472), ('Hitesh Goel', 27557, 27569), ('Gopal Mahadevan', 27676, 27692), ('K. M. Balaji', 27855, 27868), ('Hitesh Goel', 27932, 27944), ('Gopal Mahadevan', 28229, 28245), ('Moderator', 33334, 33344), ('Sachin Trivedi', 33446, 33461), ('Gopal Mahadevan', 33851, 33867), ('Sachin Trivedi', 35743, 35758), ('Gopal

In [11]:
# Step 5: Extracting relevant information based on the EOI
# For each retained span, the text between the end of its "<name>:" marker and
# the start of the next retained span (or the end of `text` for the last span)
# is stored as that speaker's group of sentences.
#   * "management" and "directors ... board" headings: POI markers become "|"
#     so the listing can be split into individual entries.
#   * every other speaker: POI markers become spaces.

desired_columns = ["Sr.No.", "Name", "GroupOfSentences"]
frame_records = []
for sequence, entity_information in enumerate(filtered_entity_span):
    speaker = entity_information[0]
    speaker_lower = speaker.lower()
    end_index = entity_information[2]

    # Upper bound of this speaker's text. None slices to the end of `text`, so
    # the last span no longer raises IndexError in ANY branch (previously only
    # the generic branch was protected).
    if sequence + 1 < len(filtered_entity_span):
        next_start = filtered_entity_span[sequence + 1][1]
    else:
        next_start = None
    raw_segment = text[end_index:next_start]

    if speaker_lower == "management":
        management_string = raw_segment.replace("<mask>", "|").replace("<m>", "|")
        management_list = management_string.split("|")
        segment = management_string

    elif ("directors" in speaker_lower) and ("board" in speaker_lower):
        director_string = raw_segment.replace("<mask>", "|").replace("<m>", "|")
        director_list = director_string.split("|")
        segment = director_string

    else:
        relevant_string = raw_segment.replace("<mask>", " ").replace("<m>", " ")
        segment = relevant_string

    frame_records.append([sequence, speaker, segment])

# Build the frame once from the collected rows instead of concatenating a
# one-row frame per iteration (quadratic) and resetting the index in place.
information_frame = pd.DataFrame(frame_records, columns=desired_columns)

In [15]:
# information_frame.to_excel("../data/ashok_leyland_concall_transcript.xlsx", index=False)

In [12]:
information_frame

Unnamed: 0,Sr.No.,Name,GroupOfSentences
0,0,MANAGEMENT,MR. GOPAL MAHADEVAN – WHOLE TIME DIRECTOR |& ...
1,1,MODERATOR,MR. JINESH GANDHI– MOTILAL OSWAL FINANCIAL S...
2,2,Moderator,"Ladies and gentlemen, Good day, and..."
3,3,Jinesh Gandhi,"Thank you, Mallika. Good morning, everyone. O..."
4,4,Gopal Mahadevan,"Thank you Jinesh and a very warm, good mornin..."
5,5,Moderator,Thank you very much. We will now begin the qu...
6,6,Kapil Singh,"Yes. So, firstly, I wanted to check we have s..."
7,7,Gopal Mahadevan,"Okay, you have almost covered, on ..."
8,8,Kapil Singh,"Sure sir. So, sir overall, how much is the co..."
9,9,Gopal Mahadevan,"No, I will, see the other expenses there ..."


In [13]:
management_string

' MR. GOPAL MAHADEVAN – WHOLE TIME DIRECTOR |& CHIEF FINANCIAL OFFICER.|MR. K. M. BALAJI – VICE PRESIDENT (CORPORATE |FINANCE).|'

In [14]:
# Step 6: Identify conversations initiated by the Management
# A speaker whose normalised name occurs inside the management listing is
# labelled "Answer"; any other speaker except the "management" / "moderator"
# headings is labelled "Question"; those headings keep None.

management_corpus = management_string.lower()
unlabelled_headings = ("management", "moderator")

information_frame["type_of_conversation"] = None

for row_position, raw_name in enumerate(information_frame["Name"].values):
    speaker = raw_name.strip().lower()
    if speaker in management_corpus:
        conversation_label = "Answer"
    elif speaker not in unlabelled_headings:
        conversation_label = "Question"
    else:
        continue
    information_frame.loc[row_position, "type_of_conversation"] = conversation_label

information_frame

Unnamed: 0,Sr.No.,Name,GroupOfSentences,type_of_conversation
0,0,MANAGEMENT,MR. GOPAL MAHADEVAN – WHOLE TIME DIRECTOR |& ...,
1,1,MODERATOR,MR. JINESH GANDHI– MOTILAL OSWAL FINANCIAL S...,
2,2,Moderator,"Ladies and gentlemen, Good day, and...",
3,3,Jinesh Gandhi,"Thank you, Mallika. Good morning, everyone. O...",Question
4,4,Gopal Mahadevan,"Thank you Jinesh and a very warm, good mornin...",Answer
5,5,Moderator,Thank you very much. We will now begin the qu...,
6,6,Kapil Singh,"Yes. So, firstly, I wanted to check we have s...",Question
7,7,Gopal Mahadevan,"Okay, you have almost covered, on ...",Answer
8,8,Kapil Singh,"Sure sir. So, sir overall, how much is the co...",Question
9,9,Gopal Mahadevan,"No, I will, see the other expenses there ...",Answer


In [19]:
# Step 7: Filtering Answers and constructing a corpus
# All "Answer" rows are concatenated (no separator) into a single string that
# is kept as the only element of answer_corpus.

is_answer = information_frame["type_of_conversation"] == "Answer"
answer_texts = information_frame.loc[is_answer, "GroupOfSentences"].values.tolist()
answer_corpus = [''.join(answer_texts)]

# Splitting the statements on a full stop (.)
# NOTE: this is a plain character split, so every "." is a boundary — including
# the ones inside abbreviations and initials.
answer_list = answer_corpus[0].split(".")
print(len(answer_list))

364


In [20]:
# Step 8: Conducting sentiment analysis for each sentence in the answer corpus

# Build a 'sentiment-analysis' pipeline backed by the 'ProsusAI/finbert' model.
classifier = pipeline('sentiment-analysis', model='ProsusAI/finbert')

In [21]:
# Classify every answer fragment and tabulate the label and score per fragment.
# Row i of sentiment_matrix corresponds to answer_list[i].
sentiment_analysis_results = classifier(answer_list)

# Build the frame in one shot instead of enlarging it one .loc row at a time;
# besides being faster, this gives "Score" a numeric dtype rather than object.
sentiment_matrix = pd.DataFrame(
    [
        (per_sentence_sentiment['label'], per_sentence_sentiment['score'])
        for per_sentence_sentiment in sentiment_analysis_results
    ],
    columns=['Label', 'Score'],
)


In [24]:
# Step 9: Segregating statements whose sentiment score is >= 0.8 by label

positive_corpus, neutral_corpus, negative_corpus = [], [], []
# Any label other than "positive" / "negative" falls through to neutral.
corpus_by_label = {"positive": positive_corpus, "negative": negative_corpus}

filtered_sentiment_matrix = sentiment_matrix[sentiment_matrix["Score"] >= 0.8]

# The row label of the matrix doubles as the position in answer_list.
for _index, sentence_label in filtered_sentiment_matrix["Label"].items():
    corpus_by_label.get(sentence_label, neutral_corpus).append(answer_list[_index])

# Lay the three corpora side by side; shorter columns are padded with None.
relevant_sentence_pool = [positive_corpus, neutral_corpus, negative_corpus]
padded_rows = list(itertools.zip_longest(*relevant_sentence_pool))
strong_sentences = pd.DataFrame(padded_rows, columns=['Positive', 'Neutral', 'Negative'])

strong_sentences

Unnamed: 0,Positive,Neutral,Negative
0,"On the positive side, again, we're seeing rea...","As a remainder, all participant lines ...","But then suddenly, we had the lockdown again,..."
1,"But again, on the positive side, we believe t...",And there will be an opportunity for you to a...,Pricing continues to be a challen...
2, The second bit is on the Light Commercial V...,Should you need assistance during the confere...,I've been mentioning that over the last few q...
3,"But we, at the moment, believe that with the ...",Please note that this conference is being rec...,"At the same time, what we have been doing is ..."
4,"So, LCV business has been doing well it has b...",I now hand the conference over to Mr,"So, practically if you look at it schools wer..."
...,...,...,...
187,,"Yes, because at the level of volume, it is b...",
188,,"See, understand one thing, inflation Ashok Le...",
189,,"So, there are a multiple ways to do this",
190,,"So, if I may take that as the last question",


# -------------------------------------------------------------------------------------------

In [15]:
# Idea: Try to retain context.
# 1. Parse through the content and bin Questions and their supposed answers (Questions and all answers 
# following the questions.)
# 2. For each question -> Perform sentiment analysis on the answer corpus.
# 3. Select the sentences whose sentiment score exceeds 0.8
# 4. Cluster sentences based on the sentiment (positive - positive - neutral - positive)
# 5. Cluster sentences based on the sentiment (negative - negative - neutral - negative)

In [30]:
# Row labels of information_frame split by conversation type; rows with no
# type assigned are collected separately.
conversation_type = information_frame['type_of_conversation']
question_indexes = conversation_type[conversation_type == "Question"].index.tolist()
answer_indexes = conversation_type[conversation_type == "Answer"].index.tolist()
indexes_to_exclude = conversation_type[conversation_type.isnull()].index.tolist()

In [None]:
# question_index, answer_index (lower than next question index)

In [35]:
# Create groups of Question and answers
# Each group starts with a question index, followed by every answer index that
# lies after it and before the next question index (no upper limit for the
# last question).
groups = []
for position, question_index in enumerate(question_indexes):
    has_next_question = position + 1 < len(question_indexes)
    upper_bound = question_indexes[position + 1] if has_next_question else None
    following_answers = [
        answer_index
        for answer_index in answer_indexes
        if answer_index > question_index
        and (upper_bound is None or answer_index < upper_bound)
    ]
    groups.append([question_index] + following_answers)
print(groups)

[[3, 4], [6, 7], [8, 9], [10, 11], [13, 14], [15, 16], [17, 18], [20, 21], [22, 23, 24], [25, 26], [28, 29], [30, 31], [33, 34], [35, 36, 37, 38, 39, 40], [41, 42]]


In [None]:
summary_matrix = pd.DataFrame(None, columns=['Question', 'Answer'])
summaries = []
for i, each_group in enumerate(groups):
    summary_matrix.loc[i, 'Question'] = summaries