In [1]:
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# from transformers import pipeline
import spacy

from pdfminer.high_level import extract_text
import pdfplumber
import re
import pandas as pd

In [2]:
# For BERT based NER
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

#### pdfminer.six

In [2]:
text = extract_text('../data/AGM_Transcript_Revised.pdf')
# text1 = extract_text('../data/AGM_Transcript_Revised.pdf')

#### pdfplumber: Better results -> Check later

In [31]:
with pdfplumber.open("../data/AGM_Transcript_Revised.pdf") as file:
    text = file.pages[10].extract_text()

In [34]:
# Idea -> Creating tokens as Points of Interest (POI) in the text extracted from the PDF
# and using these tokens to extract Entities of Interest (EOI)
string = 'I now invite Mr. A.V. Mani Sundaram, CLID IN30163741521740.<mask>A.V. Mani Sundaram:'
rege = r'(<mask>(.*)<mask>(.*?):)|(<mask>(.*?):)'

re.findall(rege, string)[0][-1]

'A.V. Mani Sundaram'

#### NOTE: The processed data's accuracy will depend on the quality of pdf extraction 

In [3]:
# Step 1: Creating POI
text = text.replace('\n\n', '<mask>').replace('\n', '<m>').replace('\x0c', '')

In [4]:
# Step 2: Extracting EOI
a = re.findall(r'<mask>(.*?):', text)
a = [str(_.strip()) + ':' for _ in a]

final_list = []
for string in a:
    match_group = re.findall(r'(<mask>(.*)<mask>(.*?):)|(<mask>(.*?):)', string)
    non_empty_group = [ele for inner_group in match_group for ele in inner_group if ele]
    # print(non_empty_group)
    if match_group:
        if len(non_empty_group[0]) < 2:
            pass
        else:
#             print(match_group[0][1])
            final_list.append(non_empty_group[-1])
#     final_list.extend(re.findall(r'<mask><mask>(.*)<mask><mask>(.*?):', string)[-1])

# Approach: Weed out entities based on length of the string
# Assumption: A typical name would not exceed 40 characters
filtered_list = [filtered_element for filtered_element in final_list if len(filtered_element) <= 40]

In [5]:
filtered_list

['MANAGEMENT',
 'BOARD OF DIRECTORS PRESENT',
 'SPECIAL INVITES',
 'Moderator',
 'Shri R. Mohan',
 'V. Ramesh',
 'Shri R. Mohan',
 'Moderator',
 'Moderator',
 'Aspi Bhesania',
 'Moderator',
 'A.V. Mani Sundaram',
 'Moderator',
 'J Abhishek',
 'Moderator',
 'G. Sankaran',
 'Moderator',
 'Santosh Kumar Saraf',
 'Moderator',
 'K.S. Balasubramanian',
 'Moderator',
 'Dr. N. Kamakodi',
 'Shri R. Mohan',
 'K. Vaidyanathan']

In [6]:
# Tentative Approach: Trying to filter out PER entities  (discarded)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

for each_element in filtered_list:
    ner = nlp(each_element.replace('.', ''))
#     print(ner)
# Cons: Does not work as expected; Moderator, Management does not fit under any entity; breaks down single words
# and provides entities (is this expected?)

NameError: name 'pipeline' is not defined

In [7]:
# Tentative Approach: Trying to filter out PER entities  (using spaCy based NEW approach)

# !python -m spacy download en_core_web_trf

import spacy_transformers
nlp_spacy_trf = spacy.load("en_core_web_trf")

In [17]:
ner_list = []
for word_entity in nlp_spacy_trf(" | ".join(filtered_list)).ents:
    name_property = word_entity.text, word_entity.label_
    ner_list.append(name_property)



In [26]:
ner_list

[('Shri R. Mohan', 'PERSON'),
 ('V. Ramesh', 'PERSON'),
 ('Shri R. Mohan', 'PERSON'),
 ('Aspi Bhesania', 'PERSON'),
 ('A.V. Mani Sundaram', 'PERSON'),
 ('J Abhishek', 'PERSON'),
 ('G. Sankaran', 'PERSON'),
 ('Santosh Kumar Saraf', 'PERSON'),
 ('K.S. Balasubramanian', 'PERSON'),
 ('N. Kamakodi', 'PERSON'),
 ('Shri R. Mohan', 'PERSON'),
 ('K. Vaidyanathan', 'PERSON')]

In [122]:
[True for _li in ner_list if "shRi" in _li[0].lower()]

[]

In [111]:
# Step 3: Extracting location of all EOI
entity_span = []
for element in filtered_list:
    for find_result in re.finditer(str(element) + ':', text):
        span = (element, int(find_result.start()), int(find_result.end()))
        entity_span.append(span)
entity_span = list(set(entity_span))
entity_span = sorted(entity_span, key=lambda span_list: span_list[1])
print(entity_span)

[('MANAGEMENT', 77, 88), ('BOARD OF DIRECTORS PRESENT', 222, 249), ('SPECIAL INVITES', 588, 604), ('Moderator', 799, 809), ('Shri R. Mohan', 1504, 1518), ('V. Ramesh', 3540, 3550), ('Shri R. Mohan', 4947, 4961), ('Moderator', 27547, 27557), ('Moderator', 27708, 27718), ('Aspi Bhesania', 27783, 27797), ('Moderator', 30098, 30108), ('A.V. Mani Sundaram', 30179, 30198), ('Moderator', 30729, 30739), ('J Abhishek', 30803, 30814), ('Moderator', 32751, 32761), ('G. Sankaran', 32948, 32960), ('Moderator', 37088, 37098), ('Santosh Kumar Saraf', 37170, 37190), ('Moderator', 37290, 37300), ('K.S. Balasubramanian', 37373, 37394), ('Moderator', 43089, 43099), ('Dr. N. Kamakodi', 43195, 43211), ('Shri R. Mohan', 70303, 70317), ('K. Vaidyanathan', 70483, 70499)]


In [108]:
# Step 4: Extracting relevant information based on the EOI

desired_columns = ["Sr.No.", "Name", "GroupOfSentences"]
information_frame = pd.DataFrame(None, columns=desired_columns)
for sequence, entity_information in enumerate(entity_span):
    start_index = entity_information[1]
    end_index = entity_information[2]
    if entity_information[0].lower() == "management":
        management_string = text[
            end_index: entity_span[sequence+1][1]
        ].replace("<mask>", "|").replace("<m>", "|")
        management_list = management_string.split("|")
        
        frame = pd.DataFrame([[sequence, entity_information[0], management_string]], columns=desired_columns)

    elif ("directors" in entity_information[0].lower()) and ("board" in entity_information[0].lower()):
        director_string = text[
            end_index: entity_span[sequence+1][1]
        ].replace("<mask>", "|").replace("<m>", "|")
        director_list = director_string.split("|")

        frame = pd.DataFrame([[sequence, entity_information[0], director_string]], columns=desired_columns)

    else:
        try:
            relevant_string = text[
                end_index: entity_span[sequence+1][1]
            ].replace("<mask>", " ").replace("<m>", " ")
        except IndexError:
            relevant_string = text[
                end_index:
            ].replace("<mask>", " ").replace("<m>", " ")
    
        frame = pd.DataFrame([[sequence, entity_information[0], relevant_string]], columns=desired_columns)
    
    information_frame = pd.concat([information_frame, frame], axis=0)

In [21]:
# information_frame.to_excel("../data/ashok_leyland_concall_transcript.xlsx", index=False)

In [109]:
# Add the NERs to the frame and add a few known NERs
for index, row in information_frame.iterrows():
#     print(row['Name'], row['Sr.No.'])
    for name_properties in ner_list:
        if name_properties[0] in row["Name"]:
            information_frame.loc[information_frame["Name"] == row["Name"], "entity"] = name_properties[1]
            break
        else:
            pass
information_frame.loc[
                      (information_frame["Name"].str.lower() == "management") |
                      (information_frame["Name"].str.lower().str.contains("moderator")) |
                      (information_frame["Name"].str.lower().str.contains("invites")) | 
                      (information_frame["Name"].str.lower().str.contains("directors")) | 
                      (information_frame["Name"].str.lower().str.contains("board")),
    "entity"] = "PERSON"

In [110]:
information_frame

Unnamed: 0,Sr.No.,Name,GroupOfSentences,entity
0,0,MANAGEMENT,SHRI R. MOHAN – NON-EXECUTIVE CHAIRMAN|DR. N....,PERSON
0,1,BOARD OF DIRECTORS PRESENT,|SMT. ABARNA BHASKAR – INDEPENDENT DIRECTOR MR...,PERSON
0,2,SPECIAL INVITES,"M/S. SUNDARAM & SRINIVASAN, STATUTORY CENTRA...",PERSON
0,3,Moderator,"Dear Shareholders, good morning and a very wa...",PERSON
0,4,Shri R. Mohan,Thank you! Let us commence this AGM with Inau...,PERSON
0,5,V. Ramesh,"Hello. Good Morning. Dear members, you are re...",PERSON
0,6,Shri R. Mohan,"Thank you, Ramesh. CHAIRMAN SPEECH Esteemed ...",PERSON
0,7,Moderator,Thank you very much. We will now begin the qu...,PERSON
0,8,Moderator,I now invite Mr. Aspi Bhesania CLID 120125000...,PERSON
0,9,Aspi Bhesania,"Chairman Sir, I'm from Bombay. Sir Resolution...",PERSON
