In [171]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

from pdfminer.high_level import extract_text
import re
import pandas as pd

In [None]:
# Load the pretrained BERT NER checkpoint: one tokenizer and one
# token-classification model, both from the same checkpoint name.
ner_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)

In [2]:
# Extract the raw text of both source PDFs (same order as the names on the left).
text, text1 = [
    extract_text(pdf_path)
    for pdf_path in (
        '../data/e546b4e7-9efe-4a5a-8e31-09a09f9487fb.pdf',
        '../data/AGM_Transcript_Revised.pdf',
    )
]

In [3]:
# Idea: insert marker tokens as Points of Interest (POI) into the text extracted
# from the PDF, then use those markers to pull out Entities of Interest (EOI).
# The sample below shows a speaker label ("Name:") that follows a '<mask>' marker.
string = 'I now invite Mr. A.V. Mani Sundaram, CLID IN30163741521740.<mask>A.V. Mani Sundaram:'
rege = r'(<mask>(.*)<mask>(.*?):)|(<mask>(.*?):)'

# The last capture group of the first match holds the speaker name.
first_match = re.compile(rege).findall(string)[0]
first_match[-1]

'A.V. Mani Sundaram'

In [122]:
# Step 1: Creating POI
# Applied in order to both texts: blank lines become '<mask>', the remaining
# single newlines become '<m>', and form feeds are removed.
# NOTE(review): text and text1 are overwritten here; the later cells that search
# them for '\n' appear to need the unmasked text - confirm the intended run order.
poi_replacements = (('\n\n', '<mask>'), ('\n', '<m>'), ('\x0c', ''))
for raw_token, marker in poi_replacements:
    text = text.replace(raw_token, marker)
    text1 = text1.replace(raw_token, marker)

In [123]:
# Step 2: Extracting EOI
# A candidate is everything between a '<mask>' marker and the next ':' in text1,
# stripped and re-terminated with ':' so the second pass can anchor on it.
a = re.findall(r'<mask>(.*?):', text1)
a = [str(_.strip()) + ':' for _ in a]

# Approach: Weed out entities based on length of the string
# Assumption: A typical name would not exceed 40 characters
MAX_ENTITY_LENGTH = 40

final_list = []
for string in a:
    # Each candidate contains exactly one ':' (at its end), so at most one match
    # exists. Candidates without an inner '<mask>' do not match and are skipped.
    found = re.search(r'(<mask>(.*)<mask>(.*?):)|(<mask>(.*?):)', string)
    if found is None:
        continue
    # Group 3 is the text after the last of several '<mask>' markers;
    # group 5 is the text after the only '<mask>' marker.
    entity = found.group(3) if found.group(1) is not None else found.group(5)
    # Skip empty names: otherwise marker text such as '<mask>:' would be
    # collected as if it were an entity.
    if entity:
        final_list.append(entity)

filtered_list = [
    filtered_element
    for filtered_element in final_list
    if len(filtered_element) <= MAX_ENTITY_LENGTH
]

In [138]:
# Tentative approach (discarded): run the BERT NER pipeline over every candidate
# in order to keep only PER entities.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

for candidate_name in filtered_list:
    dotless_name = candidate_name.replace('.', '')
    ner = nlp(dotless_name)
# Cons: does not work as expected. "Moderator" and "Management" do not fit under
# any entity type, and single words are broken down into pieces that each get
# an entity (is this expected?).

In [139]:
# Step 3: Extracting location of all EOI
# Every entity is looked up as the literal label "<entity>:" in text1. The label
# is escaped because names contain regex metacharacters ('.', '(', ')', ...)
# that would otherwise match the wrong text or raise re.error.
span_set = set()  # a set drops duplicate (entity, start, end) triples
for element in filtered_list:
    label_pattern = re.escape(str(element) + ':')
    for find_result in re.finditer(label_pattern, text1):
        span_set.add((element, find_result.start(), find_result.end()))

# Order by position in text1; end offset and entity text break ties so the
# resulting order does not depend on set iteration order.
entity_span = sorted(span_set, key=lambda span: (span[1], span[2], span[0]))

In [172]:
# Step 4: Extracting relevant information based on the EOI
# For each entity label, collect the text that follows it up to the start of the
# next label. The last entity runs to the end of text1.

desired_columns = ["Sr.No.", "Name", "GroupOfSentences"]

records = []
for sequence, (entity_name, _label_start, end_index) in enumerate(entity_span):
    # Upper bound of this entity's segment. Computed once for every branch so
    # that a trailing "management" / "board of directors" entry cannot raise
    # IndexError.
    if sequence + 1 < len(entity_span):
        next_start = entity_span[sequence + 1][1]
    else:
        next_start = len(text1)
    raw_segment = text1[end_index:next_start]

    # "management" and "board ... directors" sections are listings: their line
    # markers become '|' so the entries stay separable. Any other section is
    # running text, so its markers become spaces.
    lowered_name = entity_name.lower()
    is_listing = lowered_name == "management" or (
        "directors" in lowered_name and "board" in lowered_name
    )
    separator = "|" if is_listing else " "
    segment = raw_segment.replace("<mask>", separator).replace("<m>", separator)

    records.append([sequence, entity_name, segment])

# Build the frame once instead of concatenating row by row; this also gives
# each row a unique index label.
information_frame = pd.DataFrame(records, columns=desired_columns)

In [176]:
# Persist the extracted entity/segment table as an Excel workbook.
output_path = "../data/output_dataframe.xlsx"
information_frame.to_excel(output_path)

In [20]:
# Preview the 1000 characters of `text` starting at the first 'MANAGEMENT:' label.
management_start = text.find('MANAGEMENT:')
text[management_start: management_start + 1000]

'MANAGEMENT:  MR.  R.  K.  SHETTY  —  MANAGING  DIRECTOR,  HERANBA \n\nINDUSTRIES  LIMITED \n\nMR.  RAUNAK  SHETTY  —  EXECUTIVE  DIRECTOR, \n\nHERANBA  INDUSTRIES  LIMITED \n\nMR.  RAJ  KUMAR  BAFNA  —  CHIEF  FINANCIAL  OFFICER, \n\nHERANBA  INDUSTRIES  LIMITED \n\nPage  1  of  18\n\n\x0cHERANBA \n\nINDUSTRIES  LIMITED \nA  Govt,  Recognised  Export  House \n\nHeranba  Industries  Limited \nAugust  17,  2027 \n\nModerator: \n\nLadies  and  Gentlemen,  Good  Day  and  Welcome  to  the  Ql  FY22  Earnings  Conference  Call  for \n\nHeranba  Industries  Limited.  As  a  reminder,  all  participants’  lines  will  be  in  the  listen-only  mode, \n\nand  there  will  be  an  opportunity  for  you  to  ask  questions  after  the  presentation  concludes.  Should \n\nyou  need  assistance  during  this  conference  call,  please  signal  an  operator  by  pressing  ‘*’  and \n\nthen  ‘0’  on  your  touchtone  phone.  Please  note  that  this  conference  is  being  recorded.  I  now  hand 

In [50]:
# Capture everything between "management:" and the next ':' in the lower-cased
# first document, then show it split on blank lines.
pattern = "management:((.|\n)*?):"
lowered_text = text.lower()
sub_string = re.search(pattern, lowered_text).groups()

management_block = sub_string[0]
print(management_block.split('\n\n'))

['  mr.  r.  k.  shetty  —  managing  director,  heranba ', 'industries  limited ', 'mr.  raunak  shetty  —  executive  director, ', 'heranba  industries  limited ', 'mr.  raj  kumar  bafna  —  chief  financial  officer, ', 'heranba  industries  limited ', 'page  1  of  18', '\x0cheranba ', 'industries  limited \na  govt,  recognised  export  house ', 'heranba  industries  limited \naugust  17,  2027 ', 'moderator']


In [56]:
# Same management-section capture as for the first document, applied to the
# lower-cased second document (text1), shown split on blank lines.
pattern = "management:((.|\n)*?):"
lowered_text1 = text1.lower()
sub_string = re.search(pattern, lowered_text1).groups()

management_block = sub_string[0]
print(management_block.split('\n\n'))

[' shri r. mohan – non-executive chairman', 'dr. n. kamakodi – managing director & ceo\nmr. v. ramesh – cfo & company secretary', 'board of directors present']


In [1]:
import camelot

In [3]:
# Try table detection on the first PDF with camelot and show how many tables
# were found.
camelot_pdf_path = "../data/e546b4e7-9efe-4a5a-8e31-09a09f9487fb.pdf"
tables = camelot.read_pdf(camelot_pdf_path)
tables.n

0

In [6]:
import tabula
import os

In [8]:
# Try table detection on every page of the AGM transcript with tabula and
# display what it returns.
tabula_pdf_path = "../data/AGM_Transcript_Revised.pdf"
tables = tabula.read_pdf(tabula_pdf_path, pages="all")
tables

[Empty DataFrame
 Columns: [Unnamed: 0]
 Index: [],
 Empty DataFrame
 Columns: [Karmanye Vadhikarast, , Ma Phaleshou Kada Chana”]
 Index: []]

In [None]:
# Scratch regex pattern (not executable Python): <mask>(.*)<mask><mask>(.*?):