In [22]:
from PyPDF2 import PdfReader
from langchain.embeddings import GPT4AllEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain_groq import ChatGroq
import os
from constants import openai_key
from langchain.chains import LLMChain, SequentialChain

In [23]:
# Export the Groq credential for the ChatGroq client created later in the notebook.
# NOTE(review): the value is imported as `openai_key` from `constants`, but it is used as the Groq key.
os.environ['GROQ_API_KEY'] = openai_key

In [82]:
# First policy document: open it with PyPDF2 and print the reader object.
pdf_path = "8983765_ua 07 k 5553.pdf"
pdf_reader = PdfReader(pdf_path)
print(pdf_reader)

<PyPDF2._reader.PdfReader object at 0x000001C6851D9670>


In [83]:
import re

In [84]:
# Concatenate the extracted text of every page of the first PDF.
# `or ''` skips pages whose extraction yields nothing (same effect as the former
# `if content:` guard); a single join replaces repeated string `+=` and drops the
# unused enumerate index.
raw_text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    

In [85]:
# Splitter shared by both PDFs: breaks on newlines, with a chunk size of 1000 and
# an overlap of 200, both measured with len().
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n",
    length_function=len,
)

In [86]:
# Split the first PDF's text into chunks; note that `text` is a list of strings, not one string.
text=text_splitter.split_text(raw_text)

In [87]:
# Inspect the chunks produced by the splitter.
text

['DEHRADUN\n24/04/2023Place :\nDate  :\nThe Oriental Insurance Company Limited\nPage 1 of 2253200/31/2024/238 Policy No               : 253200/31/2022/191 Prev Policy No     :\n-\nFROM 00:00  ON 25/05/2023 TO MIDNIGHT OF 24/05/2024UTTRAKHAND JAL VIDYUT NIGAM LTD \n(GSTIN: 0)DO DEHRADUN (GSTIN: \n05AAACT0627R4Z2)\n 9,079  1,634 .5  10,713PRIVATE CAR PACKAGE POLICY - ZONE B\nDC_I_IND 2226000245 - 24/04/2023          GST INVOICE NO :05229613        UIN :0\nValue of CNG\nLPG KitTotal Value For the Vehicle For Trailers Non Electrical\nAccessoriesElectrical\nAccessories\n 51,656  0  0  51,656UA 07 K 5553 2523 OTHERS 9 + 1 GAS54D29039 2005\nMA1NN2GAK52D13\n485\nThe Policy covers use  of the vehicle for any purpose other than  a) Hire or Reward  b)  Carriage of goods (other than samples or \npersonal luggage) c) Organized racing d)       Pace making e) Speed testing  f) Reliability Trials  g) Use in connection with  Motor \nTrade1',
 "personal luggage) c) Organized racing d)       Pace making 

In [88]:
# One single-field prompt per output key. Every prompt shares the same lead-in and
# differs only in the thing being asked for; {text} is filled in by PromptTemplate.
templates = {
    field: "from the {text} extract the " + target
    for field, target in (
        ("name", "name of policy holder"),
        ("contact_numbr", "contact_number of policy holder"),
        ("policy_number", "policy_number of policy"),
        ("Insurance_company_name", "Insurance_company_name"),
        ("type_of_policy", "Type_of_policy"),
        ("Start_date", "Start_date of policy"),
        ("expiry_date", "expiry_date of policy"),
        ("registration_number", "registration_number of car"),
        ("engine_number", "engine_number of car"),
        ("chassis_number", "chassis_number of car"),
        ("body_type", "body_type of car"),
        ("vehicle_make", "vehicle_make of car"),
        ("model", "model of car"),
        ("manufacturing_year", "manufacturing_year of car"),
        ("total_premium_paid", "total_premium_paid by the policy holder"),
        ("address", "address of policy holder"),
    )
}

In [89]:
# Single prompt that asks for every field at once and requests a JSON-only reply.
# Adjacent literals are concatenated by the parser into one string.
template = (
    "from the {text} give me the details like "
    "name of the customer, contact number of the customer, "
    "policy_number, insurance_company_name, type_of_policy, "
    "start_date of policy, expiry_date of policy, "
    "registration_number, engine_number, chassis_number, "
    "body_type, vehicle_make, model, manufacturing_year, "
    "total_premium_paid, address of the consumer, "
    "in the format of a json file, no other text needed"
)

In [90]:
# Groq-hosted chat model used by every chain in this notebook.
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    api_key=os.environ["GROQ_API_KEY"],
)

In [91]:
# Module-level store that information_extractor fills with one entry per template key.
key_information={}

In [92]:

def information_extractor(llm, templates, text):
    """Ask the LLM one question per template and collect the raw answers.

    Args:
        llm: Chat model handed to each LLMChain.
        templates: Mapping of field name -> prompt template string containing
            a ``{text}`` placeholder.
        text: Value substituted for ``{text}`` in every prompt.

    Returns:
        The module-level ``key_information`` dict, updated in place with one
        entry per key in ``templates`` (the string returned by the chain).
    """
    # key_information is only mutated, never rebound, so no `global` statement is needed.
    for key, template_str in templates.items():
        prompt = PromptTemplate(input_variables=['text'], template=template_str)
        # output_key is an LLMChain setting; it was previously passed to PromptTemplate.
        chain = LLMChain(llm=llm, prompt=prompt, output_key=key)
        key_information[key] = chain.run(text)
    return key_information


In [94]:
# Disabled: this issues one LLM call per template key; uncomment to (re)populate key_information.
#information_extractor(llm, templates,text)

In [95]:
# Chain for the all-fields-at-once prompt; its result is exposed under the key 'dict'.
chain1 = LLMChain(
    prompt=PromptTemplate(input_variables=["text"], template=template),
    llm=llm,
    output_key="dict",
)

In [96]:
# Run the single-prompt chain; `text` (the chunk list from the splitter) is passed as the prompt's {text} variable.
chain1.run(text)

'```json\n{\n  "customer_name": "UJJWAL MAHARANI",\n  "customer_contact_number": "Not Available",\n  "policy_number": "253200/31/2022/191",\n  "insurance_company_name": "The Oriental Insurance Company Limited",\n  "type_of_policy": "PRIVATE CAR PACKAGE POLICY - ZONE B",\n  "start_date_of_policy": "25/05/2023",\n  "expiry_date_of_policy": "24/05/2024",\n  "registration_number": "UA 07 K 5553",\n  "engine_number": "54D29039",\n  "chassis_number": "MA1NN2GAK52D13",\n  "body_type": "OTHERS 9 + 1 GAS",\n  "vehicle_make": "MAHINDRA & MAHINDRA",\n  "model": "MARSHAL STD (10)",\n  "manufacturing_year": "2005",\n  "total_premium_paid": "10,713.00",\n  "address": "UJJWAL MAHARANI BAGH,,GMS ROAD\\nDEHRADUN\\nDEHRADUN UTTARAKHAND 248001"\n}\n```'

In [97]:
import json
import re

def extract_json(text):
    """Extract the JSON payload from an LLM reply and parse it.

    The payload is looked for, in order:
      1. inside a Markdown code fence (```json ... ``` or an untagged ``` ... ```),
         tolerating CRLF line endings and extra surrounding whitespace;
      2. otherwise, in the span from the first ``{`` or ``[`` to the last ``}`` or
         ``]``, so replies without a fence are still handled.

    Args:
        text: Raw reply string from the model.

    Returns:
        The parsed JSON value (normally a dict), or None when no JSON could be
        located or the located payload is not valid JSON. A short message is
        printed in both failure cases.
    """
    if not isinstance(text, str) or not text.strip():
        print("Error: No JSON found in the text")
        return None

    # Preferred form: a fenced block, with the "json" tag optional.
    fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL)
    if fenced:
        payload = fenced.group(1)
    else:
        # Fallback: take the outermost bracketed span of an unfenced reply.
        openers = [pos for pos in (text.find('{'), text.find('[')) if pos != -1]
        end = max(text.rfind('}'), text.rfind(']'))
        if not openers or end < min(openers):
            print("Error: No JSON found in the text")
            return None
        payload = text[min(openers):end + 1]

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        print("Error: Invalid JSON format")
        return None

In [98]:
# Call the model again on the first PDF's chunks and parse its fenced JSON reply.
extract_json(chain1.run(text))

{'customer_name': 'UJJWAL MAHARANI',
 'customer_contact_number': 'Not Available',
 'policy_number': '253200/31/2022/191',
 'insurance_company_name': 'The Oriental Insurance Company Limited',
 'type_of_policy': 'PRIVATE CAR PACKAGE POLICY - ZONE B',
 'start_date_of_policy': '25/05/2023',
 'expiry_date_of_policy': '24/05/2024',
 'registration_number': 'UA 07 K 5553',
 'engine_number': '54D29039',
 'chassis_number': 'MA1NN2GAK52D13',
 'body_type': 'OTHERS 9 + 1 GAS',
 'vehicle_make': 'MAHINDRA & MAHINDRA',
 'model': 'MARSHAL STD (10)',
 'manufacturing_year': '2005',
 'total_premium_paid': '10,713.00',
 'address': 'UJJWAL MAHARANI BAGH,,GMS ROAD\nDEHRADUN\nDEHRADUN UTTARAKHAND 248001'}

In [None]:
# Keep only the last six whitespace-separated tokens of each per-field answer.
key_info = {
    ele: key_information[ele].split()[-6:]
    for ele in key_information
}

In [None]:
# Show the per-field answers stored in key_information.
# NOTE(review): the information_extractor call above is commented out, so this presumably reflects an earlier run.
key_information

{'name': 'The name of the policy holder is SHASHANK R.',
 'contact_numbr': 'The contact number of the policy holder is not explicitly mentioned in the provided text. However, there is a phone number mentioned for the policy holder\'s address:\n\n"Address:  A206 Amrutha Grandeur, Katyayani Nivas, Behind MEBP Rachenahalli Main Road Date of Issuance 12/03/2016 11:20:14\\nPeriod of Insurance From: 19/03/2016 00:00:00\\nTo: Midnight On 18/03/2017 23:59:59 BANGALORE KARNATAKA\\n INDIAPin Code 560077\\nPhone #:  9741474849"\n\nSo, the contact number of the policy holder is 9741474849.',
 'policy_number': 'The policy number is 1-4G1ERON P400.',
 'Insurance_company_name': 'The Insurance company name is:\n\nIFFCO-TOKIO General Insurance Co. Ltd',
 'type_of_policy': 'The type of policy is a "PRIVATE CAR CERTIFICATE OF INSURANCE CUM SCHEDULE" which is a Motor Insurance Policy.',
 'Start_date': 'The start date of the policy is 19/03/2016 00:00:00.',
 'expiry_date': 'The expiry date of the policy is

In [None]:
# Show the trailing tokens kept for each field.
key_info

{'name': ['the', 'policy', 'holder', 'is', 'SHASHANK', 'R.'],
 'contact_numbr': ['of', 'the', 'policy', 'holder', 'is', '9741474849.'],
 'policy_number': ['The', 'policy', 'number', 'is', '1-4G1ERON', 'P400.'],
 'Insurance_company_name': ['is:',
  'IFFCO-TOKIO',
  'General',
  'Insurance',
  'Co.',
  'Ltd'],
 'type_of_policy': ['which', 'is', 'a', 'Motor', 'Insurance', 'Policy.'],
 'Start_date': ['of', 'the', 'policy', 'is', '19/03/2016', '00:00:00.'],
 'expiry_date': ['policy', 'is:', 'Midnight', 'On', '18/03/2017', '23:59:59'],
 'registration_number': ['number', 'of', 'the', 'car', 'is', 'KA41N2842.'],
 'engine_number': ['number', 'of', 'the', 'car', 'is', '1589589.'],
 'chassis_number': ['number', 'of', 'the', 'car', 'is:', '1589589'],
 'body_type': ['ALTO', 'LXI"', 'which', 'is', 'a', 'Hatchback.'],
 'vehicle_make': ['make', 'of', 'the', 'car', 'is', '"MARUTI".'],
 'model': ['No.', '5', 'MARUTI', 'ALTO', 'LXI', '1589589"'],
 'manufacturing_year': ['year', 'of', 'the', 'car', 'is', 

In [104]:
# Second policy document: extract the text of every page and chunk it with the shared splitter.
path = '390836363-Car-Insurance-pdf.pdf'
reader = PdfReader(path)
# `or ''` mirrors the guard used when reading the first PDF, so a page whose
# extraction yields nothing cannot break the concatenation; a single join
# replaces repeated string `+=`.
text_second = ''.join(page.extract_text() or '' for page in reader.pages)
text_final = text_splitter.split_text(text_second)

In [None]:
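# Scratch: the fenced-JSON regex applied to a reply string `s` (`s` is not defined in the visible cells).
#x=re.search(r"```json\n(.*?)\n```",s,re.DOTALL).group(1)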

In [105]:
def read_text(chain, text):
    """Run ``chain`` on ``text`` and parse the JSON in its reply.

    The reply may wrap the JSON in a Markdown fence (```json ... ``` or an
    untagged ``` ... ```, with any line endings) or consist of bare JSON.

    Args:
        chain: Object exposing ``run(text)`` that returns the model's reply string.
        text: Input forwarded to ``chain.run``.

    Returns:
        The parsed JSON value on success. On any failure (chain error, reply
        that is not valid JSON) the caught exception object is *returned*, not
        raised, so callers must check the result type.
    """
    try:
        response = chain.run(text)
        # Accept a fence with or without the "json" tag; otherwise parse the reply as-is
        # instead of failing with an AttributeError on a missing match.
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", response, re.DOTALL)
        payload = fenced.group(1) if fenced else response
        return json.loads(payload)
    except Exception as e:
        # Kept for backward compatibility: errors are handed back to the caller.
        return e

In [107]:
# Extract the policy fields from the first PDF's chunks.
read_text(chain1,text)

{'customer_name': 'UJJWAL MAHARANI',
 'customer_contact_number': 'Not Available',
 'policy_number': '253200/31/2022/191',
 'insurance_company_name': 'The Oriental Insurance Company Limited',
 'type_of_policy': 'PRIVATE CAR PACKAGE POLICY - ZONE B',
 'start_date_of_policy': '25/05/2023',
 'expiry_date_of_policy': '24/05/2024',
 'registration_number': 'UA 07 K 5553',
 'engine_number': '54D29039',
 'chassis_number': 'MA1NN2GAK52D13',
 'body_type': 'OTHERS 9 + 1 GAS',
 'vehicle_make': 'MAHINDRA & MAHINDRA',
 'model': 'MARSHAL STD (10)',
 'manufacturing_year': '2005',
 'total_premium_paid': '10,713.00',
 'address': 'UJJWAL MAHARANI BAGH,,GMS ROAD\nDEHRADUN\nDEHRADUN UTTARAKHAND 248001'}

In [106]:
# Extract the policy fields from the second PDF's chunks.
read_text(chain1,text_final)

{'customer_name': 'SHASHANK R',
 'customer_contact_number': '9741474849',
 'policy_number': '1-4G1ERON P400',
 'insurance_company_name': 'IFFCO-TOKIO GENERAL INSURANCE CO. LTD',
 'type_of_policy': 'PRIVATE CAR CERTIFICATE OF INSURANCE CUM SCHEDULE',
 'start_date_of_policy': '19/03/2016',
 'expiry_date_of_policy': '18/03/2017',
 'registration_number': 'KA41N2842',
 'engine_number': '796',
 'chassis_number': '1589589',
 'body_type': 'MARUTI ALTO LXI',
 'vehicle_make': 'MARUTI',
 'model': 'ALTO LXI',
 'manufacturing_year': '2010',
 'total_premium_paid': '3295.92',
 'address': 'A206 Amrutha Grandeur, Katyayani Nivas, Behind MEBP Rachenahalli Main Road, BANGALORE KARNATAKA, INDIAPin Code 560077'}