In [1]:
! pip install -qU langchain-groq

In [4]:
from typing import List, Optional
import json
import os
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from groq import Groq


In [2]:
# Load variables from a local .env file into the process environment so that
# os.getenv() can see them in the cells below.
load_dotenv()

True

In [5]:
# Read the API key from the GROQ_API environment variable (os.getenv returns
# None when it is not set) and build the Groq client that extract_data uses.
groq_api = os.getenv('GROQ_API')
groq = Groq(api_key=groq_api)

In [10]:
# Load the safety data sheet PDF through PyPDFLoader.load_and_split(); the
# items of `pages` expose their text as `.page_content` (used below).
# NOTE(review): this import would normally live in the imports cell at the top.
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("SDS_GHS_JP_EN_2019-03-13.pdf")
pages = loader.load_and_split()

In [11]:
# Preview the text extracted for the first item of `pages`.
pages[0].page_content

'(PEMP ) 1/6 \n \nSAFETY DATA SHEET  \n \n \n1. IDENTIFICATION OF THE SUBSTANCE OR MIXTURE AND OF THE SUPPLIER \nProduct name  PEMP  \nManufacture  SC ORGANIC CHEMICAL CO., LTD. \nAddress  3-10-24, Tadaoka -kita, Tadaoka -cho, Senboku -gun, Osaka \nPrefecture 595 -0811 Japan  \nPhone Number  +81-725-33-0478  \nEmergency phone number  Department  : Technology department  \n Phone  : +81-725-33-0478  \n FAX : +81-725-33-0479  \nRecommended use of the chemical and  restrictions on use \nChain transfer agents , Cross -linking agents , Epoxy  curing \nagents , Photocuring monomer for thiol -ene polymerization \nsystem  \n \n2. HAZARDS IDENTIFICATION  \nGHS Classification  \nPhysical hazards  \nExplosive  Not applicable  \nFlammable  gases  (including chemically unstable gases)  \n Not applicable  \nAerosols  Not applicable  \nOxidizing gases  Not applicable  \nGases under  pressure  Not applicable  \nFlammable liquids  Not classified  \nFlammable  solids  Not applicable  \nSelf-reactive  su

In [4]:
class Extraction(BaseModel):
    # Product and manufacturer identification fields to extract from the text.
    # Documented with `#` comments rather than a docstring on purpose: pydantic
    # copies a model's docstring into its JSON schema, and that schema is part
    # of the prompt sent to the LLM.
    product_name: str = Field(
        description="name of the product",
    )
    manufacturer: str = Field(
        description="name of the manufacturer",
    )
    address: str = Field(
        description="address of the manufacturer",
    )
    # A single string; the description asks for comma separated values.
    use_of_the_chemical: str = Field(
        description="comma separated string values for recommended use of the chemical and restrictions on use",
    )


class Hazards_Identification(BaseModel):
    # Hazard classification data to extract from the text.
    # Documented with `#` comments rather than a docstring on purpose: pydantic
    # copies a model's docstring into its JSON schema, and that schema is part
    # of the prompt sent to the LLM.
    # Each list item is an untyped dict; its exact keys are not constrained by
    # this model.
    physical_hazards: List[dict] = Field(
        description="list of key value pairs of physical hazards.",
    )

In [6]:
class All_Data(BaseModel):
    # Top-level extraction result combining both sub-models.
    # Documented with `#` comments rather than a docstring on purpose: pydantic
    # copies a model's docstring into its JSON schema, and that schema is part
    # of the prompt sent to the LLM.
    information: Extraction
    hazards: Hazards_Identification

In [8]:
# Show the JSON schema generated for All_Data; the same schema text is
# embedded in the system prompt built by extract_data.
print(json.dumps(All_Data.model_json_schema(), indent=2))

{
  "$defs": {
    "Extraction": {
      "properties": {
        "product_name": {
          "description": "name of the product",
          "title": "Product Name",
          "type": "string"
        },
        "manufacturer": {
          "description": "name of the manufacturer",
          "title": "Manufacturer",
          "type": "string"
        },
        "address": {
          "description": "address of the manufacturer",
          "title": "Address",
          "type": "string"
        },
        "use_of_the_chemical": {
          "description": "comma separated string values for recommended use of the chemical and restrictions on use",
          "title": "Use Of The Chemical",
          "type": "string"
        }
      },
      "required": [
        "product_name",
        "manufacturer",
        "address",
        "use_of_the_chemical"
      ],
      "title": "Extraction",
      "type": "object"
    },
    "Hazards_Identification": {
      "properties": {
        "physical_haz

In [9]:
def extract_data(ocr_text: str, model: str = "llama3-8b-8192") -> All_Data:
    """Extract structured fields from raw document text with a Groq chat model.

    The JSON schema of ``All_Data`` is embedded in the system prompt and the
    request asks for a JSON object reply, so the reply can be validated
    directly into the pydantic model.

    Args:
        ocr_text: Text of the document (or page) to extract information from.
        model: Groq model identifier to query. Defaults to the model this
            notebook was written against, so existing one-argument calls
            behave exactly as before.

    Returns:
        The ``All_Data`` instance validated from the model's JSON reply.

    Raises:
        pydantic.ValidationError: If the reply does not conform to the
            ``All_Data`` schema.

    Errors raised by the Groq client are not caught here and propagate to the
    caller.
    """
    # Build the prompt up front so the request below stays readable.
    schema_json = json.dumps(All_Data.model_json_schema(), indent=2)
    system_prompt = (
        "You are an information extractor agent. Your task is to extract specific pieces of information from the given text and outputs in JSON.. If not explicitly provided do not guess. Return empty string for respective key if data not extracted.\n"
        f" The JSON object must use the schema: {schema_json}"
    )
    user_prompt = f"Given text : \n {ocr_text}"

    chat_completion = groq.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model=model,
        temperature=0,  # keep sampling as repeatable as the API allows
        stream=False,
        response_format={"type": "json_object"},  # request a JSON object reply
    )
    reply = chat_completion.choices[0].message.content
    return All_Data.model_validate_json(reply)


In [12]:
# Run the extraction on the first item of `pages` and display the result.
extract_data(pages[0].page_content)

All_Data(information=Extraction(product_name='PEMP', manufacturer='SC ORGANIC CHEMICAL CO., LTD.', address='3-10-24, Tadaoka-kita, Tadaoka-cho, Senboku-gun, Osaka Prefecture 595-0811 Japan', use_of_the_chemical='Chain transfer agents, Cross-linking agents, Epoxy curing agents, Photocuring monomer for thiol-ene polymerization system'), hazards=Hazards_Identification(physical_hazards=[{'key': 'Explosive', 'value': 'Not applicable'}, {'key': 'Flammable gases (including chemically unstable gases)', 'value': 'Not applicable'}, {'key': 'Aerosols', 'value': 'Not applicable'}, {'key': 'Oxidizing gases', 'value': 'Not applicable'}, {'key': 'Gases under pressure', 'value': 'Not applicable'}, {'key': 'Flammable liquids', 'value': 'Not classified'}, {'key': 'Flammable solids', 'value': 'Not applicable'}, {'key': 'Self-reactive substances and mixtures', 'value': 'Classification not possible'}, {'key': 'Pyrophoric liquids', 'value': 'Classification not possible'}, {'key': 'Pyrophoric solids', 'value

In [14]:
# Keep the extraction result for the printing cell below.
# NOTE(review): this issues a second API request for the same input as the
# previous cell.
output = extract_data(pages[0].page_content)


In [18]:


def print_data(output: All_Data):
    """Print the extracted data as two numbered sections.

    Each section is a heading followed by one numbered line per field of the
    corresponding sub-model. Iterating a model yields ``(field_name, value)``
    pairs, and each pair is printed as-is.

    Args:
        output: Extraction result whose ``information`` and ``hazards``
            attributes are printed.
    """
    # (heading to print, attribute of `output` to iterate), in print order.
    sections = (
        ("information:", "information"),
        ("\nhazards:", "hazards"),
    )
    for heading, attribute in sections:
        print(heading)
        for position, field in enumerate(getattr(output, attribute), start=1):
            print(f"{position}. {field}")


In [19]:
# Print the extracted fields section by section.
print_data(output)

information:
1. ('product_name', 'PEMP')
2. ('manufacturer', 'SC ORGANIC CHEMICAL CO., LTD.')
3. ('address', '3-10-24, Tadaoka-kita, Tadaoka-cho, Senboku-gun, Osaka Prefecture 595-0811 Japan')
4. ('use_of_the_chemical', 'Chain transfer agents, Cross-linking agents, Epoxy curing agents, Photocuring monomer for thiol-ene polymerization system')

hazards:
1. ('physical_hazards', [{'key': 'Explosive', 'value': 'Not applicable'}, {'key': 'Flammable gases (including chemically unstable gases)', 'value': 'Not applicable'}, {'key': 'Aerosols', 'value': 'Not applicable'}, {'key': 'Oxidizing gases', 'value': 'Not applicable'}, {'key': 'Gases under pressure', 'value': 'Not applicable'}, {'key': 'Flammable liquids', 'value': 'Not classified'}, {'key': 'Flammable solids', 'value': 'Not applicable'}, {'key': 'Self-reactive substances and mixtures', 'value': 'Classification not possible'}, {'key': 'Pyrophoric liquids', 'value': 'Classification not possible'}, {'key': 'Pyrophoric solids', 'value': 'No