In [36]:
import os
import csv
import PyPDF2
from mistralai import Mistral  # Latest SDK import


PDF_FILE = "PDF for Python LLM.pdf"  # PDF file
CSV_FILE = "output_entities.csv"   # output data [Result]
MISTRAL_API_KEY = "*****IddRZCLmUWtAS6RWi**************"  # Used Mistral API KEY 
MODEL_NAME = "mistral-large-latest"  # Mere Model ka naam


In [37]:
def extract_pdf_text(pdf_path):
    text = ""  # Extract kiya Pdf file 
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n" # text ko new line diya 
    return text.strip()

# Test extraction
pdf_text = extract_pdf_text(PDF_FILE)
print(pdf_text[:500])  # first 500 characters print kare


__________________________________________________________________________ __________________  
Order in the matter of Yamini Investment Company Limited .                    1  
WTM/AB/IVD/ID11/29124/2023 -24 
 
SECURITIES AND EXCHANGE BOARD OF INDIA 
 
ORDER  
 
UNDER SECTION 11(1),  11(4)  AND 11B  OF SEBI  ACT, 1992  IN THE MATTER OF YAMINI INVESTMENT 
COMPANY LTD. 
 
IN RESPECT OF  
 
S. NO.  NAME OF THE NOTICEE  PAN  
1.  MAHESHWARI FINANCIAL SERVI CES PVT. LTD. AAACM9185B  
2.  AUTOLITE AG


In [38]:
def extract_entities_relations(text):   # Mistral API client banaya
    client = Mistral(api_key=MISTRAL_API_KEY)

     # Prompt tayar kare (entities aur relations nikalne ke liye)
    prompt = f"""
You are an information extraction assistant.
From the following text, extract:
- Entities: Organisation, Name, PAN
- Relation: PAN_Of (linking PAN to the Name)

Output in CSV format with columns:
Entity(PAN),Relation,Entity(Person),Organisation

If no organisation, keep blank.

Text:
{text}
"""

    # Model ko prompt bheje
    response = client.chat.complete(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}]
    )

    # Model ka output wapas de
    return response.choices[0].message.content.strip()


In [39]:
def save_csv(data_str, csv_file):
    lines = data_str.split("\n")   # Data ko line by line toda 
    
    # CSV file likha
    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        
        # Har line ko column me banaya
        for line in lines:
            row = [col.strip() for col in line.split(",")]
            writer.writerow(row)


In [40]:
# Extract PDF text
print("[INFO] Extracting text from PDF...")
pdf_text = extract_pdf_text(PDF_FILE)

# Call Mistral API
print("[INFO] Calling Mistral API...")
extracted_data = extract_entities_relations(pdf_text)
print("\n[RAW OUTPUT FROM MISTRAL]:\n", extracted_data)

# Save to CSV
print("\n[INFO] Saving to CSV...")
save_csv(extracted_data, CSV_FILE)

print(f"[DONE] Data saved in {CSV_FILE}")


[INFO] Extracting text from PDF...
[INFO] Calling Mistral API...

[RAW OUTPUT FROM MISTRAL]:
 Here is the extracted information in CSV format:

```csv
Entity(PAN),Relation,Entity(Person),Organisation
AAACM9185B,PAN_Of,MAHESHWARI FINANCIAL SERVICES PVT. LTD.,
AAECA1487G,PAN_Of,AUTOLITE AGENCIES PVT. LTD.,
AAACT5163G,PAN_Of,TOOR FINANCE COMPANY LTD.,
AAACS3356A,PAN_Of,STELLAR CAPITAL SERVICES LTD.,
ACRPR2362R,PAN_Of,PREMLAL ROY,
AAVFA6230F,PAN_Of,ARIES COMMERCIALS,
AAUFM6147N,PAN_Of,MOONLIGHT UDYOG,
ACDFS3492D,PAN_Of,SHRI RAM TRADERS,
AAGPL9225Q,PAN_Of,CHANDRA PRAKASH BALKISANJI LADDHA,
BFUPK6760D,PAN_Of,ANSHU KATARUKA,
AAEHG7205B,PAN_Of,GOPAL BANSAL (HUF),
AAEHH5347D,PAN_Of,HETAB S KANGAD (HUF),
AADCV8452B,PAN_Of,VINDYAVASINI AGENCY PVT LTD,
AAICS3344J,PAN_Of,MKR TRADING PVT. LTD.,
AABCF4418M,PAN_Of,FORTUNATE INFRA DEVELOPERS PVT. LTD.,
AABCL8020H,PAN_Of,LINKUP VINTRADE PVT. LTD.,
AABCO2405K,PAN_Of,OMKARA DEALER PVT. LTD.,
AABCO3506H,PAN_Of,OVERALL LOGISTICS PVT. LTD.,
AACCD9817B,PAN_Of