In [None]:
# Fetch the case-study repository. Later cells in this notebook read contract
# PDFs from actuarygpt-code/case-study-4/contracts inside this checkout.
!git clone https://github.com/Aimitmk/actuarygpt-code.git

Cloning into 'actuarygpt-code'...
remote: Enumerating objects: 81, done.
remote: Counting objects: 100% (81/81), done.
remote: Compressing objects: 100% (56/56), done.
remote: Total 81 (delta 24), reused 78 (delta 21), pack-reused 0 (from 0)
Receiving objects: 100% (81/81), 90.12 KiB | 744.00 KiB/s, done.
Resolving deltas: 100% (24/24), done.


In [None]:
# Install the third-party dependencies used by this notebook.
# Use the explicit %pip magic instead of a bare `pip`, which only works while
# IPython's automagic is enabled. PyPDF2 is pinned to the version this notebook
# was last run with, so a re-run resolves the same package.
%pip install openai PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


# 再保険契約のPDF解析とJSON変換

このノートブックは再保険契約のPDFからテキストを抽出し、OpenAI APIを使用してJSON形式に変換するプロセスを実装しています。

In [None]:
import json
import os
from datetime import datetime
from pathlib import Path

from openai import OpenAI
from PyPDF2 import PdfReader

# Input directory holding the contract PDFs, and output directory for the
# generated JSON files; both are resolved against the current working directory.
CONTRACTS_DIR: Path = Path.cwd().joinpath("actuarygpt-code/case-study-4/contracts")
JSON_DIR: Path = Path.cwd().joinpath("actuarygpt-code/case-study-4/json")

# Create both directories (and any missing parents); existing ones are left as-is.
for _required_dir in (CONTRACTS_DIR, JSON_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# System prompt sent with every conversion request. It instructs the model to
# emit JSON that follows the embedded JSON Schema. The schema text must itself
# be valid JSON (no trailing commas), since the model is told to follow it
# strictly.
SYSTEM_PROMPT: str = """
    You are a system that extracts information from reinsurance contracts
    and inserts this information into JSON files. Use the following schema strictly
    to convert the contract information provided into JSON format. The user will
    provide the text followed by the question: "What is the JSON representation of
    this reinsurance contract?"

    Schema:

    {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "treatyType": {
                "type": "string",
                "description": "Type of reinsurance treaty"
            },
            "insured": {
                "type": "string",
                "description": "Name of the insurance company being insured"
            },
            "reinsurer": {
                "type": "string",
                "description": "Name of the reinsurance company providing coverage"
            },
            "period": {
                "type": "object",
                "properties": {
                    "start": {
                        "type": "string",
                        "description": "Start date of the reinsurance period"
                    },
                    "end": {
                        "type": "string",
                        "description": "End date of the reinsurance period"
                    }
                },
                "required": [
                    "start",
                    "end"
                ],
                "description": "Period of reinsurance coverage"
            },
            "lossLayers": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "layer": {
                            "type": "integer",
                            "description": "Layer number"
                        },
                        "excessOf": {
                            "type": "integer",
                            "description": "Excess amount triggering reinsurance coverage"
                        },
                        "limit": {
                            "type": "integer",
                            "description": "Maximum coverage limit for the layer"
                        },
                        "reinsuredPercent": {
                            "type": "integer",
                            "description": "Percentage of loss reinsured for the layer"
                        }
                    },
                    "required": [
                        "layer",
                        "excessOf",
                        "limit",
                        "reinsuredPercent"
                    ]
                },
                "description": "Information about the loss layers of the reinsurance contract"
            },
            "interest": {
                "type": "string",
                "description": "Coverage interest and lines of business"
            },
            "sumInsured": {
                "type": "integer",
                "description": "Total sum insured under the reinsurance contract"
            },
            "commission": {
                "type": "object",
                "properties": {
                    "percent": {
                        "type": "integer",
                        "description": "Commission percentage"
                    },
                    "maxLossRatio": {
                        "type": "integer",
                        "description": "Maximum loss ratio for commission calculation"
                    }
                },
                "required": [
                    "percent",
                    "maxLossRatio"
                ],
                "description": "Commission details"
            },
            "exclusions": {
                "type": "array",
                "items": {
                    "type": "string"
                },
                "description": "List of exclusions or risks not covered by the reinsurance"
            },
            "claimsNotification": {
                "type": "integer",
                "description": "Timeframe for claims notification in days"
            },
            "arbitrationClause": {
                "type": "string",
                "description": "Clause describing arbitration process"
            },
            "currency": {
                "type": "string",
                "description": "Currency used for the reinsurance contract"
            }
        },
        "required": [
            "treatyType",
            "insured",
            "reinsurer",
            "period",
            "lossLayers",
            "interest",
            "sumInsured",
            "commission",
            "exclusions",
            "claimsNotification",
            "arbitrationClause",
            "currency"
        ]
    }
    """

## ユーティリティ関数

In [None]:
# Read a required environment variable
def get_env_variable(name: str) -> str:
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to look up.

    Returns:
        The variable's value.

    Raises:
        ValueError: If the variable is not set.
    """
    env_value = os.getenv(name)
    if env_value is not None:
        return env_value
    raise ValueError(f"Environment variable '{name}' is not set")

In [None]:
# Extract text from a PDF
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Return the text of every page of a PDF, in page order.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The text of all pages joined with a newline between consecutive
        pages, so that the last word of one page is not fused with the first
        word of the next.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Extraction stays inside the ``with`` block, as before — presumably
        # the reader still needs the open file handle while pages are read.
        # A page that yields no text contributes an empty string.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    return "\n".join(page_texts)

In [None]:
def _strip_code_fence(content: str) -> str:
    """Return ``content`` without a surrounding Markdown code fence, if any."""
    body = content.strip()
    if not body.startswith("```"):
        return body
    # Remove the fence as a prefix/suffix. str.strip("```json") would instead
    # strip any of the characters `, j, s, o, n from both ends.
    body = body.removeprefix("```")
    # Drop an optional "json" language tag that directly follows the fence.
    if body[:4].lower() == "json":
        body = body[4:]
    return body.removesuffix("```").strip()


# Convert contract text to JSON
def convert_text_to_json(text: str, client: OpenAI) -> dict:
    """Ask the chat model for the JSON representation of a contract.

    Args:
        text: Plain text of the reinsurance contract.
        client: OpenAI client used to call the chat completions endpoint.

    Returns:
        The model's reply parsed with ``json.loads``.

    Raises:
        ValueError: If the reply carries no text content.
        json.JSONDecodeError: If the reply is not valid JSON; the raw reply is
            printed before the error is re-raised.
    """
    prompt: str = (
        f"Contract: {text}\n\n"
        "Q: What is the JSON representation of this reinsurance contract?"
    )

    gpt_response = client.chat.completions.create(
        model="gpt-4o-mini",  # recent model; switch to e.g. gpt-3.5-turbo-0125 if needed
        messages=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
            {"role": "user", "content": prompt},
        ],
    )

    content = gpt_response.choices[0].message.content
    if content is None:
        raise ValueError("The model response contained no text content")

    # The model may wrap its answer in a Markdown code block (```json ... ```
    # or a bare ``` ... ```); unwrap it before parsing.
    content = _strip_code_fence(content)

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Show the raw reply so a malformed answer can be diagnosed.
        print("Error decoding JSON:")
        print(content)
        raise

## メイン処理実行

In [None]:
from google.colab import userdata

In [None]:
# Set up the OpenAI client with the API key read through google.colab's userdata.
OPENAI_API_KEY: str = userdata.get('OPENAI_API_KEY')
client = OpenAI(api_key=OPENAI_API_KEY)

# Convert every PDF contract to a JSON file named after the contract.
# iterdir() yields entries in arbitrary order, so sort them to make the
# processing order (and the log below) reproducible between runs.
for contract in sorted(CONTRACTS_DIR.iterdir()):
    if contract.suffix.lower() != '.pdf':
        continue

    pdf_text: str = extract_text_from_pdf(contract)
    json_data: dict = convert_text_to_json(pdf_text, client)

    # Explicit encoding keeps the output independent of the platform default.
    with open(JSON_DIR / f"{contract.stem}.json", "w", encoding="utf-8") as file:
        json.dump(json_data, file, indent=4)

    print(f"{datetime.now()} - {contract.stem} converted to JSON")

2025-06-12 15:56:47.932722 - ri4 converted to JSON
2025-06-12 15:56:56.356684 - ri2 converted to JSON
2025-06-12 15:57:06.408859 - ri3 converted to JSON
2025-06-12 15:57:15.665640 - ri5 converted to JSON
2025-06-12 15:57:25.945616 - ri1 converted to JSON
