In [1]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the environment before the key is read below.
load_dotenv()
from langchain_openai import ChatOpenAI

# The key is looked up once and shared; os.getenv returns None when the variable is unset.
_openai_key = os.getenv("openai_api_key")

# GPT-4o family clients.
llm_4o_openai = ChatOpenAI(model="gpt-4o", temperature=1, api_key=_openai_key)  # type: ignore
llm_4o_mini_openai = ChatOpenAI(model="gpt-4o-mini", temperature=1, api_key=_openai_key)  # type: ignore

# GPT-5 family clients.
llm_5_nano_openai = ChatOpenAI(model="gpt-5-nano", temperature=1, api_key=_openai_key)  # type: ignore
llm_5_mini_openai = ChatOpenAI(model="gpt-5-mini", temperature=1, api_key=_openai_key)  # type: ignore

In [None]:
from bs4 import BeautifulSoup
import requests
from langchain_core.messages import HumanMessage, SystemMessage
from pydantic import BaseModel, Field
import json

# Grammar point to process and the article that describes it.
# To process another grammar point, swap the active line with one of the commented pairs.
# grammar, url = "おきに", "https://nihongokyoshi-net.com/2020/01/22/jlptn3-grammar-okini/"
# grammar, url = "うちに", "https://nihongokyoshi-net.com/2018/10/31/jpltn3-grammar-uchini/"
# grammar, url = "おかげで", "https://nihongokyoshi-net.com/2018/08/30/jlptn3-grammar-okagede/"
grammar, url = "こそ", "https://nihongokyoshi-net.com/2018/06/02/jlptn3-grammar-koso/"

# Marker string: the article text from this point onward is discarded.
CUTOFF_MARKER = "この文型が登場する教科書"
# requests has no default timeout, so an unresponsive server would block the cell forever.
REQUEST_TIMEOUT_S = 30

response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
response.raise_for_status()  # stop on 4xx/5xx instead of parsing an error page
html = response.text
soup = BeautifulSoup(html, "lxml")

section = soup.find_all("section", class_="single-post-main")
if not section:
    # Clearer than the IndexError that section[0] would raise on an unexpected layout.
    raise ValueError(f"No <section class='single-post-main'> found at {url}")
divs = section[0].find_all("div", class_="content")

# Collect the text of every content div so that no div overwrites another.
parts = []
for div in divs:
    # Remove the table-of-contents list so its entries are not mixed into the article text.
    for ul in div.find_all("ul", class_="toc_list"):
        ul.decompose()
    parts.append(div.get_text(strip=True))

# Keep only what precedes the cutoff marker; stays "" when no content div was found.
text = "".join(parts).split(CUTOFF_MARKER)[0]


In [None]:
class JPGrammarPattern1(BaseModel):
    # One 接続 (connection pattern) entry of the v1 schema; used as the element type of
    # JPGrammar1.patterns. Documented with comments on purpose: a class docstring would be
    # picked up by pydantic as the schema description.

    # The connection pattern itself.
    pattern: str = Field(
        description="the 接続 (pattern) for the grammar point.",
    )
    # Example sentences for this pattern; optional, defaults to an empty list.
    examples: list[str] = Field(
        default=[],
        description="Make 2-3 example sentences that are strictly related to this pattern",
    )
    # Short explanation of the pattern in Japanese.
    spoken_explanation_jp: str = Field(
        description="A simple spoken explanation of this pattern in very simple Japanese (3-5 sentences).",
    )
    # Short explanation of the pattern in English.
    spoken_explanation_en: str = Field(
        description="A simple spoken explanation of this pattern in very simple English (3-5 sentences).",
    )

class JPGrammar1(BaseModel):
    # Version 1 of the grammar schema: each pattern carries its own examples and explanations.
    # Every instance writes itself to "<name>/<name>_1.json" as soon as it is constructed
    # (see model_post_init). Documented with comments on purpose: a class docstring would be
    # picked up by pydantic as the schema description.
    version: str = Field(default="1.0")
    name: str = Field(description="The name of the grammar point in Japanese.")
    meanings_jp: list[str] = Field(description="The meaning of the grammar point in very simple Japanese.")
    meanings_en: list[str] = Field(description="The meaning of the grammar point in English.")
    patterns: list[JPGrammarPattern1] = Field(description="A list of all 接続 (patterns) one by one for this grammar point.")
    spoken_explanation_jp: str = Field(description="A simple spoken explanation of the grammar point in very simple Japanese (3-5 sentences).")
    spoken_explanation_en: str = Field(description="A simple spoken explanation of the grammar point in very simple English (3-5 sentences).")

    def model_post_init(self, __context) -> None:
        # Pydantic hook that runs after the model is initialised; persists the instance.
        self.save_json()

    def save_json(self) -> None:
        # Dump the model as UTF-8 JSON to "<stem>/<stem>_1.json", creating the directory
        # if needed. <stem> is `name` reduced to a single safe path component.
        #
        # `name` is produced by the language model, so it is not trusted as a path:
        # a value such as "うちに/ないうちに" would otherwise create nested directories and
        # then fail to open the file, and ".." would escape the working directory.
        stem = self.name.strip()
        for separator in ("/", "\\"):
            stem = stem.replace(separator, "_")
        if stem in ("", ".", ".."):
            stem = "unnamed"  # os.makedirs("") raises; "." and ".." are not real names

        os.makedirs(stem, exist_ok=True)
        target_path = os.path.join(stem, f"{stem}_1.json")
        with open(target_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps the Japanese text readable in the file.
            json.dump(self.model_dump(), f, ensure_ascii=False, indent=4)

# Bind the v1 schema to the gpt-4o-mini client for structured output.
structured_model = llm_4o_mini_openai.with_structured_output(JPGrammar1)

# Prompt pair: a fixed role instruction plus the grammar name and the scraped article text.
system_message = SystemMessage(content="You are a helpful assistant that extracts Japanese grammar points and all examples from text and structures them into a predefined format.")
human_message = HumanMessage(content=f"Extract the main Japanese grammar point and all the examples from the following text and structure it according to the JPGrammar model. Make sure to include all provided examples. Grammar: {grammar}\n\n Text:\n\n{text}")

# JPGrammar1.model_post_init calls save_json, so building the result also writes the JSON file.
result = structured_model.invoke([system_message, human_message])

In [None]:
class JPGrammarPattern2(BaseModel):
    # One 接続 (connection pattern) entry of the v2 schema; used as the element type of
    # JPGrammar2.patterns. Only the pattern string is kept in this version.

    # The connection pattern itself.
    pattern: str = Field(
        description="the 接続 (pattern) for the grammar point.",
    )
    # Per-pattern spoken explanations are disabled in this version:
    # spoken_explanation_jp: str = Field(description="A simple spoken explanation of this pattern in very simple Japanese (3-5 sentences).")
    # spoken_explanation_en: str = Field(description="A simple spoken explanation of this pattern in very simple English (3-5 sentences).")

    def model_post_init(self, __context):
        # Echo each pattern as soon as its instance is built.
        print(f"Pattern: {self.pattern}")

class JPGrammar2(BaseModel):
    # Version 2 of the grammar schema: example sentences are collected once at the top level
    # and patterns carry only the pattern string.
    # Every instance writes itself to "<name>/<name>_2.json" as soon as it is constructed
    # (see model_post_init). Documented with comments on purpose: a class docstring would be
    # picked up by pydantic as the schema description.
    version: str = "2.0"
    name: str = Field(description="The name of the grammar point in Japanese.")
    meanings_jp: list[str] = Field(description="The meaning of the grammar point in very simple Japanese.")
    meanings_en: list[str] = Field(description="The meaning of the grammar point in English.")
    patterns: list[JPGrammarPattern2] = Field(description="A list of all 接続 (patterns) one by one for this grammar point.")
    examples: list[str] = Field(description="List all the Japanese example sentences provided in the text (make sure to include all examples).")
    spoken_explanation_jp: str = Field(description="A simple spoken explanation of the grammar point in very simple Japanese (3-5 sentences).")
    spoken_explanation_en: str = Field(description="A simple spoken explanation of the grammar point in very simple English (3-5 sentences).")

    def model_post_init(self, __context) -> None:
        # Pydantic hook that runs after the model is initialised; persists the instance.
        self.save_json()

    def save_json(self) -> None:
        # Dump the model as UTF-8 JSON to "<stem>/<stem>_2.json", creating the directory
        # if needed. <stem> is `name` reduced to a single safe path component.
        #
        # `name` is produced by the language model, so it is not trusted as a path:
        # a value such as "うちに/ないうちに" would otherwise create nested directories and
        # then fail to open the file, and ".." would escape the working directory.
        stem = self.name.strip()
        for separator in ("/", "\\"):
            stem = stem.replace(separator, "_")
        if stem in ("", ".", ".."):
            stem = "unnamed"  # os.makedirs("") raises; "." and ".." are not real names

        os.makedirs(stem, exist_ok=True)
        target_path = os.path.join(stem, f"{stem}_2.json")
        with open(target_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps the Japanese text readable in the file.
            json.dump(self.model_dump(), f, ensure_ascii=False, indent=4)

# Bind the v2 schema to the gpt-4o-mini client for structured output.
structured_model = llm_4o_mini_openai.with_structured_output(JPGrammar2)

# Prompt pair: a fixed role instruction plus the grammar name and the scraped article text.
system_message = SystemMessage(content="You are a helpful assistant that extracts Japanese grammar points and all examples from text and structures them into a predefined format.")
human_message = HumanMessage(content=f"Extract the main Japanese grammar point and all the examples from the following text and structure it according to the JPGrammar model. Make sure to include all provided examples. Grammar: {grammar}\n\n Text:\n\n{text}")

# JPGrammar2.model_post_init calls save_json, so building the result also writes the JSON file.
result = structured_model.invoke([system_message, human_message])