In [3]:
from openai import OpenAI
from typing import List, Dict, Tuple

In [4]:
client = OpenAI(
    api_key="sk-CwoLPKLXZQwaTlhDD3DApnstzXS1GqiFm8hxmagkg9MQhzpy",
    base_url="https://api.chatanywhere.tech/v1"
)

In [42]:
import re
from loguru import logger

class Generator:
    def __init__(self, corpus: List[str]):
        self.corpus = corpus
        self.triplet_pattern = r'\((\w+),\s*(\w+),\s*(\w+)\)'

    def _retrieve_domain_text(self, entity: str) -> str:
        # 检索领域相关文本（简化版：按实体出现频率排序）
        sentences = [s for s in self.corpus if entity in s]
        sorted_sentences = sorted(sentences, key=lambda x: x.count(entity), reverse=True)
        return " ".join(sorted_sentences[:5])  # 截取前5段

    def _retrieve_open_kg(self, entity: str, is_retrieve:bool =True) -> List[str]:
        # 通过GPT-4o生成示例三元组
        if is_retrieve:
            result = ["(SO2, is_type_of, Gas)","(SO2, has_chemical_formula, SO_2)","(SO2, is_used_in, Wine_Production)","(SO2, causes, Air_Pollution)","(SO2, causes, Acid_Rain)"]
            logger.debug(f"ExampleTriplets: {result}")
            return result
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"Generate 5 example triples for entity '{entity}' in format (head, relation, tail). each split by \n"
            }]
        )
        logger.debug(f"ExampleTriplets: {response.choices[0].message.content.strip()}")
        return response.choices[0].message.content.strip().split("\n")

    def generate_triples(self, entity: str) -> List[Tuple[str, str, str]]:
        domain_text = self._retrieve_domain_text(entity)
        examples = self._retrieve_open_kg(entity)
        system_prompt = "You are ChatGPT, you will extract all triples related to a specific word from a piece of text. These triples must be output strictly in triple format"
        prompt = f"""
        Domain Context: {domain_text}
        Please extract all triples related to {entity} from the above text. The triples must have {entity} as the head entity and be output strictly in triple format.
        Examples: {examples}
        Output each triple as (head, relation, tail). Be concise and your reply should be all in English.
        """
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "system", "content": system_prompt},
                      {"role": "user", "content": prompt}]
        )
        raw_output = response.choices[0].message.content
        logger.debug(f"ModelOutput: {raw_output}")
        # 解析三元组（示例：正则匹配或简单分割）
        matches = re.findall(self.triplet_pattern, raw_output)
        return [tuple(triple) for triple in matches]
        # return [tuple(triple.strip("()").split(", ")) for triple in raw_output.split("\n")]

In [49]:
class Verifier:
    def __init__(self):
        self.rules = {
            "min_triples": 3,
            "valid_relations": ["subclass_of", "part_of", ...]
        }

    def verify_and_correct(self, triples: List[Tuple], entity: str) -> List[Tuple]:
        # 检查数量
        # if len(triples) < self.rules["min_triples"]:
        #     new_triples = self._reprompt(entity, "Generate at least 3 triples.")
        #     return new_triples
        
        # 检查格式
        valid_triples = []
        for triple in triples:
            if triple[0] != entity:
                continue
            if triple[0] == triple[2]:
                continue
            valid_triples.append(triple)
        
        return valid_triples

    def _reprompt(self, entity: str, error_msg: str) -> List[Tuple]:
        # 根据错误重新生成
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"Correct the error: {error_msg}\nGenerate valid triples for {entity}."
            }]
        )
        raw_output = response.choices[0].message.content
        return [tuple(triple.strip("()").split(", ")) for triple in raw_output.split("\n")]

    def run(self, triples: List[Tuple], entity: str) -> bool:
        valid_triples = self.verify_and_correct(triples=triples, entity=entity)
        logger.debug(f"ValidTriplets: {valid_triples}")
        if valid_triples:
            return True, valid_triples
        return False, None

In [7]:
from abc import ABC, abstractmethod

class Corpus(ABC):
    """
    Corpus 基类，定义处理文本数据的基本接口。
    """
    def __init__(self, data: str, max_seq_len=1000):
        self.max_seq_len = max_seq_len
        # data = [dt[:1000] for dt in data]
        self.data = data  # 存储原始数据
    
    @abstractmethod
    def clean(self):
        """清洗数据"""
        pass
    
    @abstractmethod
    def get_samples(self, num_samples=5):
        """返回指定数量的文本样本"""
        pass
    
    def __len__(self):
        """返回数据的长度"""
        return len(self.data)

In [23]:
class SeeridiaChemistryNote_Corpus(Corpus):
    def clean(self):
        sections = []
        buffer = []

        for line in self.data.split('\n'): 
            if re.match(r'^#{1,5}', line):
                if buffer:
                    sections.append('\n'.join(buffer).strip())
                    buffer = []
            buffer.append(line)
        
        if buffer:
            sections.append('\n'.join(buffer).strip())
        
        self.data = [re.sub(r'<img[^>]*>', '', section).strip() for section in sections]
    def get_samples(self, num_samples=5):
        if num_samples > 0:
            return self.data[:num_samples]
        return self.data

In [24]:
with open("08 硫及其化合物.md", "r") as f:
    data = "\n".join(f.readlines())

corpus = SeeridiaChemistryNote_Corpus(data)
corpus.clean()
print(corpus)

<__main__.SeeridiaChemistryNote_Corpus object at 0x00000238A1861E10>


In [32]:
corpus.get_samples(-1)

['# 元素及其化合物 · 八 · 「硫 $(\\ce{S})$ 及其化合物」\n\n\n\n1. 游离态：硫单质俗称硫黄，主要存在于火山口附近或地壳的岩层中\n\n\n\n2. 化合态：主要以 硫化物 和 硫酸盐 的形式存在\n\n\n\n    |   黄铁矿    |    黄铜矿     |          石膏           |           芒硝            |\n\n    | :---------: | :-----------: | :---------------------: | :-----------------------: |\n\n    | $\\ce{FeS2}$ | $\\ce{CuFeS2}$ | $\\ce{CaSO4 \\cdot 2H2O}$ | $\\ce{Na2SO4 \\cdot 10H2O}$ |\n\n\n\n    > $\\ce{S}$ 的常见化合价：$-2,-1,0,+1,+2,+3,+4,+6,+7,+8$ （无 $+5$ 价）',
 '## 硫单质',
 '### 物理性质\n\n\n\n1. 色态：黄色晶体，质脆，易研成粉末\n\n\n\n2. 溶解性：**难溶于水，微溶于酒精，易溶于二硫化碳 $\\ce{CS2}$** ，易溶于热煤油（化工题常考）\n\n    > 因此二硫化碳可用于洗涤内壁附着硫单质的试管',
 '### 化学性质\n\n\n\n硫单质既表现 **氧化性** ，又表现 **还原性**\n\n\n\n1. 与 $\\ce{H2}$ 反应：$\\ce{H2 +S\\xlongequal{\\Delta}H2S}$\n\n\n\n    > 硫化氢，$\\ce{H2S}$，臭鸡蛋味，有毒\n\n\n\n2. 与 $\\ce{O2}$ 反应：$\\ce{O2 +S\\xlongequal{点燃}SO2}$\n\n\n\n    > 无论氧气是否过量，产物均为二氧化硫（三氧化硫只在特殊的催化条件下生成）。发出明亮的蓝紫色火焰\n\n\n\n3. 与金属反应\n\n\n\n    $\\ce{Fe +S\\xlongequal{\\Delta} \\overset{+2}{Fe}S}\\newline \\ce{2Cu +S\\xlongequal{\\Delta}

In [43]:
chem_generator = Generator(corpus.get_samples(-1))

In [46]:
generated_triplets = chem_generator.generate_triples("SO2")

[32m2025-03-04 19:28:39.725[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m_retrieve_open_kg[0m:[36m19[0m - [34m[1mExampleTriplets: ['(SO2, is_type_of, Gas)', '(SO2, has_chemical_formula, SO_2)', '(SO2, is_used_in, Wine_Production)', '(SO2, causes, Air_Pollution)', '(SO2, causes, Acid_Rain)'][0m
[32m2025-03-04 19:28:49.428[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mgenerate_triples[0m:[36m47[0m - [34m[1mModelOutput: 
1. (SO2, is_type_of, acidic_oxide)
2. (SO2, reacts_with, H2O)
3. (SO2, main_property, Reducing)
4. (SO2, can_be_oxidized_by, O2)
5. (SO2, can_be_oxidized_by, H2O2)
6. (SO2, can_be_oxidized_by, Cl2)
7. (SO2, can_be_oxidized_by, Br2)
8. (SO2, can_be_oxidized_by, I2)
9. (SO2, can_be_oxidized_by, Fe3+)
10. (SO2, can_be_oxidized_by, KMnO4)
11. (SO2, can_be_oxidized_by, HNO3)
12. (SO2, can_be_oxidized_by, ClO-)
13. (SO2, reacts_with, H2S)
14. (SO2, has_property, Bleaching)
15. (SO2, is_used_in, bleaching_paper_pulp)
16. (SO2, is_used_in, disinfecting

In [51]:
print(generated_triplets)

[('SO2', 'is_type_of', 'acidic_oxide'), ('SO2', 'reacts_with', 'H2O'), ('SO2', 'main_property', 'Reducing'), ('SO2', 'can_be_oxidized_by', 'O2'), ('SO2', 'can_be_oxidized_by', 'H2O2'), ('SO2', 'can_be_oxidized_by', 'Cl2'), ('SO2', 'can_be_oxidized_by', 'Br2'), ('SO2', 'can_be_oxidized_by', 'I2'), ('SO2', 'can_be_oxidized_by', 'KMnO4'), ('SO2', 'can_be_oxidized_by', 'HNO3'), ('SO2', 'reacts_with', 'H2S'), ('SO2', 'has_property', 'Bleaching'), ('SO2', 'is_used_in', 'bleaching_paper_pulp'), ('SO2', 'is_used_in', 'disinfecting'), ('SO2', 'is_generated_by', 'Na2SO3_reacting_with_H2SO4')]


In [50]:
verifier = Verifier()
verifier.run(generated_triplets, entity="SO2")

[32m2025-03-04 19:30:06.959[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mrun[0m:[36m39[0m - [34m[1mValidTriplets: [('SO2', 'is_type_of', 'acidic_oxide'), ('SO2', 'reacts_with', 'H2O'), ('SO2', 'main_property', 'Reducing'), ('SO2', 'can_be_oxidized_by', 'O2'), ('SO2', 'can_be_oxidized_by', 'H2O2'), ('SO2', 'can_be_oxidized_by', 'Cl2'), ('SO2', 'can_be_oxidized_by', 'Br2'), ('SO2', 'can_be_oxidized_by', 'I2'), ('SO2', 'can_be_oxidized_by', 'KMnO4'), ('SO2', 'can_be_oxidized_by', 'HNO3'), ('SO2', 'reacts_with', 'H2S'), ('SO2', 'has_property', 'Bleaching'), ('SO2', 'is_used_in', 'bleaching_paper_pulp'), ('SO2', 'is_used_in', 'disinfecting'), ('SO2', 'is_generated_by', 'Na2SO3_reacting_with_H2SO4')][0m


(True,
 [('SO2', 'is_type_of', 'acidic_oxide'),
  ('SO2', 'reacts_with', 'H2O'),
  ('SO2', 'main_property', 'Reducing'),
  ('SO2', 'can_be_oxidized_by', 'O2'),
  ('SO2', 'can_be_oxidized_by', 'H2O2'),
  ('SO2', 'can_be_oxidized_by', 'Cl2'),
  ('SO2', 'can_be_oxidized_by', 'Br2'),
  ('SO2', 'can_be_oxidized_by', 'I2'),
  ('SO2', 'can_be_oxidized_by', 'KMnO4'),
  ('SO2', 'can_be_oxidized_by', 'HNO3'),
  ('SO2', 'reacts_with', 'H2S'),
  ('SO2', 'has_property', 'Bleaching'),
  ('SO2', 'is_used_in', 'bleaching_paper_pulp'),
  ('SO2', 'is_used_in', 'disinfecting'),
  ('SO2', 'is_generated_by', 'Na2SO3_reacting_with_H2SO4')])

In [73]:
from pydantic import BaseModel
from typing import List, Optional, Dict, Union
from enum import Enum

class CompoundType(Enum):
    Elemental = "Elemental"
    Oxide = "Oxide"
    Acid = "Acid"
    Alkali = "Alkali"
    Salt = "Salt"

class CompoundProperty(Enum):
    Oxidation = "Oxidation"
    Reductiveness = "Reductiveness"
    Others = "Others"

class ReactionType(Enum):
    Redox = "Redox"
    Neutralization = "Neutralization"
    Others = "Others"

class EntityType(Enum):
    Element = "Element"
    Compound = "Compound"
    Ion = "Ion"

class StateOfMatter(Enum):
    Solid = "Solid"
    Liquid = "Liquid"
    Gas = "Gas"
    Others = "Others"

class Entity(BaseModel):
    entity_type: EntityType
    entity_name: str

class Entities(BaseModel):
    entities: List[Entity]

class PhysicalProperties(BaseModel):
    color: Union[str, None]
    state_of_matter: Union[StateOfMatter, None]
    molecule_weight: Union[float, None]
    melting_point: Union[float, None]
    boiling_point: Union[float, None]

class Element(BaseModel):
    element_name: str
    element_symbol: str
    atomic_number: Union[int, None]
    atomic_weight: Union[float, None]
    electron_configuration: Union[str, None]
    simple_substance_property: PhysicalProperties

class Compound(BaseModel):
    compound_name: Union[str, None]
    compound_formula: Union[str, None]
    compound_type: CompoundType
    physical_properties: PhysicalProperties
    chem_properties: Union[List[CompoundProperty], None]

class Reaction(BaseModel):
    reaction_type: ReactionType
    equation: str
    conditions: Optional[str]
    reactants: List[str]
    products: List[str]

class Reactions(BaseModel):
    reactions: List[Reaction]

At first we should extract all entities appear in this note.

These following instructions are useless for entities:

These entities must be output strictly in Latex output format

Requires strictly LaTeX output format.

In [76]:
domain_text = corpus.get_samples(-1)

system_prompt = "You are ChatGPT, you will extract all the entities from a piece of text."
prompt = f"""
Domain Context: {domain_text}
Please extract all the entities from the above text. these entities may include elements, compounds, ions, and so on.
Be concise and your reply should be all in English.
"""
response = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[{"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}],
    response_format=Entities
)

raw_output = response.choices[0].message
if raw_output.parsed:
    logger.debug(f"ModelOutput: {raw_output}")
    # 解析三元组（示例：正则匹配或简单分割）
    entities = raw_output.parsed
    entities = json.loads(entities.model_dump_json())
    pprint(entities)

[32m2025-03-10 11:59:26.628[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [34m[1mModelOutput: ParsedChatCompletionMessage[Entities](content='{"entities":[{"entity_name":"S","entity_type":"Element"},{"entity_name":"H2S","entity_type":"Compound"},{"entity_name":"SO2","entity_type":"Compound"},{"entity_name":"SO3","entity_type":"Compound"},{"entity_name":"H2SO4","entity_type":"Compound"},{"entity_name":"CaSO4·2H2O","entity_type":"Compound"},{"entity_name":"Na2SO4·10H2O","entity_type":"Compound"},{"entity_name":"FeS2","entity_type":"Compound"},{"entity_name":"CuFeS2","entity_type":"Compound"},{"entity_name":"NaHSO4","entity_type":"Compound"},{"entity_name":"H2O","entity_type":"Compound"},{"entity_name":"H3PO4","entity_type":"Compound"},{"entity_name":"Na2SO3","entity_type":"Compound"},{"entity_name":"BaSO4","entity_type":"Compound"},{"entity_name":"HCl","entity_type":"Compound"},{"entity_name":"HNO3","entity_type":"Compound"},{"entity_name":"KMnO4","ent

{'entities': [{'entity_name': 'S', 'entity_type': 'Element'},
              {'entity_name': 'H2S', 'entity_type': 'Compound'},
              {'entity_name': 'SO2', 'entity_type': 'Compound'},
              {'entity_name': 'SO3', 'entity_type': 'Compound'},
              {'entity_name': 'H2SO4', 'entity_type': 'Compound'},
              {'entity_name': 'CaSO4·2H2O', 'entity_type': 'Compound'},
              {'entity_name': 'Na2SO4·10H2O', 'entity_type': 'Compound'},
              {'entity_name': 'FeS2', 'entity_type': 'Compound'},
              {'entity_name': 'CuFeS2', 'entity_type': 'Compound'},
              {'entity_name': 'NaHSO4', 'entity_type': 'Compound'},
              {'entity_name': 'H2O', 'entity_type': 'Compound'},
              {'entity_name': 'H3PO4', 'entity_type': 'Compound'},
              {'entity_name': 'Na2SO3', 'entity_type': 'Compound'},
              {'entity_name': 'BaSO4', 'entity_type': 'Compound'},
              {'entity_name': 'HCl', 'entity_type': 'Compound

After that, we can find every property of entities from wikidata and PubChem and so on.

In [75]:
entities = json.loads(entities.model_dump_json())
pprint(entities)

{'entities': [{'entity_name': 'S', 'entity_type': 'Element'},
              {'entity_name': 'FeS2', 'entity_type': 'Compound'},
              {'entity_name': 'CuFeS2', 'entity_type': 'Compound'},
              {'entity_name': 'CaSO4 \\cdot 2H2O', 'entity_type': 'Compound'},
              {'entity_name': 'Na2SO4 \\cdot 10H2O', 'entity_type': 'Compound'},
              {'entity_name': 'CS2', 'entity_type': 'Compound'},
              {'entity_name': 'H2S', 'entity_type': 'Compound'},
              {'entity_name': 'FeS', 'entity_type': 'Compound'},
              {'entity_name': 'Cu2S', 'entity_type': 'Compound'},
              {'entity_name': 'HgS', 'entity_type': 'Compound'},
              {'entity_name': 'H2SO4', 'entity_type': 'Compound'},
              {'entity_name': 'HCl', 'entity_type': 'Compound'},
              {'entity_name': 'HNO3', 'entity_type': 'Compound'},
              {'entity_name': 'NaCl', 'entity_type': 'Compound'},
              {'entity_name': 'NaHSO4', 'entity_type':

And then we care about main entity with some reactions.

In [None]:
entity = "SO2"

examples = ["(SO2, is_type_of, Gas)","(SO2, has_chemical_formula, SO_2)","(SO2, is_used_in, Wine_Production)","(SO2, causes, Air_Pollution)","(SO2, causes, Acid_Rain)"]
triplet_pattern = r'\((\w+),\s*(\w+),\s*(\w+)\)'

domain_text = corpus.get_samples(-1)

system_prompt = "You are ChatGPT, you will extract all information related to a specific word from a piece of text. These information must be output strictly in given format"
prompt = f"""
Domain Context: {domain_text}
Please extract all information related to {entity} from the above text. The information must have strong relevance with {entity} and be output strictly in given format.
Be concise and your reply should be all in English.
"""
response = client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[{"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}],
    response_format=Reactions
)

raw_output = response.choices[0].message
if raw_output.parsed:
    logger.debug(f"ModelOutput: {raw_output}")
    # 解析三元组（示例：正则匹配或简单分割）
    matches = raw_output.parsed
    print([triple for triple in matches])

[32m2025-03-10 14:56:08.471[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [34m[1mModelOutput: ParsedChatCompletionMessage[Reactions](content='{"reactions":[{"conditions":"none","equation":"Na2SO3 + H2SO4(浓) ↔ Na2SO4 + SO2 + H2O","products":["Na2SO4","SO2","H2O"],"reactants":["Na2SO3","H2SO4(浓)"],"reaction_type":"Others"},{"conditions":"高温","equation":"2SO2 + O2 ↔[V2O5][Δ] 2SO3","products":["SO3"],"reactants":["SO2","O2"],"reaction_type":"Redox"},{"conditions":"适当温度并有催化剂存在","equation":"SO2 + 2H2S ↔ 3S + 2H2O","products":["S","H2O"],"reactants":["SO2","H2S"],"reaction_type":"Redox"}]}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None, parsed=Reactions(reactions=[Reaction(reaction_type=<ReactionType.Others: 'Others'>, equation='Na2SO3 + H2SO4(浓) ↔ Na2SO4 + SO2 + H2O', conditions='none', reactants=['Na2SO3', 'H2SO4(浓)'], products=['Na2SO4', 'SO2', 'H2O']), Reaction(reaction_type=<ReactionType.Redox: 'Redox'>, equation='2S

[('reactions', [Reaction(reaction_type=<ReactionType.Others: 'Others'>, equation='Na2SO3 + H2SO4(浓) ↔ Na2SO4 + SO2 + H2O', conditions='none', reactants=['Na2SO3', 'H2SO4(浓)'], products=['Na2SO4', 'SO2', 'H2O']), Reaction(reaction_type=<ReactionType.Redox: 'Redox'>, equation='2SO2 + O2 ↔[V2O5][Δ] 2SO3', conditions='高温', reactants=['SO2', 'O2'], products=['SO3']), Reaction(reaction_type=<ReactionType.Redox: 'Redox'>, equation='SO2 + 2H2S ↔ 3S + 2H2O', conditions='适当温度并有催化剂存在', reactants=['SO2', 'H2S'], products=['S', 'H2O'])])]


In [78]:
import json
from pprint import pprint

result = json.loads(matches.model_dump_json())

pprint(result)

{'reactions': [{'conditions': 'none',
                'equation': 'Na2SO3 + H2SO4(浓) ↔ Na2SO4 + SO2 + H2O',
                'products': ['Na2SO4', 'SO2', 'H2O'],
                'reactants': ['Na2SO3', 'H2SO4(浓)'],
                'reaction_type': 'Others'},
               {'conditions': '高温',
                'equation': '2SO2 + O2 ↔[V2O5][Δ] 2SO3',
                'products': ['SO3'],
                'reactants': ['SO2', 'O2'],
                'reaction_type': 'Redox'},
               {'conditions': '适当温度并有催化剂存在',
                'equation': 'SO2 + 2H2S ↔ 3S + 2H2O',
                'products': ['S', 'H2O'],
                'reactants': ['SO2', 'H2S'],
                'reaction_type': 'Redox'}]}


In [68]:
with open("06-08-SO2-Reactions.json", "w") as f:
    json.dump(result, f, indent=4)