In [4]:
import os
import pandas as pd
import openai

# 设置代理环境（如果需要）
os.environ["http_proxy"] = "127.0.0.1:7890"
os.environ["https_proxy"] = "127.0.0.1:7890"

In [5]:
# config.py
import os
from dotenv import load_dotenv

# 加载 .env 文件
load_dotenv()

# 读取
ZetaTechs_api_key = os.getenv('ZetaTechs_api_key')
ZetaTechs_api_base = os.getenv('ZetaTechs_api_base')

In [6]:
# test connection
from openai import OpenAI

client = OpenAI(api_key=ZetaTechs_api_key, base_url=ZetaTechs_api_base)

completion = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
  ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='Hello! How can I assist you today?', refusal=None, role='assistant', function_call=None, tool_calls=None)


In [6]:
# 读取 CSV 文件并忽略空列，保留“采集时间”和“备注【疑问汇总】”
def load_csv(file_path):
    df = pd.read_csv(file_path)
    # 找到不为空的列名
    non_empty_columns = df.columns[df.notna().any()].tolist()
    # 加入需要保留的列
    required_columns = ["采集来源", "来源链接", "采集时间", "备注【疑问汇总】"]
    final_columns = [col for col in non_empty_columns if col not in required_columns] + required_columns
    # 根据有效列筛选数据，并保留“采集时间”和“备注【疑问汇总】”列
    df = df[final_columns]
    return df

# 自动生成 column_mapping，忽略 "采集时间" 和 "备注【疑问汇总】"
def generate_column_mapping(df):
    columns_to_include = df.columns[:-2]  # 忽略最后两列
    column_mapping = {col: col for col in columns_to_include}
    return column_mapping

# 创建大模型的输入
def create_model_input(df, column_mapping):
    input_data = []
    for _, row in df.iterrows():
        mapped_input = {column_mapping[key]: row[key] for key in column_mapping}
        input_data.append(mapped_input)
    return input_data

In [7]:
# test load_csv() and generate_column_mapping() and create_model_input()
file_path = "25-量子信息 - 潜在应用场景分析报告.csv"

# 第1步：加载 CSV 文件
df = load_csv(file_path)

df_columns = df.columns[:-2]
print(df_columns)

# 第2步：自动生成 column_mapping，忽略 "采集时间" 和 "备注【疑问汇总】"
column_mapping = generate_column_mapping(df)
# print("###", column_mapping, "###")

# 第3步：创建大模型输入
input_data = create_model_input(df, column_mapping)

# print(type(input_data), input_data)

# print(len((input_data[0].keys())), input_data[0].keys())

Index(['量子计算应用的市场规模（字符型）', '应用场景类型（数字型）', '领域（字符型）', '算法（数字型）', '采集来源',
       '来源链接'],
      dtype='object')


In [8]:
import openai
import pandas as pd

def create_system_prompt():
    # 动态生成system prompt, 用通用语言描述表格结构
    return """
    You are an expert in data augmentation. You will be provided with a table structure and sample data. 
    Based on this structure, you need to augment the dataset by generating more entries following the same format. 
    Focus on consistency and keep the original meaning of the columns intact.
    """

def create_user_prompt(input_data):
    # 构建user prompt，将所有input_data中的信息加入提示
    prompt = "Here is the structure of the dataset with sample data:\n\n"
    for row_idx, row in enumerate(input_data):
        prompt += f"Row {row_idx + 1}:\n"
        for col_name, value in row.items():
            prompt += f"  Column: {col_name}, Value: {value}\n"
        prompt += "\n"  # 每行数据后增加换行，区分不同行
    prompt += "Please generate more data in the same structure and format."
    return prompt

# 创建system_prompt 和 user_prompt
system_prompt = create_system_prompt()
user_prompt = create_user_prompt(input_data)

print(system_prompt)
print("################################")
print(user_prompt)


    You are an expert in data augmentation. You will be provided with a table structure and sample data. 
    Based on this structure, you need to augment the dataset by generating more entries following the same format. 
    Focus on consistency and keep the original meaning of the columns intact.
    
################################
Here is the structure of the dataset with sample data:

Row 1:
  Column: 量子计算应用的市场规模（字符型）, Value: 超过12.1亿美元
  Column: 应用场景类型（数字型）, Value: 机器学习
  Column: 领域（字符型）, Value: 金融
  Column: 算法（数字型）, Value: 1
  Column: 采集来源, Value: Research Nester
  Column: 来源链接, Value: https://www.researchnester.com/cn/reports/quantum-computing-market/4910

Row 2:
  Column: 量子计算应用的市场规模（字符型）, Value: 80.47亿元（人民币）
  Column: 应用场景类型（数字型）, Value: 优化
  Column: 领域（字符型）, Value: 医疗健康
  Column: 算法（数字型）, Value: 2
  Column: 采集来源, Value: 格隆汇
  Column: 来源链接, Value: https://m.gelonghui.com/p/932681

Row 3:
  Column: 量子计算应用的市场规模（字符型）, Value: 8.854亿美元
  Column: 应用场景类型（数字型）, Value: 生物医学模拟
  Colum

### 使用 gpt-4o-mini 模型生成数据

In [9]:
from openai import OpenAI

client = OpenAI(api_key=ZetaTechs_api_key, base_url=ZetaTechs_api_base)

messages_to_model=[
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt}
]

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages_to_model,
)

generated_data = completion.choices[0].message.content
print(generated_data)

Here are additional rows of augmented data following the same structure and format:

Row 7:
  Column: 量子计算应用的市场规模（字符型）, Value: 25亿美元
  Column: 应用场景类型（数字型）, Value: 量子计算加速
  Column: 领域（字符型）, Value: 交通运输
  Column: 算法（数字型）, Value: 7
  Column: 采集来源, Value: Global Industry Analysts
  Column: 来源链接, Value: https://www.globalindustryanalysts.com/quantum-computing-market-report

Row 8:
  Column: 量子计算应用的市场规模（字符型）, Value: 超过50亿美元
  Column: 应用场景类型（数字型）, Value: 智能合约
  Column: 领域（字符型）, Value: 法律
  Column: 算法（数字型）, Value: 8
  Column: 采集来源, Value: Quantum Insights 
  Column: 来源链接, Value: https://www.quantuminsights.com/reports/quantum-in-legal

Row 9:
  Column: 量子计算应用的市场规模（字符型）, Value: 10.3亿美元
  Column: 应用场景类型（数字型）, Value: 量子模拟
  Column: 领域（字符型）, Value: 材料科学
  Column: 算法（数字型）, Value: 9
  Column: 采集来源, Value: Research and Markets
  Column: 来源链接, Value: https://www.researchandmarkets.com/reports/quantum-simulation-market

Row 10:
  Column: 量子计算应用的市场规模（字符型）, Value: 12.8亿美元
  Column: 应用场景类型（数字型）, Value: 密码

### 使用 gpt-4o-mini-2024-07-18 提取数据（结构化输出）

In [14]:
# test1

In [18]:
from pydantic import BaseModel, create_model
from openai import OpenAI

client = OpenAI(api_key=ZetaTechs_api_key, base_url=ZetaTechs_api_base)

class Extraction(BaseModel):
    column1: list[str]
    column2: list[str]
    column3: list[str]
    column4: list[str]
    column5: list[str]
    column6: list[str]

# 动态生成Extraction类 #################################################
# def generate_extraction_model(columns):
#     fields = {f'column{i+1}': (list[str], ...) for i in range(len(columns))}
#     return create_model('Extraction', **fields)
# # 假设 df 是数据框的列名
# columns = input_data.columns[:-2]  # 忽略最后两列
# Extraction = generate_extraction_model(columns)
#######################################################################

completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini-2024-07-18",
    messages=[
        {"role": "system", "content": "You are an expert at structured data extraction. You will be given unstructured text from a research paper and should convert it into the given structure."},
        {"role": "user", "content": generated_data}
    ],
    response_format=Extraction,
)

research_paper = completion.choices[0].message.parsed

### 动态Extraction类

In [None]:
# # 动态Extraction类
# import inspect
# from pydantic import BaseModel, create_model

# input_data = pd.read_csv("25-量子信息 - 潜在应用场景分析报告.csv")
# print(len(input_data.columns))

# # 动态生成Extraction类
# def generate_extraction_model(columns):
#     fields = {f'column{i+1}': (list[str], ...) for i in range(len(columns))}
#     return create_model('Extraction', **fields)

# # 假设 df 是数据框的列名
# columns = input_data.columns[:-2]  # 忽略最后两列
# Extraction = generate_extraction_model(columns)

# # 查看 Extraction 类的内部结构
# print("Extraction 类的结构:")
# for field_name, field_type in Extraction.__annotations__.items():
#     print(f"{field_name}: {field_type}")

In [None]:
# test
print(type(research_paper), "\n\n", research_paper, "\n\n", research_paper.column1)

### 合并生成数据与现有数据

In [19]:
import pandas as pd

# 假设 input_data 是原始 CSV 文件读取的数据框
# 加载已有数据的CSV文件
input_data = pd.read_csv('25-量子信息 - 潜在应用场景分析报告.csv')

# 找到不为空的列名
non_empty_columns = input_data.columns[input_data.notna().any()].tolist()
# 加入需要保留的列
required_columns = ["采集来源", "来源链接", "采集时间", "备注【疑问汇总】"]
final_columns = [col for col in non_empty_columns if col not in required_columns] + required_columns
# 根据有效列筛选数据，并保留“采集时间”和“备注【疑问汇总】”列
input_data = input_data[final_columns]

print(non_empty_columns)
input_data.head(5)

['量子计算应用的市场规模（字符型）', '应用场景类型（数字型）', '领域（字符型）', '算法（数字型）', '采集来源', '来源链接', '采集时间']


Unnamed: 0,量子计算应用的市场规模（字符型）,应用场景类型（数字型）,领域（字符型）,算法（数字型）,采集来源,来源链接,采集时间,备注【疑问汇总】
0,超过12.1亿美元,机器学习,金融,1,Research Nester,https://www.researchnester.com/cn/reports/quan...,2024-10-12,
1,80.47亿元（人民币）,优化,医疗健康,2,格隆汇,https://m.gelonghui.com/p/932681,2024-10-12,
2,8.854亿美元,生物医学模拟,化工,3,Fortune Business Insights,https://www.fortunebusinessinsights.com/zh/qua...,2024-10-12,
3,47亿美元,数据分析,能源与公用事业,4,ICV,https://pdf.dfcfw.com/pdf/H3_AP202405201633905...,2024-10-12,
4,16.5亿美元,电子材料发现,制造业,5,行业研究报告,https://example.com/research_report,2024-10-12,


In [20]:
# 确定生成的列数与input_data的一致性
num_columns = len(input_data.columns) - 2  # 原始数据的列数 - 2（采集时间 和 备注【疑问汇总】）

# 遍历生成的数据，逐行添加到原始数据框
for i in range(len(research_paper.column1)):  # 这里我们使用column1的长度为循环范围
    # 生成的数据
    new_row = [
        research_paper.column1[i],  # 第1列的数据
        research_paper.column2[i],  # 第2列的数据
        research_paper.column3[i],  # 第3列的数据
        research_paper.column4[i],  # 第4列的数据
        research_paper.column5[i],  # 第5列的数据
        research_paper.column6[i],  # 第6列的数据
    ]
    print(new_row)
    # 用None填充“采集时间”和“备注【疑问汇总】”
    while len(new_row) < num_columns+2:
        new_row.append(None)
    print(len(new_row))
    # 将新行添加到原数据框中
    input_data.loc[len(input_data)] = new_row

# 保存更新后的数据框
input_data.to_csv('更新后的数据.csv', index=False)

['23.6亿美元', '量子通信', '航空航天', '7', 'Research and Markets', 'https://www.researchandmarkets.com/reports/quantum-communications-market']
8
['50亿元（人民币）', '财务建模', '企业管理', '8', '产业研究网', 'https://www.chinaindustryresearch.com/finance-modeling']
8
['超过30亿美元', '量子密码学', '信息安全', '9', 'MarketsandMarkets', 'https://www.marketsandmarkets.com/quantum-cryptography-market']
8
['95亿美元', '量子模拟', '材料科学', '10', 'Grand View Research', 'https://www.grandviewresearch.com/industry-analysis/quantum-simulation-market']
8
['12.3亿美元', '风险分析', '保险', '11', 'Research Nester', 'https://www.researchnester.com/reports/risk-analysis-quantum-computing']
8
['89.5亿元（人民币）', '供应链优化', '物流', '12', 'Qianzhan Industry Research Institute', 'https://www.qianzhan.com/research/logistics-supply-chain']
8


In [22]:
research_paper.column1[0]

'23.6亿美元'

In [27]:
type(research_paper.column1), research_paper.column1

(list, ['23.6亿美元', '50亿元（人民币）', '超过30亿美元', '95亿美元', '12.3亿美元', '89.5亿元（人民币）'])

In [26]:
type(research_paper), research_paper

(__main__.Extraction,
 Extraction(column1=['23.6亿美元', '50亿元（人民币）', '超过30亿美元', '95亿美元', '12.3亿美元', '89.5亿元（人民币）'], column2=['量子通信', '财务建模', '量子密码学', '量子模拟', '风险分析', '供应链优化'], column3=['航空航天', '企业管理', '信息安全', '材料科学', '保险', '物流'], column4=['7', '8', '9', '10', '11', '12'], column5=['Research and Markets', '产业研究网', 'MarketsandMarkets', 'Grand View Research', 'Research Nester', 'Qianzhan Industry Research Institute'], column6=['https://www.researchandmarkets.com/reports/quantum-communications-market', 'https://www.chinaindustryresearch.com/finance-modeling', 'https://www.marketsandmarkets.com/quantum-cryptography-market', 'https://www.grandviewresearch.com/industry-analysis/quantum-simulation-market', 'https://www.researchnester.com/reports/risk-analysis-quantum-computing', 'https://www.qianzhan.com/research/logistics-supply-chain']))

In [34]:
for col in research_paper:
    print(col)
    print(type(col))
    print(col[1])
    print(type(col[1]))


('column1', ['23.6亿美元', '50亿元（人民币）', '超过30亿美元', '95亿美元', '12.3亿美元', '89.5亿元（人民币）'])
<class 'tuple'>
['23.6亿美元', '50亿元（人民币）', '超过30亿美元', '95亿美元', '12.3亿美元', '89.5亿元（人民币）']
<class 'list'>
('column2', ['量子通信', '财务建模', '量子密码学', '量子模拟', '风险分析', '供应链优化'])
<class 'tuple'>
['量子通信', '财务建模', '量子密码学', '量子模拟', '风险分析', '供应链优化']
<class 'list'>
('column3', ['航空航天', '企业管理', '信息安全', '材料科学', '保险', '物流'])
<class 'tuple'>
['航空航天', '企业管理', '信息安全', '材料科学', '保险', '物流']
<class 'list'>
('column4', ['7', '8', '9', '10', '11', '12'])
<class 'tuple'>
['7', '8', '9', '10', '11', '12']
<class 'list'>
('column5', ['Research and Markets', '产业研究网', 'MarketsandMarkets', 'Grand View Research', 'Research Nester', 'Qianzhan Industry Research Institute'])
<class 'tuple'>
['Research and Markets', '产业研究网', 'MarketsandMarkets', 'Grand View Research', 'Research Nester', 'Qianzhan Industry Research Institute']
<class 'list'>
('column6', ['https://www.researchandmarkets.com/reports/quantum-communications-market', 'https://www.ch

In [36]:
def extracted_data_to_df(extracted_generated_data):
    # 初始化一个空字典，用于存储列名和对应的列数据
    data_dict = {}
    
    # 遍历 extracted_generated_data，每个元素是一个 (列名, 列数据) 的元组
    for col_name, col_data in extracted_generated_data:
        # 将列名和对应的列数据添加到字典中
        data_dict[col_name] = col_data
    
    # 将字典转化为 DataFrame
    df = pd.DataFrame(data_dict)

    return df


In [37]:
df = extracted_data_to_df(research_paper)
df.head()

Unnamed: 0,column1,column2,column3,column4,column5,column6
0,23.6亿美元,量子通信,航空航天,7,Research and Markets,https://www.researchandmarkets.com/reports/qua...
1,50亿元（人民币）,财务建模,企业管理,8,产业研究网,https://www.chinaindustryresearch.com/finance-...
2,超过30亿美元,量子密码学,信息安全,9,MarketsandMarkets,https://www.marketsandmarkets.com/quantum-cryp...
3,95亿美元,量子模拟,材料科学,10,Grand View Research,https://www.grandviewresearch.com/industry-ana...
4,12.3亿美元,风险分析,保险,11,Research Nester,https://www.researchnester.com/reports/risk-an...


In [38]:
df_columns = df.columns[:-2]
df_columns

Index(['column1', 'column2', 'column3', 'column4'], dtype='object')