In [1]:
import os
import pandas as pd
from pydantic import BaseModel, create_model
from openai import OpenAI

# Set up the proxy environment (if needed)
# NOTE(review): these assignments run unconditionally and replace any proxy
# values already present in the environment; remove them when no local proxy is used.
os.environ["http_proxy"] = "127.0.0.1:7890"
os.environ["https_proxy"] = "127.0.0.1:7890"

In [2]:
# config.py
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

# Read the API key and base URL; os.getenv returns None when a variable is not set
ZetaTechs_api_key = os.getenv('ZetaTechs_api_key')
ZetaTechs_api_base = os.getenv('ZetaTechs_api_base')

# Module-level client used by the generation and extraction functions
client = OpenAI(api_key=ZetaTechs_api_key, base_url=ZetaTechs_api_base)

In [3]:
# Read a CSV file, drop entirely-empty columns, and keep the four metadata columns
def load_csv(file_path):
    """Load a CSV and return it with the metadata columns moved to the end.

    Columns that contain no values at all are dropped, except for the four
    metadata columns, which are always kept and always placed last in the
    order: "采集来源", "来源链接", "采集时间", "备注【疑问汇总】".

    Args:
        file_path: Path (or file-like object) passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame whose leading columns are the non-empty data columns (in
        their original order) followed by the four metadata columns.

    Raises:
        KeyError: If any of the four metadata columns is missing from the file.
    """
    required_columns = ["采集来源", "来源链接", "采集时间", "备注【疑问汇总】"]
    table = pd.read_csv(file_path)

    # Boolean mask: True for every column holding at least one non-null value
    has_value = table.notna().any()
    leading = [name for name in table.columns[has_value] if name not in required_columns]

    return table[leading + required_columns]

# Build column_mapping automatically, ignoring "采集时间" and "备注【疑问汇总】"
def generate_column_mapping(df):
    """Return an identity mapping for every column except the last two.

    Args:
        df: DataFrame whose final two columns are to be left out of the mapping.

    Returns:
        A dict mapping each retained column name to itself.
    """
    kept = df.columns[:-2]  # the last two columns are skipped
    return dict(zip(kept, kept))

# Build the input records for the language model
def create_model_input(df, column_mapping):
    """Turn each DataFrame row into a dict keyed by the mapped column names.

    Args:
        df: Source DataFrame.
        column_mapping: Dict of ``source column name -> output key``; only the
            columns listed here are copied.

    Returns:
        A list with one dict per row, in row order.
    """
    return [
        {target: row[source] for source, target in column_mapping.items()}
        for _, row in df.iterrows()
    ]

def create_system_prompt(num_new_entries):
    """Build the system prompt that asks for a fixed number of new entries.

    Args:
        num_new_entries: Number of entries to request; interpolated into the text.

    Returns:
        The prompt string. It starts with a newline and keeps the four-space
        indentation of the literal below.
    """
    system_prompt = f"""
    You are an expert in data augmentation. You will be provided with a table structure and sample data. 
    Your task is to augment the dataset by generating exactly {num_new_entries} new entries while maintaining consistency with the original format and meaning of the columns.
    Focus on creating diverse and realistic entries, ensuring the augmented data aligns closely with the structure and content of the original dataset.
    """
    return system_prompt

def create_user_prompt(input_data):
    """Render every sample row into the user prompt.

    Args:
        input_data: List of dicts (column name -> value), one per row.

    Returns:
        A prompt with a header line, one "Row N:" section per record listing
        each column and value, and a closing instruction.
    """
    parts = ["Here is the structure of the dataset with sample data:\n\n"]
    for row_number, record in enumerate(input_data, start=1):
        parts.append(f"Row {row_number}:\n")
        parts.extend(
            f"  Column: {col_name}, Value: {value}\n"
            for col_name, value in record.items()
        )
        parts.append("\n")  # blank line separating consecutive rows
    parts.append("Please generate more data in the same structure and format.")
    return "".join(parts)

In [4]:
def generate_data(system_prompt, user_prompt):
    """Send the two prompts to the chat model and return the reply text.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        timeout=180,  # 180 seconds, so that long generations have time to finish
    )
    return response.choices[0].message.content

In [6]:
# test load_csv() and generate_column_mapping() and create_model_input()
file_path = "原始数据集/8-现代时尚 - 健身人数信息.csv"

# Step 1: load the CSV file
df = load_csv(file_path)

df_columns = df.columns[:-2]
# print(df_columns)

# Step 2: build column_mapping automatically, ignoring "采集时间" and "备注【疑问汇总】"
column_mapping = generate_column_mapping(df)
# print("###", column_mapping, "###")

# Step 3: build the model input
input_data = create_model_input(df, column_mapping)

# print(type(input_data), input_data)
# print(len((input_data[0].keys())), input_data[0].keys())

# test generate_data()
system_prompt = create_system_prompt(5)
print(system_prompt)
user_prompt = create_user_prompt(input_data)
print(user_prompt)
generated_data = generate_data(system_prompt, user_prompt)


    You are an expert in data augmentation. You will be provided with a table structure and sample data. 
    Your task is to augment the dataset by generating exactly 5 new entries while maintaining consistency with the original format and meaning of the columns.
    Focus on creating diverse and realistic entries, ensuring the augmented data aligns closely with the structure and content of the original dataset.
    
Here is the structure of the dataset with sample data:

Row 1:
  Column: 年月, Value: 2023-12-01
  Column: 地区, Value: 中国
  Column: 健身房人数, Value: 6975.0
  Column: 单位, Value: 万人
  Column: 健身工作室数量, Value: 42177.0
  Column: 单位.1, Value: 个
  Column: 健身俱乐部数量, Value: 36447.0
  Column: 单位.2, Value: 个
  Column: 健身渗透率, Value: 0.0502
  Column: 经常参加体育锻炼的人数, Value: 21000.0
  Column: 单位.3, Value: 万人
  Column: 人均体育场馆面积, Value: 1.5
  Column: 单位.4, Value: 平方米
  Column: 健身休闲产业规模, Value: 675.0
  Column: 单位.5, Value: 亿元
  Column: 马拉松参加人数, Value: 50
  Column: 单位.6, Value: 万人
  Column: 采集来源, Va

In [7]:
# Show the raw text returned by generate_data()
print(generated_data)

Here are 5 new entries augmented in the same structure and format as the provided dataset:

Row 36:
  Column: 年月, Value: 2023-12-01
  Column: 地区, Value: 山东
  Column: 健身房人数, Value: 210.0
  Column: 单位, Value: 万人
  Column: 健身工作室数量, Value: 1100.0
  Column: 单位.1, Value: 个
  Column: 健身俱乐部数量, Value: 1000.0
  Column: 单位.2, Value: 个
  Column: 健身渗透率, Value: 0.0175
  Column: 经常参加体育锻炼的人数, Value: 680.22
  Column: 单位.3, Value: 万人
  Column: 人均体育场馆面积, Value: 1.55
  Column: 单位.4, Value: 平方米
  Column: 健身休闲产业规模, Value: 17.40
  Column: 单位.5, Value: 亿元
  Column: 马拉松参加人数, Value: 1.75
  Column: 单位.6, Value: 万人
  Column: 采集来源, Value: 国家体育总局
  Column: 来源链接, Value: https://example.com/news2023_shandong

Row 37:
  Column: 年月, Value: 2023-12-01
  Column: 地区, Value: 天津
  Column: 健身房人数, Value: 130.0
  Column: 单位, Value: 万人
  Column: 健身工作室数量, Value: 600.0
  Column: 单位.1, Value: 个
  Column: 健身俱乐部数量, Value: 500.0
  Column: 单位.2, Value: 个
  Column: 健身渗透率, Value: 0.0115
  Column: 经常参加体育锻炼的人数, Value: 400.30
  Column: 单位.

In [8]:
def generate_extraction_model(num_columns_to_augment):
    """Create a pydantic model with one required ``list[str]`` field per column.

    Args:
        num_columns_to_augment: Number of fields to declare.

    Returns:
        A model class named ``Extraction`` with fields ``column1`` ..
        ``column<num_columns_to_augment>``.
    """
    field_specs = {}
    for position in range(1, num_columns_to_augment + 1):
        # (type, ...) marks the field as required
        field_specs[f'column{position}'] = (list[str], ...)
    return create_model('Extraction', **field_specs)

def extract_generated_data(generated_data, num_columns_to_augment):
    """Ask the model to re-express generated text as column-wise lists.

    Args:
        generated_data: Free-form text produced by the generation step.
        num_columns_to_augment: Number of ``columnN`` fields in the response schema.

    Returns:
        The ``parsed`` attribute of the first choice's message, as produced by
        the client for the ``Extraction`` response format.
    """
    schema = generate_extraction_model(num_columns_to_augment)

    system_message = {
        "role": "system",
        "content": "You are an expert at structured data extraction. Extract the data into the exact column structure provided.",
    }
    user_message = {"role": "user", "content": generated_data}

    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",  # alternative noted by the author: gpt-4o-mini-2024-07-18
        messages=[system_message, user_message],
        response_format=schema,
    )
    return response.choices[0].message.parsed

# # claude修改后的代码
# def extract_generated_data(generated_data, columns):
#     Extraction = generate_extraction_model(columns)

#     completion = client.beta.chat.completions.parse(
#         model="gpt-4o-2024-08-06",
#         messages=[
#             {"role": "system", "content": "You are an expert at structured data extraction. Extract the data into the exact column structure provided."},
#             {"role": "user", "content": f"Extract the following data into these columns: {', '.join(columns)}\n\n{generated_data}"}
#         ],
#         response_format=Extraction,
#     )
    
#     extracted_generated_data = completion.choices[0].message.parsed   
#     return extracted_generated_data

In [10]:
num_columns_to_augment = len((input_data[0].keys()))
print(num_columns_to_augment, "\n")

# 第6步：结构化输出提取数据
extracted_generated_data = extract_generated_data(generated_data, num_columns_to_augment)

19 



In [12]:
print(type(extracted_generated_data), extracted_generated_data)

<class '__main__.Extraction'> column1=['2023-12-01', '2023-12-01', '2023-12-01', '2023-12-01', '2023-12-01'] column2=['山东', '天津', '陕西', '四川', '吉林'] column3=['210.0', '130.0', '89.0', '112.0', '76.0'] column4=['万人', '万人', '万人', '万人', '万人'] column5=['1100.0', '600.0', '500.0', '750.0', '450.0'] column6=['个', '个', '个', '个', '个'] column7=['1000.0', '500.0', '400.0', '620.0', '350.0'] column8=['个', '个', '个', '个', '个'] column9=['0.0175', '0.0115', '0.0095', '0.0128', '0.0059'] column10=['680.22', '400.30', '280.75', '320.50', '220.80'] column11=['万人', '万人', '万人', '万人', '万人'] column12=['1.55', '1.60', '1.90', '1.44', '1.69'] column13=['平方米', '平方米', '平方米', '平方米', '平方米'] column14=['17.40', '12.00', '8.00', '10.40', '7.05'] column15=['亿元', '亿元', '亿元', '亿元', '亿元'] column16=['1.75', '0.95', '0.65', '0.78', '0.55'] column17=['万人', '万人', '万人', '万人', '万人'] column18=['国家体育总局', '天津市体育局', '陕西省体育局', '四川省体育局', '吉林省体育局'] column19=['https://example.com/news2023_shandong', 'https://example.com/news2023_tianj

### test

In [6]:
def convert_extracted_generated_data_to_df(extracted_generated_data):
    """Convert column-wise extraction output into a DataFrame.

    Args:
        extracted_generated_data: An iterable yielding ``(column name, column
            values)`` pairs.

    Returns:
        A DataFrame with one column per pair.

    Raises:
        ValueError: Raised by pandas ("All arrays must be of the same length")
            when the value lists differ in length.
    """
    columns = {name: values for name, values in extracted_generated_data}
    return pd.DataFrame(columns)

# Append the generated rows to the original data
def merge_data(original_df, extracted_generated_data_df, num_columns_to_augment):
    """Stack generated rows underneath the original rows.

    The generated frame arrives with placeholder column names (``column1``,
    ``column2``, ...). They are renamed positionally to the original column
    names, the two trailing metadata columns are added as empty, and the result
    is concatenated row-wise below ``original_df``.

    The caller's ``extracted_generated_data_df`` is not modified: the renaming
    and the added columns are applied to a copy. The returned frame gets a fresh
    0..n-1 index so that repeated merges do not accumulate duplicate labels.

    Args:
        original_df: Existing data; its last two columns are expected to be
            "采集时间" and "备注【疑问汇总】".
        extracted_generated_data_df: Generated rows, one column per augmented column.
        num_columns_to_augment: Number of leading original columns that the
            generated columns correspond to.

    Returns:
        A new DataFrame containing the original rows followed by the generated rows.

    Raises:
        ValueError: If the generated frame's column count does not match the
            number of original column names selected.
    """
    # Original column names without the trailing "采集时间" and "备注【疑问汇总】"
    original_columns = original_df.columns[:-2]

    # Work on a copy so the caller's DataFrame keeps its own column names
    generated = extracted_generated_data_df.copy()

    # Map column1, column2, ... onto the original column names by position
    generated.columns = original_columns[:num_columns_to_augment]

    # Generated rows have no collection time or remarks; leave both empty
    generated["采集时间"] = None
    generated["备注【疑问汇总】"] = None

    # Row-wise concatenation; renumber the index instead of repeating 0..k per batch
    return pd.concat([original_df, generated], axis=0, ignore_index=True)

In [7]:
# Entry point
def main(file_path, save_path, target_rows=100, num_new_entries=20, max_attempts=20):
    """Augment a CSV with model-generated rows until it reaches ``target_rows``.

    Each round asks the model for ``num_new_entries`` rows based on the original
    samples, extracts them into columns, merges them into the running table and
    writes the complete table to ``save_path``.

    The file is rewritten in full after every successful round. Appending the
    cumulative table each round would store the original rows and every earlier
    batch repeatedly, so any existing file at ``save_path`` is replaced.

    Args:
        file_path: Path of the source CSV.
        save_path: Path of the augmented CSV to write.
        target_rows: Stop once the merged table has at least this many rows.
        num_new_entries: Number of rows requested from the model per round.
        max_attempts: Upper bound on generation rounds (successful or failed),
            so that a persistent API or parsing failure cannot loop forever.

    Raises:
        ValueError: If the source CSV contains no data rows to use as samples.
    """
    # Step 1: load the CSV file
    df = load_csv(file_path)
    if df.empty:
        # Without at least one row there is nothing to show the model
        raise ValueError(f"{file_path} 中没有可用的样本数据，无法生成扩展数据。")

    # Step 2: build column_mapping automatically, ignoring "采集时间" and "备注【疑问汇总】"
    column_mapping = generate_column_mapping(df)

    # Step 3: build the model input
    input_data = create_model_input(df, column_mapping)

    # Number of columns to augment (excludes "采集时间" and "备注【疑问汇总】")
    num_columns_to_augment = len(input_data[0].keys())

    # Step 4: build the system and user prompts
    system_prompt = create_system_prompt(num_new_entries)
    user_prompt = create_user_prompt(input_data)

    merged_df = df.copy()  # start from the original data
    total_rows = len(merged_df)
    attempts = 0

    while total_rows < target_rows and attempts < max_attempts:
        attempts += 1
        try:
            # Step 5: call the OpenAI API to generate data
            generated_data = generate_data(system_prompt, user_prompt)

            # Step 6: extract the generated text into structured columns
            extracted_generated_data = extract_generated_data(generated_data, num_columns_to_augment)

            # Step 7: convert the extracted data into a DataFrame
            extracted_generated_data_df = convert_extracted_generated_data_to_df(extracted_generated_data)

            # Step 8: merge the generated rows into the running table
            merged_df = merge_data(merged_df, extracted_generated_data_df, num_columns_to_augment)

            total_rows = len(merged_df)  # update the row count

            # Save after every round; overwrite because merged_df is already cumulative
            merged_df.to_csv(save_path, index=False)
            print(f"扩展后的数据已保存到 {save_path}。当前总行数为 {total_rows}，继续生成数据...")

        except ValueError as e:
            # e.g. the extracted columns have different lengths or the wrong count
            print(f"生成数据时发生错误：{str(e)}。跳过本次生成，继续下一次。")
        except Exception as e:
            # Best-effort: any other failure skips this round and tries again
            print(f"发生未预期的错误：{str(e)}。跳过本次生成，继续下一次。")

    if total_rows < target_rows:
        print(f"已达到最大尝试次数 {max_attempts}，当前总行数为 {total_rows}，未达到目标行数 {target_rows}。")

In [None]:
# Example invocation
if __name__ == "__main__":
    # Path of the input CSV file
    # NOTE(review): the test cell earlier in this notebook reads the same file
    # from the "原始数据集/" directory; confirm which location is correct.
    file_path = "8-现代时尚 - 健身人数信息.csv"
    
    # Path the result is saved to
    # NOTE(review): "_expadnded" looks like a typo for "_expanded"; left as-is
    # because it determines the output file name.
    save_path = "8-现代时尚 - 健身人数信息_expadnded.csv"

    # Run the main pipeline
    main(file_path, save_path)