# 构造故事数据（注意：以下代码运行较慢，且会耗费较多 Token，可以不运行）

In [None]:
import os,openai,backoff
import pandas as pd
from openai import OpenAI

# Shared API client; the key is read from the OPENAI_API_KEY environment
# variable (a KeyError is raised here if it is not set).
client = OpenAI(api_key = os.environ["OPENAI_API_KEY"])
# Story parameters. prepare_stories builds one prompt per
# (dynasty, super power, story type) combination from these lists.
dynasties= ['唐', '宋', '元', '明', '清', '汉', '魏', '晋', '南北朝']
super_powers = ['隐形', '飞行', '读心术', '瞬间移动', '不死之身', '喷火']
story_types = ['轻松', '努力', '艰难']

@backoff.on_exception(backoff.expo, openai.RateLimitError)
def gpt35(prompt, max_tokens=2048, temperature=0.5, top_p=1, frequency_penalty=0, presence_penalty=0):
    """Return the text of the first completion for ``prompt``.

    The request goes to the ``gpt-3.5-turbo-instruct`` model through the
    module-level ``client``; all sampling arguments are forwarded unchanged.
    Calls that raise ``openai.RateLimitError`` are retried with exponential
    backoff by the decorator.
    """
    request_args = dict(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
    )
    completion = client.completions.create(**request_args)
    first_choice = completion.choices[0]
    return first_choice.text

def prepare_stories(dynasties, super_powers, story_types, repeat=3, output_file="data/ultraman_stories.csv"):
    """Generate stories for every parameter combination and save them as CSV.

    For each (dynasty, super_power, story_type) combination, ``repeat``
    stories are requested from ``gpt35``. Every story becomes one row with
    the columns ``dynasty``, ``super_power``, ``story_type`` and ``story``.

    Args:
        dynasties: Iterable of dynasty names inserted into the prompt.
        super_powers: Iterable of super powers inserted into the prompt.
        story_types: Iterable of battle styles inserted into the prompt.
        repeat: Number of stories requested per combination.
        output_file: Path of the CSV file to write (index column included).
    """
    # Create the target directory before any request is made, so a missing
    # folder cannot discard the generated stories at the very end.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    columns = ["dynasty", "super_power", "story_type", "story"]
    # Collect plain dicts and build the frame once; concatenating a frame per
    # row copies all previous rows on every iteration.
    rows = []
    for dynasty in dynasties:
        for super_power in super_powers:
            for story_type in story_types:
                for _ in range(repeat):
                    prompt = f"""请你用中文写一段300字的故事，情节跌宕起伏，讲述一位{dynasty}朝时期的英雄人物，穿越到现代，拥有了{super_power}这样的超能力，通过{story_type}的战斗，帮助奥特曼一起打败了怪兽的故事。"""
                    story = gpt35(prompt)
                    rows.append({"dynasty": dynasty, "super_power": super_power, "story_type": story_type, "story": story})

    # Explicit columns keep the header present even when no rows were produced.
    df = pd.DataFrame(rows, columns=columns)
    # Honor the caller's path instead of always writing the default file.
    df.to_csv(output_file)

# Build the base dataset using the default repeat count and output path.
prepare_stories(dynasties, super_powers, story_types)


# 读取CSV，构造微调数据

In [2]:
import subprocess

import pandas as pd

# Load the generated stories and derive the fine-tuning prompt by joining the
# three story parameters with commas.
df = pd.read_csv("data/ultraman_stories.csv")
df['sub_prompt'] = df['dynasty'] + "," + df['super_power'] + "," + df['story_type']

# Keep only the two columns the fine-tuning tool expects, under its names.
prepared_data = df[['sub_prompt', 'story']].rename(
    columns={'sub_prompt': 'prompt', 'story': 'completion'}
)
prepared_data.to_csv('data/prepared_data.csv', index=False)

# Run the OpenAI CLI data-preparation tool on the CSV that was just written.
subprocess.run([
    'openai', 'tools', 'fine_tunes.prepare_data',
    '--file', 'data/prepared_data.csv',
    '--quiet',
])


Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 464 prompt-completion pairs
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details

CompletedProcess(args=['openai', 'tools', 'fine_tunes.prepare_data', '--file', 'data/prepared_data.csv', '--quiet'], returncode=0)

# 模型微调

In [None]:
from openai import OpenAI

client = OpenAI()

# Upload the prepared JSONL training file. The context manager closes the
# file handle once the upload call has returned.
with open("data/prepared_data_prepared.jsonl", "rb") as training_data:
    file = client.files.create(
        file=training_data,
        purpose="fine-tune"
    )

# Start a fine-tuning job on babbage-002 using the uploaded file.
client.fine_tuning.jobs.create(
  training_file=file.id,
  model="babbage-002",
  suffix="ultraman"
)


In [6]:
# List fine-tuning jobs through the API client (e.g. to check job status).
client.fine_tuning.jobs.list()

SyncCursorPage[FineTuningJob](data=[], object='list', has_more=False)

In [None]:
import os
import openai

def write_a_story(prompt):
    """Generate a story for ``prompt`` and return the first choice's text.

    The ``model`` value is a placeholder and must be replaced with the name
    of the fine-tuned model before this is run. Generation is limited to
    2000 tokens and uses "." as the stop sequence.
    """
    completion = client.completions.create(
        model="[微调出的模型名称]",
        prompt=prompt,
        max_tokens=2000,
        temperature=0.7,
        top_p=1,
        stop=["."],
    )
    story_text = completion.choices[0].text
    return story_text

# The prompt follows the "dynasty,super_power,story_type" layout of the
# training prompts, followed by " ->\n" (presumably the separator appended by
# the prepare_data step; verify against the generated JSONL).
story = write_a_story("宋,发射激光,艰难 ->\n")
print(story)

In [None]:
# None of these three values appears in the lists used to build the base
# dataset, so this probes how the model handles unseen parameters.
story = write_a_story("秦,龙卷风,辛苦 ->\n")
print(story)

# 增量训练优化模型

In [None]:
# Generate additional story data for incremental training.
# These assignments rebind the module-level parameter lists.
dynasties= ['秦', '五代', '隋']
super_powers = ['龙卷风', '冰冻大海', '流星火雨']
story_types = ['轻松', '努力', '艰难', '勇敢', '辛苦']

# Target CSV for the additional stories; read back by the conversion step.
new_stories = "data/ultraman_stories_more.csv"
prepare_stories(dynasties, super_powers, story_types, repeat=3, output_file=new_stories)


# Data conversion: turn the additional stories into prompt/completion pairs.
df = pd.read_csv(new_stories)
df['sub_prompt'] = df['dynasty'] + "," + df['super_power'] + "," + df['story_type']
prepared_data = df.loc[:,['sub_prompt','story']]
prepared_data.rename(columns={'sub_prompt':'prompt', 'story':'completion'}, inplace=True)
new_stories_prepared = 'data/prepared_data_more.csv'
prepared_data.to_csv(new_stories_prepared, index=False)

# Pass the path variable as its own argument so the tool always processes the
# file written above, and so a path containing spaces is not split apart.
subprocess.run(['openai', 'tools', 'fine_tunes.prepare_data', '--file', new_stories_prepared, '--quiet'])


# Continue fine-tuning, starting from the previously fine-tuned model.
from openai.types.fine_tuning.job_create_params import Hyperparameters

# Upload the additional training data. The context manager closes the file
# handle once the upload call has returned.
with open("data/prepared_data_more_prepared.jsonl", "rb") as training_data:
    file = client.files.create(
        file=training_data,
        purpose="fine-tune"
    )

# The model name is a placeholder for the model produced by the first
# fine-tuning job. This run passes a learning-rate multiplier of 0.2.
client.fine_tuning.jobs.create(
  training_file=file.id,
  model="[上面微调过的模型名]",
  suffix="ultraman",
  hyperparameters=Hyperparameters(learning_rate_multiplier=0.2)
)



# 流式生成

In [None]:
def write_a_story_by_stream(prompt):
    """Request a streamed story for ``prompt`` and return the response object.

    The request matches the non-streaming variant (2000-token limit, "." as
    stop sequence) but sets ``stream=True``, so the caller iterates over the
    returned object to receive the text piece by piece. The ``model`` value
    is a placeholder for the fine-tuned model name.
    """
    stream_request = dict(
        model="[上面微调过的模型名]",
        prompt=prompt,
        max_tokens=2000,
        temperature=0.7,
        top_p=1,
        stop=["."],
        stream=True,
    )
    return client.completions.create(**stream_request)

# Request a streamed completion for one dynasty/super-power/story-type prompt.
response = write_a_story_by_stream("汉,冰冻大海,艰难 ->\n")

# Print each streamed piece of text as it is received; end='' keeps the
# pieces on one continuous line instead of adding a newline per event.
for event in response:
    event_text = event.choices[0].text
    print(event_text, end = '')
