Предыдущий файл - 3_data_from_SMILES_to_text.ipynb

Теперь необходимо создать тренировочный и тестовый наборы данных в виде чата.
Каждый из блоков кода я отдельно запускал на своем компьютере и кластерном сервере в формате .py для ускорения создания наборов данных.

# Тестовый набор данных

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from datasets import Dataset
from tqdm import tqdm

# Load the reaction descriptions (tab-separated file, first column is the index).
ds = pd.read_csv("text_chem_data.csv", sep="\t", index_col=0)

# Hold out 30% of the rows as the test part; the random_state is fixed.
train_df, test_df = train_test_split(ds, test_size=0.3, random_state=42)

model_name = "KingNish/Reasoning-Llama-1b-v0.1"

# Tokenizer: the EOS token is reused as the padding token, padding is on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# 4-bit loading with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): within this cell only the tokenizer is used afterwards (for the
# chat template); the model itself is loaded and resized but not called here.
model_kwargs = {
    "trust_remote_code": True,
    "quantization_config": bnb_config,
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

# Add a special token so the model can tell where the answer is expected,
# then grow the embedding matrix to the new vocabulary size.
special_tokens_dict = {'additional_special_tokens': ['<|result|>']}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

def generate_shot_sequence(train_df, test_example, num_shot: int):
    """Build the chat messages for one few-shot yield-prediction prompt.

    Worked examples are sampled at random from ``train_df`` and split between
    the ``"high"`` and ``"low"`` classes; the reaction to predict is appended
    as the final user message.

    Args:
        train_df: DataFrame with a ``"text"`` column (reaction description)
            and a ``"Yield"`` column containing ``"high"`` or ``"low"``.
        test_example: Text of the reaction whose yield must be predicted.
        num_shot: Total number of worked examples to include. An odd value
            gives the extra example to the ``"low"`` class.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: one system message,
        a user/assistant pair per example, and the final user task message.

    Raises:
        ValueError: If ``num_shot`` is negative, or a class that needs at
            least one example has no candidate reactions.
    """
    if num_shot < 0:
        raise ValueError(f"num_shot must be non-negative, got {num_shot}")

    # Never show the queried reaction as its own worked example: when the
    # queries themselves come from train_df this would put the expected
    # answer straight into the prompt.
    pool = train_df[train_df["text"] != test_example]

    # Split the budget between the classes so that the total is exactly
    # num_shot (plain `num_shot // 2` per class loses one example when odd).
    quota = {"high": num_shot // 2, "low": num_shot - num_shot // 2}

    picked = []
    for label, count in quota.items():
        if count == 0:
            continue
        candidates = pool[pool["Yield"] == label]
        if candidates.empty:
            raise ValueError(f'no "{label}" reactions available to use as examples')
        # Repeat rows only when the class has fewer candidates than requested.
        picked.append(candidates.sample(n=count, replace=len(candidates) < count))

    messages = [
        {'role': 'system', 'content': 'Here are some examples of chemical reactions and their yield rates. "high" means the yield is >= 70%. "low" means the yield is <70%.'}
    ]

    if picked:
        # NOTE(review): the fixed random_state makes this permutation identical
        # for every prompt of a given size, so the order of the labels is
        # constant across prompts - confirm that this is intended.
        shots = pd.concat(picked).sample(frac=1, random_state=42)
        for i, shot in enumerate(shots.to_dict(orient="records"), 1):
            messages.append({"role": "user", "content": f'Example {i}: Reaction: {shot["text"]}'})
            messages.append({"role": "assistant", "content": shot["Yield"]})

    messages.append({"role": "user", "content": f'Task: Based on these examples, predict the yield of the following reaction. Print "high" if yield of this reaction >=70%, or "low" if yield is <70%. Reaction: {test_example} Answer: <|result|>'})

    return messages

def prepare_dataset(train_df, test_df, num_shot=2):
    """Render one few-shot prompt per row of ``test_df``.

    For every reaction in ``test_df`` a message sequence is produced by
    ``generate_shot_sequence`` (examples sampled from ``train_df``) and turned
    into a single prompt string with the tokenizer's chat template. The
    template is applied with ``tokenize=False``, so the result is text, not
    token ids.

    Args:
        train_df: DataFrame the worked examples are sampled from.
        test_df: DataFrame with a ``"text"`` column (reaction to predict) and
            a ``"Yield"`` column (its expected answer).
        num_shot: Number of worked examples per prompt.

    Returns:
        DataFrame with columns ``"text"`` (rendered prompt) and ``"answer"``
        (the ``"Yield"`` value of the same row), one row per row of ``test_df``.
    """
    rows = []
    # Walk text and label together, row by row. Looking the label up by text
    # equality rescans the whole frame for every row and returns the label of
    # the first match when the same reaction text occurs more than once.
    pairs = zip(test_df["text"], test_df["Yield"])
    for test_example, answer in tqdm(pairs, total=len(test_df)):
        messages = generate_shot_sequence(train_df, test_example, num_shot)
        text_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
        rows.append({"text": text_prompt, "answer": answer})

    # Explicit columns keep the schema stable even when test_df is empty.
    return pd.DataFrame(rows, columns=["text", "answer"])

# Build the test prompts: queries come from test_df, examples from train_df,
# two examples per prompt.
test_dataset = prepare_dataset(train_df=train_df, test_df=test_df, num_shot=2)

# Save as a tab-separated file.
test_dataset.to_csv("test_dataset.csv", sep="\t")

# Первая половина тренировочного набора данных

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from datasets import Dataset
from tqdm import tqdm

# Load the reaction descriptions (tab-separated file, first column is the index).
ds = pd.read_csv("text_chem_data.csv", sep="\t", index_col=0)

# Hold out 30% of the rows as the test part; the random_state is fixed.
train_df, test_df = train_test_split(ds, test_size=0.3, random_state=42)

# The training part is processed as two halves, one per script run.
half = len(train_df) // 2
# Keep the label next to the text: prepare_dataset reads the "Yield" column of
# the frame it receives, so a text-only slice makes it fail with KeyError.
half_df_1 = train_df.iloc[:half][['text', 'Yield']]
half_df_2 = train_df.iloc[half:][['text', 'Yield']]

model_name = "KingNish/Reasoning-Llama-1b-v0.1"

# Tokenizer: the EOS token is reused as the padding token, padding is on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# 4-bit loading with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): within this cell only the tokenizer is used afterwards (for the
# chat template); the model itself is loaded and resized but not called here.
model_kwargs = {
    "trust_remote_code": True,
    "quantization_config": bnb_config,
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

# Add a special token so the model can tell where the answer is expected,
# then grow the embedding matrix to the new vocabulary size.
special_tokens_dict = {'additional_special_tokens': ['<|result|>']}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

def generate_shot_sequence(train_df, test_example, num_shot: int):
    """Build the chat messages for one few-shot yield-prediction prompt.

    Worked examples are sampled at random from ``train_df`` and split between
    the ``"high"`` and ``"low"`` classes; the reaction to predict is appended
    as the final user message.

    Args:
        train_df: DataFrame with a ``"text"`` column (reaction description)
            and a ``"Yield"`` column containing ``"high"`` or ``"low"``.
        test_example: Text of the reaction whose yield must be predicted.
        num_shot: Total number of worked examples to include. An odd value
            gives the extra example to the ``"low"`` class.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: one system message,
        a user/assistant pair per example, and the final user task message.

    Raises:
        ValueError: If ``num_shot`` is negative, or a class that needs at
            least one example has no candidate reactions.
    """
    if num_shot < 0:
        raise ValueError(f"num_shot must be non-negative, got {num_shot}")

    # Never show the queried reaction as its own worked example: in this cell
    # the queries come from train_df too, so without this filter the prompt
    # could contain the expected answer.
    pool = train_df[train_df["text"] != test_example]

    # Split the budget between the classes so that the total is exactly
    # num_shot (plain `num_shot // 2` per class loses one example when odd).
    quota = {"high": num_shot // 2, "low": num_shot - num_shot // 2}

    picked = []
    for label, count in quota.items():
        if count == 0:
            continue
        candidates = pool[pool["Yield"] == label]
        if candidates.empty:
            raise ValueError(f'no "{label}" reactions available to use as examples')
        # Repeat rows only when the class has fewer candidates than requested.
        picked.append(candidates.sample(n=count, replace=len(candidates) < count))

    messages = [
        {'role': 'system', 'content': 'Here are some examples of chemical reactions and their yield rates. "high" means the yield is >= 70%. "low" means the yield is <70%.'}
    ]

    if picked:
        # NOTE(review): the fixed random_state makes this permutation identical
        # for every prompt of a given size, so the order of the labels is
        # constant across prompts - confirm that this is intended.
        shots = pd.concat(picked).sample(frac=1, random_state=42)
        for i, shot in enumerate(shots.to_dict(orient="records"), 1):
            messages.append({"role": "user", "content": f'Example {i}: Reaction: {shot["text"]}'})
            messages.append({"role": "assistant", "content": shot["Yield"]})

    messages.append({"role": "user", "content": f'Task: Based on these examples, predict the yield of the following reaction. Print "high" if yield of this reaction >=70%, or "low" if yield is <70%. Reaction: {test_example} Answer: <|result|>'})

    return messages

def prepare_dataset(train_df, test_df, num_shot=2):
    """Render one few-shot prompt per row of ``test_df``.

    For every reaction in ``test_df`` a message sequence is produced by
    ``generate_shot_sequence`` (examples sampled from ``train_df``) and turned
    into a single prompt string with the tokenizer's chat template. The
    template is applied with ``tokenize=False``, so the result is text, not
    token ids.

    Args:
        train_df: DataFrame the worked examples are sampled from.
        test_df: DataFrame with a ``"text"`` column (reaction to predict) and
            a ``"Yield"`` column (its expected answer).
        num_shot: Number of worked examples per prompt.

    Returns:
        DataFrame with columns ``"text"`` (rendered prompt) and ``"answer"``
        (the ``"Yield"`` value of the same row), one row per row of ``test_df``.
    """
    rows = []
    # Walk text and label together, row by row. Looking the label up by text
    # equality rescans the whole frame for every row and returns the label of
    # the first match when the same reaction text occurs more than once.
    pairs = zip(test_df["text"], test_df["Yield"])
    for test_example, answer in tqdm(pairs, total=len(test_df)):
        messages = generate_shot_sequence(train_df, test_example, num_shot)
        text_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
        rows.append({"text": text_prompt, "answer": answer})

    # Explicit columns keep the schema stable even when test_df is empty.
    return pd.DataFrame(rows, columns=["text", "answer"])

# Build the prompts for the first half of the training part: queries come from
# half_df_1, examples from the whole train_df, two examples per prompt.
train_dataset = prepare_dataset(train_df=train_df, test_df=half_df_1, num_shot=2)

# Save as a tab-separated file.
train_dataset.to_csv("train_dataset_1.csv", sep="\t")

# Вторая половина тренировочного набора данных

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from datasets import Dataset
from tqdm import tqdm

# Load the reaction descriptions (tab-separated file, first column is the index).
ds = pd.read_csv("text_chem_data.csv", sep="\t", index_col=0)

# Hold out 30% of the rows as the test part; the random_state is fixed.
train_df, test_df = train_test_split(ds, test_size=0.3, random_state=42)

# The training part is processed as two halves, one per script run.
half = len(train_df) // 2
# Keep the label next to the text: prepare_dataset reads the "Yield" column of
# the frame it receives, so a text-only slice makes it fail with KeyError.
half_df_1 = train_df.iloc[:half][['text', 'Yield']]
half_df_2 = train_df.iloc[half:][['text', 'Yield']]

model_name = "KingNish/Reasoning-Llama-1b-v0.1"

# Tokenizer: the EOS token is reused as the padding token, padding is on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# 4-bit loading with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): within this cell only the tokenizer is used afterwards (for the
# chat template); the model itself is loaded and resized but not called here.
model_kwargs = {
    "trust_remote_code": True,
    "quantization_config": bnb_config,
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

# Add a special token so the model can tell where the answer is expected,
# then grow the embedding matrix to the new vocabulary size.
special_tokens_dict = {'additional_special_tokens': ['<|result|>']}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

def generate_shot_sequence(train_df, test_example, num_shot: int):
    """Build the chat messages for one few-shot yield-prediction prompt.

    Worked examples are sampled at random from ``train_df`` and split between
    the ``"high"`` and ``"low"`` classes; the reaction to predict is appended
    as the final user message.

    Args:
        train_df: DataFrame with a ``"text"`` column (reaction description)
            and a ``"Yield"`` column containing ``"high"`` or ``"low"``.
        test_example: Text of the reaction whose yield must be predicted.
        num_shot: Total number of worked examples to include. An odd value
            gives the extra example to the ``"low"`` class.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: one system message,
        a user/assistant pair per example, and the final user task message.

    Raises:
        ValueError: If ``num_shot`` is negative, or a class that needs at
            least one example has no candidate reactions.
    """
    if num_shot < 0:
        raise ValueError(f"num_shot must be non-negative, got {num_shot}")

    # Never show the queried reaction as its own worked example: in this cell
    # the queries come from train_df too, so without this filter the prompt
    # could contain the expected answer.
    pool = train_df[train_df["text"] != test_example]

    # Split the budget between the classes so that the total is exactly
    # num_shot (plain `num_shot // 2` per class loses one example when odd).
    quota = {"high": num_shot // 2, "low": num_shot - num_shot // 2}

    picked = []
    for label, count in quota.items():
        if count == 0:
            continue
        candidates = pool[pool["Yield"] == label]
        if candidates.empty:
            raise ValueError(f'no "{label}" reactions available to use as examples')
        # Repeat rows only when the class has fewer candidates than requested.
        picked.append(candidates.sample(n=count, replace=len(candidates) < count))

    messages = [
        {'role': 'system', 'content': 'Here are some examples of chemical reactions and their yield rates. "high" means the yield is >= 70%. "low" means the yield is <70%.'}
    ]

    if picked:
        # NOTE(review): the fixed random_state makes this permutation identical
        # for every prompt of a given size, so the order of the labels is
        # constant across prompts - confirm that this is intended.
        shots = pd.concat(picked).sample(frac=1, random_state=42)
        for i, shot in enumerate(shots.to_dict(orient="records"), 1):
            messages.append({"role": "user", "content": f'Example {i}: Reaction: {shot["text"]}'})
            messages.append({"role": "assistant", "content": shot["Yield"]})

    messages.append({"role": "user", "content": f'Task: Based on these examples, predict the yield of the following reaction. Print "high" if yield of this reaction >=70%, or "low" if yield is <70%. Reaction: {test_example} Answer: <|result|>'})

    return messages

def prepare_dataset(train_df, test_df, num_shot=2):
    """Render one few-shot prompt per row of ``test_df``.

    For every reaction in ``test_df`` a message sequence is produced by
    ``generate_shot_sequence`` (examples sampled from ``train_df``) and turned
    into a single prompt string with the tokenizer's chat template. The
    template is applied with ``tokenize=False``, so the result is text, not
    token ids.

    Args:
        train_df: DataFrame the worked examples are sampled from.
        test_df: DataFrame with a ``"text"`` column (reaction to predict) and
            a ``"Yield"`` column (its expected answer).
        num_shot: Number of worked examples per prompt.

    Returns:
        DataFrame with columns ``"text"`` (rendered prompt) and ``"answer"``
        (the ``"Yield"`` value of the same row), one row per row of ``test_df``.
    """
    rows = []
    # Walk text and label together, row by row. Looking the label up by text
    # equality rescans the whole frame for every row and returns the label of
    # the first match when the same reaction text occurs more than once.
    pairs = zip(test_df["text"], test_df["Yield"])
    for test_example, answer in tqdm(pairs, total=len(test_df)):
        messages = generate_shot_sequence(train_df, test_example, num_shot)
        text_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
        rows.append({"text": text_prompt, "answer": answer})

    # Explicit columns keep the schema stable even when test_df is empty.
    return pd.DataFrame(rows, columns=["text", "answer"])

# Build the prompts for the second half of the training part: queries come from
# half_df_2, examples from the whole train_df, two examples per prompt.
train_dataset = prepare_dataset(train_df=train_df, test_df=half_df_2, num_shot=2)

# Save as a tab-separated file.
train_dataset.to_csv("train_dataset_2.csv", sep="\t")

# Валидационный набор данных

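Данный набор сделан наподобие тестового набора: задачи строятся по тем же реакциям из тестовой части, однако реакции в примерах будут другими, так как примеры каждый раз выбираются случайно.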

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from datasets import Dataset
from tqdm import tqdm

# Load the reaction descriptions (tab-separated file, first column is the index).
ds = pd.read_csv("text_chem_data.csv", sep="\t", index_col=0)

# Hold out 30% of the rows as the test part; the random_state is fixed.
train_df, test_df = train_test_split(ds, test_size=0.3, random_state=42)

model_name = "KingNish/Reasoning-Llama-1b-v0.1"

# Tokenizer: the EOS token is reused as the padding token, padding is on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# 4-bit loading with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): within this cell only the tokenizer is used afterwards (for the
# chat template); the model itself is loaded and resized but not called here.
model_kwargs = {
    "trust_remote_code": True,
    "quantization_config": bnb_config,
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

# Add a special token so the model can tell where the answer is expected,
# then grow the embedding matrix to the new vocabulary size.
special_tokens_dict = {'additional_special_tokens': ['<|result|>']}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

def generate_shot_sequence(train_df, test_example, num_shot: int):
    """Build the chat messages for one few-shot yield-prediction prompt.

    Worked examples are sampled at random from ``train_df`` and split between
    the ``"high"`` and ``"low"`` classes; the reaction to predict is appended
    as the final user message.

    Args:
        train_df: DataFrame with a ``"text"`` column (reaction description)
            and a ``"Yield"`` column containing ``"high"`` or ``"low"``.
        test_example: Text of the reaction whose yield must be predicted.
        num_shot: Total number of worked examples to include. An odd value
            gives the extra example to the ``"low"`` class.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: one system message,
        a user/assistant pair per example, and the final user task message.

    Raises:
        ValueError: If ``num_shot`` is negative, or a class that needs at
            least one example has no candidate reactions.
    """
    if num_shot < 0:
        raise ValueError(f"num_shot must be non-negative, got {num_shot}")

    # Never show the queried reaction as its own worked example: when the
    # queries themselves come from train_df this would put the expected
    # answer straight into the prompt.
    pool = train_df[train_df["text"] != test_example]

    # Split the budget between the classes so that the total is exactly
    # num_shot (plain `num_shot // 2` per class loses one example when odd).
    quota = {"high": num_shot // 2, "low": num_shot - num_shot // 2}

    picked = []
    for label, count in quota.items():
        if count == 0:
            continue
        candidates = pool[pool["Yield"] == label]
        if candidates.empty:
            raise ValueError(f'no "{label}" reactions available to use as examples')
        # Repeat rows only when the class has fewer candidates than requested.
        picked.append(candidates.sample(n=count, replace=len(candidates) < count))

    messages = [
        {'role': 'system', 'content': 'Here are some examples of chemical reactions and their yield rates. "high" means the yield is >= 70%. "low" means the yield is <70%.'}
    ]

    if picked:
        # NOTE(review): the fixed random_state makes this permutation identical
        # for every prompt of a given size, so the order of the labels is
        # constant across prompts - confirm that this is intended.
        shots = pd.concat(picked).sample(frac=1, random_state=42)
        for i, shot in enumerate(shots.to_dict(orient="records"), 1):
            messages.append({"role": "user", "content": f'Example {i}: Reaction: {shot["text"]}'})
            messages.append({"role": "assistant", "content": shot["Yield"]})

    messages.append({"role": "user", "content": f'Task: Based on these examples, predict the yield of the following reaction. Print "high" if yield of this reaction >=70%, or "low" if yield is <70%. Reaction: {test_example} Answer: <|result|>'})

    return messages

def prepare_dataset(train_df, test_df, num_shot=2):
    """Render one few-shot prompt per row of ``test_df``.

    For every reaction in ``test_df`` a message sequence is produced by
    ``generate_shot_sequence`` (examples sampled from ``train_df``) and turned
    into a single prompt string with the tokenizer's chat template. The
    template is applied with ``tokenize=False``, so the result is text, not
    token ids.

    Args:
        train_df: DataFrame the worked examples are sampled from.
        test_df: DataFrame with a ``"text"`` column (reaction to predict) and
            a ``"Yield"`` column (its expected answer).
        num_shot: Number of worked examples per prompt.

    Returns:
        DataFrame with columns ``"text"`` (rendered prompt) and ``"answer"``
        (the ``"Yield"`` value of the same row), one row per row of ``test_df``.
    """
    rows = []
    # Walk text and label together, row by row. Looking the label up by text
    # equality rescans the whole frame for every row and returns the label of
    # the first match when the same reaction text occurs more than once.
    pairs = zip(test_df["text"], test_df["Yield"])
    for test_example, answer in tqdm(pairs, total=len(test_df)):
        messages = generate_shot_sequence(train_df, test_example, num_shot)
        text_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
        rows.append({"text": text_prompt, "answer": answer})

    # Explicit columns keep the schema stable even when test_df is empty.
    return pd.DataFrame(rows, columns=["text", "answer"])

# Build the validation prompts: queries come from test_df, examples from
# train_df, two examples per prompt.
# NOTE(review): the queries are the same test_df rows as in the test set; only
# the randomly sampled examples differ - confirm this overlap is intended.
test_dataset = prepare_dataset(train_df=train_df, test_df=test_df, num_shot=2)

# Save as a tab-separated file.
test_dataset.to_csv("valid_dataset.csv", sep="\t")

Таким образом, пример одного таска для модели без специальных токенов выглядел так:

system:

Here are some examples of chemical reactions and their yield rates. "high" means the yield is >= 70%. "low" means the yield is <70%.

user:

Example 1: Reaction: 2-(carbamoylamino)-5-(4-ethenylphenyl)-1h-pyrrole-3-carboxamide reacts in the presence of palladium, methanol to produce 2-(carbamoylamino)-5-(4-ethylphenyl)-1h-pyrrole-3-carboxamide.

assistant: low

user:

Example 2: Reaction:
Isocyanatomethylbenzene, 5-[5-(azepan-1-ylmethyl)thiophen-2-yl]-1,3,4-oxadiazol-2-amine react together in the presence of pyridine to produce 1-[5-[5-(azepan-1-ylmethyl)thiophen-2-yl]-1,3,4-oxadiazol-2-yl]-3-benzylurea.

assistant: high

user:

Task: Based on these examples, predict the yield of the following reaction. Print "high" if yield of this reaction >=70%, or "low" if yield is <70%.  Reaction:
N-(5-acetamido-2-aminophenyl)-4-tert-butylbenzamide, 1h-indole-6-carboxylic acid, hexafluorophosphate, bromo(tripyrrolidin-1-yl)phosphanium, n-ethyl-n-propan-2-ylpropan-2-amine react together in the presence of dichloromethane, n,n-dimethylformamide to produce n-[4-acetamido-2-[(4-tert-butylbenzoyl)amino]phenyl]-1h-indole-6-carboxamide.
Answer:


Продолжение в 5_llama_training.py