In [1]:
import glob
import json
import logging
import random
import shutil
from dataclasses import dataclass
from typing import Dict, Any, List, Tuple

import requests
import torch
import yaml
from peft import LoraConfig
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig


import sys
import os

# A notebook has no __file__, so paths are resolved against the current
# working directory; the project root is taken to be its parent directory.
notebook_dir = os.getcwd()
working_dir = os.path.dirname(notebook_dir)
# Guarded so that re-running this cell does not keep growing sys.path.
if working_dir not in sys.path:
    sys.path.append(working_dir)
# used to load environment variables from the .env file
from dotenv import load_dotenv
load_dotenv()

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Placeholder only: setdefault keeps a real key that came from the shell or
# the .env file instead of overwriting it.
os.environ.setdefault("FLOCK_API_KEY", "somekey")
# os.environ only accepts strings, so assigning a missing (None) token would
# raise TypeError; report the missing token instead of crashing here.
if not os.getenv("HF_TOKEN"):
    print("Warning: HF_TOKEN is not set; add it to your environment or .env file "
          "if the model you download requires authentication.")

  from .autonotebook import tqdm as notebook_tqdm


```markdown
1. Checking for GPU Availability

In [2]:
# Report whether PyTorch can see a CUDA device and, if so, describe each one.
has_cuda = torch.cuda.is_available()
print("CUDA is available:", has_cuda)

if not has_cuda:
    print("No GPU available. Training will be slow on CPU.")
else:
    gpu_count = torch.cuda.device_count()
    print(f"Number of available GPUs: {gpu_count}")

    # One short summary per visible device: name, memory in GiB, compute capability.
    for device_index in range(gpu_count):
        props = torch.cuda.get_device_properties(device_index)
        total_memory_gb = props.total_memory / 1024**3
        print(f"GPU {device_index}: {torch.cuda.get_device_name(device_index)}")
        print(f"  Total memory: {total_memory_gb:.2f} GB")
        print(f"  CUDA Capability: {props.major}.{props.minor}")


CUDA is available: False
No GPU available. Training will be slow on CPU.


```markdown
2. Fetching Task Data from the API

In [3]:
# Fetch the task definition from the Flock API by task ID.
# The response includes the title, description, a "data" section (training
# set URL, parameter budget, ...) and the submission / validation deadlines.

task_id="6"
location = f'https://fed-ledger-prod.flock.io/api/v1/tasks/get?task_id={task_id}'
# A timeout keeps the cell from hanging forever, and raise_for_status stops
# an HTTP error page from being parsed as if it were the task description.
response = requests.get(location, timeout=30)
response.raise_for_status()
task = response.json()
data_url = task["data"]["training_set_url"]
print(task['title'])
print(task['description'])
print(task['data'])


print()
max_params = task['data']['max_params']
if isinstance(max_params, int):
    # Convert to billions and format
    params_in_billions = max_params / 1_000_000_000
    print(f"Maximum parameters allowed: {params_in_billions:.1f}B parameters")
else:
    # Non-integer budgets are shown as returned by the API.
    print(max_params)

print(task['submission_phase_ends_at'])
print(task['final_validation_ends_at'])

AI Producer: SEEK48
<p>FLock introduces SEEK48, an advanced AI idol producer model designed to help idol teams precisely select members, establish unique identities, and develop the most effective debut strategies. From image building and talent matching to fan engagement strategies, AI leverages data analysis and trend forecasting to create idol groups with strong market appeal. Whether shaping group concepts or optimizing individual career paths, our AI ensures that every idol takes the stage in the best possible way to become the next superstar.</p><br/><p>FLock 推出全新的 AI 偶像制作人模型 SEEK48，帮助偶像团队精准筛选成员、确定个性化定位，并制定最优出道方案。从形象塑造、才艺匹配到粉丝策略，AI 将结合数据分析与趋势预测，助力打造具有市场吸引力的偶像团队。无论是策划团体风格，还是优化个人发展路线，我们的AI都让每位偶像都能以最适合的方式站上舞台，成为未来的超级明星。</p><br/><p>FLock은 새로운 AI 아이돌 프로듀서 모델 SEEK48을 출시하여, 아이돌 팀이 멤버를 정교하게 선발하고 개성을 확립하며 최적의 데뷔 전략을 수립할 수 있도록 돕습니다. 이미지 구축부터 재능 매칭, 팬 전략까지, AI는 데이터 분석과 트렌드 예측을 결합하여 시장에서 경쟁력 있는 아이돌 그룹을 탄생시킵니다. 그룹 컨셉 기획부터 개인 성장 최적화까지, 우리의 AI는 각 아이돌이 자신의 강점을 극대화하여 무대에서 빛날 수 있도록 지원합니다.</p><br/>

```markdown
3. Downloading the Training Data

In [4]:
# Stream the training set to disk in chunks so it never has to fit in memory.
train_file = f"{working_dir}/data/task{task_id}_demo_data.jsonl"
os.makedirs(os.path.dirname(train_file), exist_ok=True)
# The context manager releases the streamed connection when done, and
# raise_for_status prevents an HTTP error body from being saved as the dataset.
with requests.get(data_url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(train_file, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
        # Push the data to disk before later cells read the file back.
        f.flush()
        os.fsync(f.fileno())
print(f"Data saved successfully to {train_file}")

Data saved successfully to /Users/elizabethlui/work/arena_examples/data/task6_demo_data.jsonl


```markdown
4. Inspecting the Dataset

In [5]:
# Load the downloaded JSONL file and display one randomly chosen record.

# Keep only non-blank lines: a blank line (e.g. a trailing newline) is not a
# record, would inflate the count, and would crash json.loads if selected.
with open(train_file, 'r', encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]

if not lines:
    raise ValueError(f"No records found in {train_file}")

# Select a random line and parse it as JSON
random_line = random.choice(lines)
random_item = json.loads(random_line)

# Display the random item (ensure_ascii=False keeps non-ASCII text readable)
print("Random item from the dataset:")
print(json.dumps(random_item, indent=2, ensure_ascii=False))

# Print the number of items in the dataset
print(f"\nTotal number of items in the dataset: {len(lines)}")


Random item from the dataset:
{
  "conversations": [
    {
      "role": "user",
      "content": "我最近关注到一支新出道的偶像团体，他们团队共有5人。经过多年训练，团队内有人在唱功、rap、主持和创作方面都有不错的专长，另一些成员在形象管理上也很有优势，尤其在时尚品味和妆容风格上颇具市场吸引力。不过，目前这支团体的粉丝数量和活跃度还偏低，虽然公司在团队建设和资源投入上有相应的努力，但市场竞争压力仍然较大。请问你怎么看这支团体目前的优势与劣势？"
    },
    {
      "role": "assistant",
      "content": "这支团体的优势在于成员技能多元，能够覆盖唱功、rap、主持及创作等多个领域，这为出道后的多元化发展提供了坚实基础。另外，团队在形象管理方面表现出色，时尚与妆容细节都符合当前年轻市场的审美需求，可以在宣传和视觉效果上形成鲜明特色。劣势在于粉丝基础尚不够牢固，活跃度和忠诚度相对较低，这使得短期内专辑销量及综艺曝光度可能遇到瓶颈；同时，市场上已经有不少成熟的偶像团体，竞争较为激烈。总体来看，需要在粉丝运营和品牌塑造上下更多功夫。"
    },
    {
      "role": "user",
      "content": "针对之前的分析，我想更详细地讨论下出道后推广策略问题。比如在影视综艺资源的争取和专辑销量提升方面，你有什么切实可行的方案？另外，我也关注团队内在的凝聚力，如何在保持个性特色的同时增强整体合作呢？"
    },
    {
      "role": "assistant",
      "content": "在推广策略上，建议团队先制定一个清晰的品牌定位，利用成员各自不同的技能点打造‘多面手’形象。例如，可以为专注rap和创作的成员争取参与音乐剧或原创音乐节目的机会，而具备主持和唱功的成员则可争取参加热门综艺节目，从而切入影视和综艺资源。同时，配合线上社交媒体和粉丝活动互动，借助平台经济提升专辑销量。关于增强团队凝聚力，一方面可以组织内部定期的团建活动和创作工作坊，让成员在共同学习和磨合中形成默契；另一方面，公司也可以考虑设计团队联名的活动或专题节目，既突出个人才华又展示团队整体魅力

```markdown
5. Loading Training Arguments

In [6]:
# Per-model training hyper-parameters are stored in a YAML file. Despite the
# "_as_list" name, later cells index the loaded object by model id.
train_arg_file = f"{working_dir}/args/task{task_id}_training_args.yaml"
# Explicit encoding, matching the other open() calls in this notebook, so the
# result does not depend on the platform's default encoding.
with open(train_arg_file, 'r', encoding='utf-8') as f:
    all_training_args_as_list = yaml.safe_load(f)

In [7]:
# Select the training arguments stored under the Qwen1.5-0.5B key and show them.
use_args = all_training_args_as_list['Qwen/Qwen1.5-0.5B']
# Optional overrides for a smaller adapter (uncomment to apply):
# use_args['lora_rank'] = 4
# use_args['lora_alpha'] = 8
use_args

{'per_device_train_batch_size': 1,
 'gradient_accumulation_steps': 8,
 'num_train_epochs': 1,
 'lora_rank': 8,
 'lora_alpha': 16,
 'lora_dropout': 0.1}

In [8]:
# Chat prompt templates. Every "*_format" entry is a str.format pattern with a
# {content} placeholder; "system" holds the default system prompt (or None).
qwen_template = dict(
    system_format="<|im_start|>system\n{content}<|im_end|>\n",
    user_format="<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n",
    assistant_format="{content}<|im_end|>\n",
    tool_format="{content}",
    function_format="{content}",
    observation_format="<|im_start|>tool\n{content}<|im_end|>\n<|im_start|>assistant\n",
    system="You are a helpful assistant.",
)

gemma_template = dict(
    system_format="<bos>",
    user_format="<start_of_turn>user\n{content}<end_of_turn>\n<start_of_turn>model\n",
    assistant_format="{content}<eos>\n",
    tool_format="{content}",
    function_format="{content}",
    observation_format="<start_of_turn>tool\n{content}<end_of_turn>\n<start_of_turn>model\n",
    system=None,
)

# Supported model id -> prompt template (models of one family share one dict).
model2template = {
    model_name: family_template
    for family_template, model_names in (
        (qwen_template, ("Qwen/Qwen1.5-0.5B", "Qwen/Qwen1.5-1.8B", "Qwen/Qwen1.5-7B")),
        (gemma_template, ("google/gemma-2b", "google/gemma-7b")),
    )
    for model_name in model_names
}

# Supported model id -> parameter count, listed in the same order as above.
model2size = dict(
    zip(
        model2template,
        (620_000_000, 1_840_000_000, 7_720_000_000, 2_510_000_000, 8_540_000_000),
    )
)

# Supported model id -> base model family name.
model2base_model = {
    model_name: "qwen1.5" if family_template is qwen_template else "gemma"
    for model_name, family_template in model2template.items()
}

```markdown
6. Preparing datasets for training

In [9]:
class SFTDataset(Dataset):
    """Chat-style supervised fine-tuning dataset backed by a JSON-lines file.

    Each non-blank line of ``file`` is a JSON object holding a
    ``"conversations"`` list of ``{"role", "content"}`` turns and, optionally,
    a ``"system"`` prompt. Records are parsed and tokenized lazily in
    ``__getitem__``.

    Args:
        file: Path to the JSON-lines training file.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and an ``eos_token`` attribute.
        max_seq_length: Token sequences longer than this are truncated.
        template: Mapping with the ``system_format``, ``user_format``,
            ``assistant_format``, ``tool_format``, ``function_format`` and
            ``observation_format`` patterns, plus an optional default
            ``"system"`` prompt.
    """

    def __init__(self, file, tokenizer, max_seq_length, template):
        self.tokenizer = tokenizer
        self.system_format = template["system_format"]
        self.user_format = template["user_format"]
        self.assistant_format = template["assistant_format"]
        self.tool_format = template["tool_format"]
        self.function_format = template["function_format"]
        self.observation_format = template["observation_format"]
        # Default system prompt for records without their own "system" field.
        # __getitem__ reads this attribute, so it must always be defined.
        self.system = template.get("system")

        self.max_seq_length = max_seq_length
        with open(file, "r", encoding="utf8") as f:
            # Blank lines (e.g. a trailing newline) are not records and would
            # make json.loads fail when the item is fetched.
            self.data_list = [line for line in f if line.strip()]

    def __len__(self):
        """Return the number of records in the file."""
        return len(self.data_list)

    def __getitem__(self, index):
        """Tokenize record ``index``.

        Returns:
            A dict of equal-length lists: ``input_ids``, ``attention_mask``
            (all ones) and ``target_mask`` (1 on assistant tokens, 0 on
            system/user tokens), each truncated to ``max_seq_length``.
        """
        data = json.loads(self.data_list[index])
        input_ids, target_mask = [], []

        # Optional system prompt: the record's own value wins over the
        # template default; system tokens are never training targets.
        if self.system_format is not None:
            system = data["system"].strip() if "system" in data else self.system

            if system is not None:
                system_text = self.system_format.format(content=system)
                input_ids = self.tokenizer.encode(system_text, add_special_tokens=False)
                target_mask = [0] * len(input_ids)

        # User turns are buffered until the next assistant turn, then the
        # prompt (mask 0) and the reply (mask 1) are appended together.
        # Turns with any other role are ignored.
        input_buffer = ""
        for turn in data["conversations"]:
            role = turn["role"]
            content = turn["content"].strip()

            if role == "user":
                input_buffer += self.user_format.format(
                    content=content, stop_token=self.tokenizer.eos_token
                )
            elif role == "assistant":
                assistant = self.assistant_format.format(
                    content=content, stop_token=self.tokenizer.eos_token
                )

                input_tokens = self.tokenizer.encode(
                    input_buffer, add_special_tokens=False
                )
                output_tokens = self.tokenizer.encode(
                    assistant, add_special_tokens=False
                )

                input_ids += input_tokens + output_tokens
                target_mask += [0] * len(input_tokens) + [1] * len(output_tokens)
                input_buffer = ""

        assert len(input_ids) == len(target_mask)

        input_ids = input_ids[: self.max_seq_length]
        target_mask = target_mask[: self.max_seq_length]
        attention_mask = [1] * len(input_ids)
        assert len(input_ids) == len(target_mask) == len(attention_mask)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "target_mask": target_mask,
        }



class SFTDataCollator(object):
    """Pad a list of tokenized samples into a batch of tensors with labels.

    Args:
        tokenizer: Object exposing ``pad_token_id`` (and ``eos_token_id``,
            which is used for padding when no pad token is defined).
        max_seq_length: Upper bound on the padded sequence length.
    """

    def __init__(self, tokenizer, max_seq_length):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        # A None pad id cannot be placed in a tensor. Padded positions get
        # attention 0 and label -100, so the EOS id is a safe filler.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id
        self.pad_token_id = pad_token_id

    def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate ``batch`` into ``input_ids``, ``attention_mask`` and ``labels``.

        Samples whose ``input_ids`` is None are dropped. Sequences are padded
        to the longest sample in the batch, capped at ``max_seq_length``.
        ``labels`` equals ``input_ids`` where ``target_mask`` is 1 and -100
        everywhere else.

        Raises:
            ValueError: If no sample in the batch has ``input_ids``.
        """
        samples = [x for x in batch if x["input_ids"] is not None]
        if len(samples) < len(batch):
            logging.getLogger(__name__).warning("some input_ids is None")
        if not samples:
            raise ValueError("batch contains no sample with input_ids")

        # Longest sample in the batch, but never more than max_seq_length.
        batch_max_len = min(
            max(len(x["input_ids"]) for x in samples), self.max_seq_length
        )

        input_ids_batch, attention_mask_batch, target_mask_batch = [], [], []
        for x in samples:
            # Truncate first, then right-pad up to the batch length.
            input_ids = x["input_ids"][:batch_max_len]
            attention_mask = x["attention_mask"][:batch_max_len]
            target_mask = x["target_mask"][:batch_max_len]
            padding_len = batch_max_len - len(input_ids)

            input_ids_batch.append(input_ids + [self.pad_token_id] * padding_len)
            attention_mask_batch.append(attention_mask + [0] * padding_len)
            target_mask_batch.append(target_mask + [0] * padding_len)

        # Convert lists to tensors to get the final model input
        input_ids_batch = torch.tensor(input_ids_batch, dtype=torch.long)
        attention_mask_batch = torch.tensor(attention_mask_batch, dtype=torch.long)
        target_mask_batch = torch.tensor(target_mask_batch, dtype=torch.long)

        # Positions outside the target mask (prompt and padding) get label -100.
        labels = torch.where(target_mask_batch == 1, input_ids_batch, -100)
        return {
            "input_ids": input_ids_batch,
            "attention_mask": attention_mask_batch,
            "labels": labels,
        }

In [10]:
@dataclass
class LoraTrainingArguments:
    """Hyper-parameters for one LoRA fine-tuning run.

    Attributes:
        per_device_train_batch_size: Forwarded to the trainer configuration.
        gradient_accumulation_steps: Forwarded to the trainer configuration.
        num_train_epochs: Epoch count, used when no step limit is given.
        lora_rank: Passed to ``LoraConfig`` as ``r``.
        lora_alpha: Passed to ``LoraConfig`` as ``lora_alpha``.
        lora_dropout: Passed to ``LoraConfig`` as ``lora_dropout``; a
            fractional value such as 0.1, hence ``float``.
    """

    per_device_train_batch_size: int
    gradient_accumulation_steps: int
    num_train_epochs: int
    lora_rank: int
    lora_alpha: int
    lora_dropout: float

def train_lora(
    model_id: str, context_length: int, training_args: LoraTrainingArguments,
    data_file_path: str,
    model_output_dir: str,
    model_template: dict,
    target_module: list = ["q_proj", "v_proj"],
    max_steps: int = None,  # New parameter to limit training steps
):
    """Fine-tune ``model_id`` with a LoRA adapter and save the result.

    Args:
        model_id: Model identifier; must be a key of ``model2template``.
        context_length: Maximum sequence length for the dataset, the collator
            and the trainer configuration.
        training_args: Batch size, accumulation, epoch and LoRA settings.
        data_file_path: Path to the JSON-lines training file.
        model_output_dir: Directory the trained model is saved to.
        model_template: Prompt template handed to ``SFTDataset``.
        target_module: Module names the LoRA adapter is attached to.
        max_steps: When given, training stops after this many steps and
            ``num_train_epochs`` is not used.

    Training errors (``RuntimeError``, including CUDA out-of-memory) are
    reported and a save is still attempted, so the function does not re-raise
    them; the final message states whether training actually finished.
    """
    assert model_id in model2template, f"model_id {model_id} not supported"
    lora_config = LoraConfig(
        r=training_args.lora_rank,
        # Copy so the shared default list can never be modified downstream.
        target_modules=list(target_module),
        lora_alpha=training_args.lora_alpha,
        lora_dropout=training_args.lora_dropout,
        task_type="CAUSAL_LM",
    )
    # NOTE: the model is loaded unquantized. For QLoRA, build a
    # BitsAndBytesConfig and pass it as quantization_config below.

    # Trainer checkpoints are written here and removed again after saving.
    checkpoint_dir = "outputs"

    # Configure training with option to limit steps
    train_config = {
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
        "warmup_steps": 100,
        "learning_rate": 2e-4,
        "bf16": True,
        "logging_steps": 20,
        "output_dir": checkpoint_dir,
        # Plain PyTorch optimizer, used instead of the bitsandbytes 8-bit
        # "paged_adamw_8bit".
        "optim": "adamw_torch_fused",
        "remove_unused_columns": False,
        "max_seq_length": context_length,
    }

    # Either use max_steps or num_train_epochs
    if max_steps is not None:
        train_config["max_steps"] = max_steps
    else:
        train_config["num_train_epochs"] = training_args.num_train_epochs

    # Separate name: do not shadow the ``training_args`` parameter.
    sft_config = SFTConfig(**train_config)

    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        use_fast=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        # .get() so a missing token is passed as None instead of raising
        # KeyError before the download even starts.
        token=os.environ.get("HF_TOKEN"),
        device_map="auto"
    )

    # Load dataset
    dataset = SFTDataset(
        file=data_file_path,
        tokenizer=tokenizer,
        max_seq_length=context_length,
        template=model_template,
    )

    # Define trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        args=sft_config,
        peft_config=lora_config,
        data_collator=SFTDataCollator(tokenizer, max_seq_length=context_length),
    )

    # Train model with OOM handling
    training_succeeded = False
    try:
        trainer.train()
        training_succeeded = True
    except (RuntimeError, torch.cuda.OutOfMemoryError) as e:
        if isinstance(e, torch.cuda.OutOfMemoryError) or "CUDA out of memory" in str(e):
            print("Caught OOM error. Saving current model state...")
        else:
            print(f"Error during training: {e}")

    # Save model regardless of whether training completed or was interrupted
    try:
        trainer.save_model(model_output_dir)
        print(f"Model saved to {model_output_dir}")
    except Exception as e:
        print(f"Error saving model: {e}")

    # Remove checkpoint folders without shelling out (portable, no quoting issues).
    for checkpoint_path in glob.glob(os.path.join(checkpoint_dir, "checkpoint-*")):
        shutil.rmtree(checkpoint_path, ignore_errors=True)

    # Only claim success when trainer.train() actually returned.
    if training_succeeded:
        print("Training Completed.")
    else:
        print("Training did not complete successfully.")

In [11]:
# Work on a copy of the Qwen template with the system prompt disabled.
# All Qwen entries of model2template point at the same dict, so editing it in
# place would silently change the template for every Qwen model.
use_template = dict(model2template['Qwen/Qwen1.5-0.5B'])
use_template['system_format'] = None
use_template['system'] = None


```markdown
7. Train the model!

In [12]:
no_submission = True # set to False to submit the model; need API key to do so
target_module = ["q_proj", "v_proj"] # default
max_params = task["data"]["max_params"]
context_length = task["data"]["context_length"]

# The model to train is the first entry of the training-args file.
model_id = next(iter(all_training_args_as_list))
# use_args was selected earlier by a hard-coded model name, so flag the case
# where the file's first entry is a different model; otherwise this would
# train one model with another model's hyper-parameters without any notice.
if all_training_args_as_list[model_id] is not use_args:
    print(f"Warning: use_args was not taken from the '{model_id}' entry of the training args file.")
output_dir = f"{working_dir}/outputs/task{task_id}_{model_id}"

print(f"Start to train the model {model_id}...")
try:
    train_lora(
        model_id=model_id,
        context_length=context_length,
        training_args=LoraTrainingArguments(**use_args),
        data_file_path=train_file,
        model_output_dir=output_dir,
        target_module=target_module,
        model_template=use_template,
        max_steps=3  # short demo run; the step limit replaces the epoch count
    )
except RuntimeError as e:
    print(f"Error: {e}")
    print("Proceed to the next model...")

Start to train the model Qwen/Qwen1.5-0.5B...


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


max_steps is given, it will override any value given in num_train_epochs
100%|██████████| 3/3 [00:24<00:00,  8.32s/it]


{'train_runtime': 24.9792, 'train_samples_per_second': 0.961, 'train_steps_per_second': 0.12, 'train_loss': 2.6741841634114585, 'epoch': 0.27}
Model saved to /Users/elizabethlui/work/arena_examples/outputs/task6_Qwen/Qwen1.5-0.5B
Training Completed.
