In [2]:
# Paths are relative to the notebook's working directory.
INPUT_FILE, OUTPUT_FILE = (
    # Source conversations (not read by the cells visible in this notebook).
    "../json_data/quadrant_experiment_conversations.json",
    # Conversation file that the loading cell below actually reads.
    "../json_data/line_experiment_conversations2.json",
)

In [7]:
import json

# NOTE: this cell reads OUTPUT_FILE, not INPUT_FILE. A one-off conversion that
# used to live here (now removed as dead code) wrapped every raw conversation
# list as {"conversations": [...]} and wrote the result back to OUTPUT_FILE
# with json.dump(..., ensure_ascii=False) using UTF-8. Presumably that step has
# already been run, since SupervisedDataset below indexes each item by its
# "conversations" key.
#
# Read with the same explicit encoding the file was written with, so that
# non-ASCII text does not depend on the platform's default encoding.
with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Number of conversations loaded.
len(data)

10000

In [4]:
from dataclasses import dataclass, field
import json
import math
import logging
import os
from typing import Dict, Optional, List
import torch
from torch.utils.data import Dataset
from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import transformers
from transformers import Trainer, GPTQConfig, deepspeed
from transformers.trainer_pt_utils import LabelSmoother
from torch.utils.data import DataLoader as Dataloader
import random
from PIL import Image, ImageDraw
import pandas as pd
import argparse
import csv
import json
import tempfile

# Label value written into target positions that should not be learned from
# (system/user text and padding in preprocess_sources below). Taken from
# transformers' LabelSmoother so it matches what the Trainer ignores
# (presumably -100 -- TODO confirm for the installed transformers version).
IGNORE_TOKEN_ID = LabelSmoother.ignore_index

def preprocess_sources(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    max_len: int,
    system_message: str = "You are a helpful assistant."
) -> Dict:
    """Tokenize chat conversations into fixed-length training tensors.

    Every conversation is laid out as a system block followed by one block per
    turn, each of the form::

        <|im_start|>{role}<newline>{text}<|im_end|><newline>

    In the labels, only the assistant's reply text and the structural
    ``<|im_start|>`` / ``<|im_end|>`` / newline tokens are kept; the role
    names, the system message, user text and padding are replaced by
    ``IGNORE_TOKEN_ID``.

    Args:
        sources: Iterable of conversations. Each conversation is a sequence of
            dicts with a ``"from"`` key (``"user"`` or ``"assistant"``) and a
            ``"value"`` key holding the turn text. If the first turn is not a
            user turn it is dropped. An empty conversation produces a row that
            contains only the system block.
        tokenizer: Tokenizer exposing ``im_start_id``, ``im_end_id`` and
            ``pad_token_id``, and returning ``.input_ids`` when called on text.
        max_len: Length every row is padded to, or truncated to if longer.
            Truncation is silent and can cut a turn in the middle.
        system_message: Text of the system block prepended to every
            conversation.

    Returns:
        Dict with ``input_ids`` and ``labels`` (``torch.int`` tensors, one row
        of ``max_len`` entries per conversation) and ``attention_mask``
        (boolean tensor that is False wherever ``input_ids`` equals
        ``tokenizer.pad_token_id``, including any such token inside the text).

    Raises:
        KeyError: If a turn's ``"from"`` value is not ``"user"`` or
            ``"assistant"``.
        AssertionError: If inputs and labels get out of step. The masking
            arithmetic below relies on the newline tokenizing to exactly one
            token.
    """
    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}

    im_start = tokenizer.im_start_id
    im_end = tokenizer.im_end_id
    pad_id = tokenizer.pad_token_id
    nl_tokens = tokenizer('\n').input_ids

    # Everything below is identical for every conversation / turn, so it is
    # tokenized once here instead of inside the loops.
    role_prefix_ids = {name: tokenizer(tag).input_ids for name, tag in roles.items()}
    system_ids = (
        [im_start]
        + tokenizer('system').input_ids + nl_tokens
        + tokenizer(system_message).input_ids
        + [im_end] + nl_tokens
    )
    # Keep <|im_start|>, <|im_end|> and the trailing newline; mask the rest.
    system_labels = [im_start] + [IGNORE_TOKEN_ID] * (len(system_ids) - 3) + [im_end] + nl_tokens
    assert len(system_ids) == len(system_labels)

    # Apply prompt templates
    input_ids, targets = [], []
    for source in sources:
        # Drop a leading turn that is not from the user. The `source and`
        # guard lets an empty conversation through instead of raising
        # IndexError on source[0].
        if source and roles[source[0]["from"]] != roles["user"]:
            source = source[1:]

        # Copy so the in-place extensions below never touch the cached lists.
        input_id = list(system_ids)
        target = list(system_labels)
        for sentence in source:
            speaker = sentence["from"]
            prefix_ids = role_prefix_ids[speaker]  # KeyError on an unknown role
            value_ids = tokenizer(sentence["value"]).input_ids
            _input_id = prefix_ids + nl_tokens + value_ids + [im_end] + nl_tokens
            input_id += _input_id
            if speaker == "user":
                # User turns are context only: mask everything between the
                # opening <|im_start|> and the closing <|im_end|>.
                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id) - 3) + [im_end] + nl_tokens
            elif speaker == "assistant":
                # Mask the role name and its newline (len(prefix_ids) positions
                # after <|im_start|>), then keep the reply tokens as labels.
                _target = [im_start] + [IGNORE_TOKEN_ID] * len(prefix_ids) + \
                    value_ids + [im_end] + nl_tokens
            else:
                # Only reachable if a role is added to `roles` without a
                # matching masking rule here.
                raise NotImplementedError
            target += _target
        assert len(input_id) == len(target)

        # Right-pad to max_len (a negative count adds nothing), then truncate.
        pad_len = max_len - len(input_id)
        input_id += [pad_id] * pad_len
        target += [IGNORE_TOKEN_ID] * pad_len
        input_ids.append(input_id[:max_len])
        targets.append(target[:max_len])
    input_ids = torch.tensor(input_ids, dtype=torch.int)
    targets = torch.tensor(targets, dtype=torch.int)

    return dict(
        input_ids=input_ids,
        labels=targets,
        attention_mask=input_ids.ne(tokenizer.pad_token_id),
    )


class SupervisedDataset(Dataset):
    """In-memory dataset of pre-tokenized conversations for supervised fine-tuning.

    All examples are tokenized once, at construction time, by
    ``preprocess_sources``; the resulting tensors are stored on the instance
    and indexed row by row.
    """

    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
        """Tokenize every example in ``raw_data``.

        Args:
            raw_data: Iterable of examples, each indexable by the key
                ``"conversations"``.
            tokenizer: Tokenizer forwarded to ``preprocess_sources``.
            max_len: Row length forwarded to ``preprocess_sources``.
        """
        super().__init__()

        conversations = [record["conversations"] for record in raw_data]
        encoded = preprocess_sources(conversations, tokenizer, max_len)

        self.input_ids = encoded["input_ids"]
        self.labels = encoded["labels"]
        self.attention_mask = encoded["attention_mask"]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids``, ``labels`` and ``attention_mask`` at index ``i``."""
        return {
            "input_ids": self.input_ids[i],
            "labels": self.labels[i],
            "attention_mask": self.attention_mask[i],
        }

[2025-01-15 12:00:47,211] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the tokenizer published with the Qwen-VL-Chat checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "Qwen/Qwen-VL-Chat",
    model_max_length=4096,   # same length later passed to SupervisedDataset
    padding_side="right",
    use_fast=False,
    trust_remote_code=True,  # NOTE(review): runs code shipped with the checkpoint
)

# Use the tokenizer's end-of-document id as the padding id; preprocess_sources
# pads with pad_token_id and derives the attention mask from it.
tokenizer.pad_token_id = tokenizer.eod_id

In [8]:
# Tokenize every loaded conversation up front; 4096 is the same value given to
# the tokenizer as model_max_length. Rows are padded or truncated to this length.
test = SupervisedDataset(data, tokenizer, 4096)

In [9]:
# Sanity check: the dataset holds one row per input conversation.
len(test)

10000