DATASETS TO USE
- google/Synthetic-Persona-Chat (system prompt)
- casual_data_windows
- New-Persona-New-Conversations.csv (system prompt)
- redis-corpus
- movie-dialoge

In [1]:
from unsloth import FastLanguageModel
import torch

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
    # Qwen3 new models
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    # Other very popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

_, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-4B-Instruct-2507",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


W0920 23:58:22.599000 32148 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
    PyTorch 2.8.0+cu128 with CUDA 1208 (you have 2.7.1+cu118)
    Python  3.9.13 (you have 3.11.5)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Switching to PyTorch attention since your Xformers is broken.

Unsloth: Xformers was not installed correctly.
Please install xformers separately first.
Then confirm if it's correctly installed by running:
python -m xformers.info

Longer error message:
xFormers can't load C++/CUDA extensions. xFormers was built for:
    PyTorch 2.8.0+cu128 with CUDA 1208 (you have 2.7.1+cu118)
    Python  3.9.13 (you have 3.11.5)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.9.7: Fast Qwen3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 3080 Ti. Num GPUs = 1. Max memory: 12.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.1+cu118. CUDA: 8.6. CUDA Toolkit: 11.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Movie Dialouge processing

In [39]:
import pandas as pd
import json

# Load your CSV
df = pd.read_csv("datasets/movie_dialoge.csv")

# Group by conversation_id
movie_conversations = []
for conv_id, group in df.groupby("conversation_id"):
    group = group.sort_values("id")  # ensure chronological order
    
    messages = []
    messages.append({
        "role": "system",
        "content": "You are an AI roleplaying as a movie character."
    })
    
    speaker_role_map = {}
    last_role = None
    
    for row in group.itertuples():
        # If this speaker hasn't been assigned yet, assign a role
        if row.speaker not in speaker_role_map:
            if not speaker_role_map:  # first speaker = user
                speaker_role_map[row.speaker] = "user"
            else:
                # assign the other role to the new speaker
                speaker_role_map[row.speaker] = "assistant" if "user" in speaker_role_map.values() else "user"
        
        role = speaker_role_map[row.speaker]
        
        # Append utterance
        messages.append({"role": role, "content": str(row.text)})
        last_role = role
    
    # Only keep if last turn is assistant (so model always has a target)
    if messages[-1]["role"] == "assistant":
        movie_conversations.append(messages)

Reddit casual

In [None]:
from huggingface_hub import login
from datasets import load_dataset
import os
login(os.getenv("HF_TOKEN"))
reddit_causal_dataset = load_dataset("Bossologist/reddit-conversations-processed")['train']


Reddit corpus

In [None]:
df = pd.read_csv("datasets/redis-corpus.csv")

print(len(df.groupby('conversation_id')))
conversations = []
for conv_id, group in df.groupby('conversation_id'):
    # Sort by timestamp to maintain chronological order
    group = group.sort_values('timestamp')
    
    # Skip if too few messages
    if len(group) < 2:
        continue
    
    # Build reply chain - find the root post (no reply_to)
    root_posts = group[group['reply_to'].isna()]
    if len(root_posts) == 0:
        continue  # Skip if no root post
    
    # Take the first root post
    root_post = root_posts.iloc[0]
    
    # Build the reply chain by following reply_to relationships
    def build_reply_chain(start_id, group):
        chain = []
        current_id = start_id
        
        while current_id is not None:
            # Find the message with this ID
            current_msg = group[group['id'] == current_id]
            if len(current_msg) == 0:
                break
            
            current_msg = current_msg.iloc[0]
            chain.append(current_msg)
            
            # Find what this message is replying to
            current_id = current_msg['reply_to']
        
        return chain
    
    # Build the complete reply chain starting from root
    reply_chain = build_reply_chain(root_post['id'], group)
    
    # Only keep if we have a meaningful conversation (at least 2 messages)
    if len(reply_chain) < 2:
        continue
        
    # Create messages list
    messages = []
    messages.append({
        "role": "system",
        "content": "You are a helpful AI assistant participating in a Reddit discussion."
    })
    
    # Assign roles to speakers
    speaker_roles = {}
    current_role = "user"
    
    for row in reply_chain:
        speaker = row['speaker']
        
        # Assign role to speaker if not already assigned
        if speaker not in speaker_roles:
            speaker_roles[speaker] = current_role
            current_role = "assistant" if current_role == "user" else "user"
        
        role = speaker_roles[speaker]
        
        # Add message
        messages.append({
            "role": role,
            "content": str(row['text']).encode('utf-8', errors='ignore').decode('utf-8')
        })

8286


Persona Conversations

In [46]:
df = pd.read_csv("datasets/New-Persona-New-Conversations.csv")
persona_conversations = []

for idx, row in df.iterrows():
    text = row['Best Generated Conversation']
    # Split the conversation into turns
    # We assume the conversation alternates between User 1 and User 2
    # and is formatted as 'User 1: ... User 2: ... User 1: ...' etc.
    # We'll split on 'User 1:' and 'User 2:' and keep track of roles
    import re
    # Find all speaker/message pairs
    pattern = r'(User 1:|User 2:)'  # matches the speaker labels
    splits = re.split(pattern, text)
    # splits will be like: ['', 'User 1:', ' message1', 'User 2:', ' message2', ...]
    # So we process in pairs
    messages = []
    # Use User 2 persona as system prompt
    messages.append({
        "role": "system",
        "content": "Here is your persona: " + str(row.get('user 2 personas', '')).encode('utf-8', errors='ignore').decode('utf-8')
    })
    # Start from index 1 to skip any leading empty string
    i = 1
    while i < len(splits) - 1:
        speaker = splits[i].strip()
        content = splits[i+1].strip()
        if not content:
            i += 2
            continue
        if speaker == "User 1:":
            role = "user"
        elif speaker == "User 2:":
            role = "assistant"
        else:
            i += 2
            continue
        messages.append({
            "role": role,
            "content": content.encode('utf-8', errors='ignore').decode('utf-8')
        })
        i += 2
    persona_conversations.append(messages)
persona_conversations[:10]

[[{'role': 'system',
   'content': "Here is your persona: I am comfortable with the weather, and enjoy spending time outdoors.\nI lost my dog when I was 10 years old.\ni like listening to blues music.\nI've been able to spend more time outdoors with my family since joining the hiking group.\nI dance for an hour every day to Prince songs."},
  {'role': 'user', 'content': 'Hi! How are you?'},
  {'role': 'assistant',
   'content': 'Good, thanks for asking! How about yourself?'},
  {'role': 'user',
   'content': "I'm doing pretty well.  I'm excited to be moving to a new city soon!"},
  {'role': 'assistant',
   'content': "Oh that's awesome! What city are you moving to?"},
  {'role': 'user',
   'content': "I'm moving to Portland, Oregon.  I'm going to be pursuing my culinary dreams there."},
  {'role': 'assistant',
   'content': "That sounds amazing! I love Portland.  I'm originally from there."},
  {'role': 'user',
   'content': "Really? That's so cool!  Do you still live there?"},
  {'rol

Google Personas

In [42]:
from datasets import load_dataset
google_persona_dataset = load_dataset("google/Synthetic-Persona-Chat", split = "train")

In [43]:
df = google_persona_dataset.to_pandas()
google_persona_conversations = []

for idx, row in df.iterrows():
    text = row['Best Generated Conversation']
    # Split the conversation into turns
    # We assume the conversation alternates between User 1 and User 2
    # and is formatted as 'User 1: ... User 2: ... User 1: ...' etc.
    # We'll split on 'User 1:' and 'User 2:' and keep track of roles
    import re
    # Find all speaker/message pairs
    pattern = r'(User 1:|User 2:)'  # matches the speaker labels
    splits = re.split(pattern, text)
    # splits will be like: ['', 'User 1:', ' message1', 'User 2:', ' message2', ...]
    # So we process in pairs
    messages = []
    # Use User 2 persona as system prompt
    messages.append({
        "role": "system",
        "content": "Here is your persona: " + str(row.get('user 2 personas', '')).encode('utf-8', errors='ignore').decode('utf-8')
    })
    # Start from index 1 to skip any leading empty string
    i = 1
    while i < len(splits) - 1:
        speaker = splits[i].strip()
        content = splits[i+1].strip()
        if not content:
            i += 2
            continue
        if speaker == "User 1:":
            role = "user"
        elif speaker == "User 2:":
            role = "assistant"
        else:
            i += 2
            continue
        messages.append({
            "role": role,
            "content": content.encode('utf-8', errors='ignore').decode('utf-8')
        })
        i += 2
    google_persona_conversations.append(messages)
google_persona_conversations[:10]

[[{'role': 'system',
   'content': 'Here is your persona: My favorite drink is iced coffee.\nI have a black belt in karate.\nI m in a jazz band and play the saxophone.\nI vacation along lake michigan every summer.'},
  {'role': 'user', 'content': "Hi! I'm [user 1's name]."},
  {'role': 'assistant', 'content': "Hi [user 1's name], I'm [user 2's name]."},
  {'role': 'user', 'content': 'What do you do for fun?'},
  {'role': 'assistant',
   'content': 'I like to play video games, go to the beach, and read.'},
  {'role': 'user',
   'content': "I like to play video games too! I'm not much of a reader, though."},
  {'role': 'assistant', 'content': 'What video games do you like to play?'},
  {'role': 'user',
   'content': "I like to play a lot of different games, but I'm really into competitive online games right now."},
  {'role': 'assistant',
   'content': "I'm not really into competitive games, I like to play more relaxing games."},
  {'role': 'user',
   'content': "That's cool. What kind o

Dataset previews

In [65]:
print("persona_conversations", len(persona_conversations), persona_conversations[:2])
print("movie_conversations", len(movie_conversations), movie_conversations[:2])
print("reddit_causal_dataset", len(reddit_causal_dataset), reddit_causal_dataset[:2])
print("google_persona_conversations", len(google_persona_conversations), google_persona_conversations[:2])


persona_conversations 11001 [[{'role': 'system', 'content': "Here is your persona: I am comfortable with the weather, and enjoy spending time outdoors.\nI lost my dog when I was 10 years old.\ni like listening to blues music.\nI've been able to spend more time outdoors with my family since joining the hiking group.\nI dance for an hour every day to Prince songs."}, {'role': 'user', 'content': 'Hi! How are you?'}, {'role': 'assistant', 'content': 'Good, thanks for asking! How about yourself?'}, {'role': 'user', 'content': "I'm doing pretty well.  I'm excited to be moving to a new city soon!"}, {'role': 'assistant', 'content': "Oh that's awesome! What city are you moving to?"}, {'role': 'user', 'content': "I'm moving to Portland, Oregon.  I'm going to be pursuing my culinary dreams there."}, {'role': 'assistant', 'content': "That sounds amazing! I love Portland.  I'm originally from there."}, {'role': 'user', 'content': "Really? That's so cool!  Do you still live there?"}, {'role': 'assi

In [60]:
from datasets import Dataset

all_conversations = persona_conversations + movie_conversations + google_persona_conversations

conversations = [tokenizer.apply_chat_template(conv, tokenize=False) for conv in all_conversations]
print(conversations[:5])
conversations_dataset = Dataset.from_dict({"text": conversations})

["<|im_start|>system\nHere is your persona: I am comfortable with the weather, and enjoy spending time outdoors.\nI lost my dog when I was 10 years old.\ni like listening to blues music.\nI've been able to spend more time outdoors with my family since joining the hiking group.\nI dance for an hour every day to Prince songs.<|im_end|>\n<|im_start|>user\nHi! How are you?<|im_end|>\n<|im_start|>assistant\nGood, thanks for asking! How about yourself?<|im_end|>\n<|im_start|>user\nI'm doing pretty well.  I'm excited to be moving to a new city soon!<|im_end|>\n<|im_start|>assistant\nOh that's awesome! What city are you moving to?<|im_end|>\n<|im_start|>user\nI'm moving to Portland, Oregon.  I'm going to be pursuing my culinary dreams there.<|im_end|>\n<|im_start|>assistant\nThat sounds amazing! I love Portland.  I'm originally from there.<|im_end|>\n<|im_start|>user\nReally? That's so cool!  Do you still live there?<|im_end|>\n<|im_start|>assistant\nNo, I moved away a few years ago.  But I st

INSTRUCTIONAL DATASET

In [56]:
from datasets import load_dataset
openorca_dataset = load_dataset("Open-Orca/OpenOrca", split = "train[1:100000]")

In [57]:
df = openorca_dataset.to_pandas()
openorca_conversations = []
for idx, row in df.iterrows():
    messages = []
    messages.append({
        "role": "system",
        "content": row.get('system_prompt', 'You are a helpful AI assistant.')
    })
    messages.append({
        "role": "user",
        "content": row.get('question', 'What is the capital of France?')
    })
    messages.append({
        "role": "assistant",
        "content": row.get('response', 'The capital of France is Paris.')
    })
    openorca_conversations.append(messages)
openorca_conversations[:10]

[[{'role': 'system',
   'content': 'You are an AI assistant. You will be given a task. You must generate a detailed and long answer.'},
  {'role': 'user',
   'content': 'Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One'},
  {'role': 'assistant',
   'content': 'Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.'}],
 [{'role': 'system',
   'content': 'You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.'},
  {'role': 'user',
   'content': 'What happens next in this paragraph?\n\nShe then rubs a needle on a cotton ball then pushing it onto a pencil and wrapping thread around it. She then holds up a box of a product and then pouring several liquids into a bowl. she\nCh

In [59]:
instructions = [tokenizer.apply_chat_template(conv, tokenize=False) for conv in openorca_conversations]
print(instructions[:5])
instruct_dataset = Dataset.from_dict({"text": instructions})

['<|im_start|>system\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\n<|im_start|>user\nGenerate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nMidsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.<|im_end|>\n', '<|im_start|>system\nYou are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.<|im_end|>\n<|im_start|>user\nWhat happens next in this paragraph?\n\nShe then rubs a needle on a cotton ball then pushing it onto a pencil and wrapping thread around it. She then holds up a box of a product and then pouring several liquids into a bowl. she\nChoose your answer fro

In [66]:
print("instruct_dataset", len(instruct_dataset), instruct_dataset[:2])
print("conversations_dataset", len(conversations_dataset), conversations_dataset[:2])
print("reddit_causal_dataset", len(reddit_causal_dataset), reddit_causal_dataset[:2])

instruct_dataset 99999 {'text': ['<|im_start|>system\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\n<|im_start|>user\nGenerate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nMidsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.<|im_end|>\n', '<|im_start|>system\nYou are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.<|im_end|>\n<|im_start|>user\nWhat happens next in this paragraph?\n\nShe then rubs a needle on a cotton ball then pushing it onto a pencil and wrapping thread around it. She then holds up a box of a product and then pouring several liquids into a b

In [67]:
from datasets import concatenate_datasets

# Take first 50k from instruct_dataset
instruct_50k = instruct_dataset.select(range(min(50000, len(instruct_dataset))))

# Combine all datasets
combined_dataset = concatenate_datasets([instruct_50k, conversations_dataset, reddit_causal_dataset])

print("combined_dataset", len(combined_dataset))
print(combined_dataset[:3])


combined_dataset 180939
{'text': ['<|im_start|>system\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\n<|im_start|>user\nGenerate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nMidsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.<|im_end|>\n', '<|im_start|>system\nYou are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.<|im_end|>\n<|im_start|>user\nWhat happens next in this paragraph?\n\nShe then rubs a needle on a cotton ball then pushing it onto a pencil and wrapping thread around it. She then holds up a box of a product and then pouring several liquids into a 

In [69]:
combined_dataset.push_to_hub("Bossologist/general_Qwen3_ft_dataset")

Creating parquet from Arrow format: 100%|██████████| 181/181 [00:00<00:00, 348.31ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:17<00:00, 17.72s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Bossologist/general_Qwen3_ft_dataset/commit/2b8569b816f620f7dcdb88cd47b03f52b77d6df6', commit_message='Upload dataset', commit_description='', oid='2b8569b816f620f7dcdb88cd47b03f52b77d6df6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Bossologist/general_Qwen3_ft_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Bossologist/general_Qwen3_ft_dataset'), pr_revision=None, pr_num=None)