In [None]:
"""
prepare_dataset.py
-------------------
Downloads and merges DailyDialog + EmpatheticDialogues datasets
into a single human_chat.txt file for training Human Communication GPT.
Now supports Hugging Face's new `trust_remote_code=True` requirement.
"""

import os
from datasets import load_dataset

def safe_load_dataset(name, alt_name=None):
    """Try to load dataset with trust_remote_code=True, fallback if needed."""
    try:
        print(f"üîπ Loading dataset: {name}")
        return load_dataset(name, trust_remote_code=True)
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not load {name}: {e}")
        if alt_name:
            print(f"üîÅ Trying alternate dataset: {alt_name}")
            try:
                return load_dataset(alt_name, trust_remote_code=True)
            except Exception as e2:
                print(f"‚ö†Ô∏è  Alternate failed too: {e2}")
        return None


def main():
    daily = safe_load_dataset("RoskoN/dailydialog", alt_name="daily_dialog")
    empathetic = safe_load_dataset("KoTfi/empathetic_dialogues_parquet", alt_name="empathetic_dialogues")

    dialogs = []

    if daily:
        print("‚úÖ Processing DailyDialog data...")
        split_name = "train" if "train" in daily else list(daily.keys())[0]
        for d in daily[split_name]:
            if isinstance(d, dict):
                if "dialog" in d:
                    dialogs.append("\n".join(d["dialog"]))
                elif "utterances" in d:
                    dialogs.append("\n".join(d["utterances"]))
    else:
        print("‚ö†Ô∏è  No DailyDialog data loaded.")

    if empathetic:
        print("‚úÖ Processing EmpatheticDialogues data...")
        split_name = "train" if "train" in empathetic else list(empathetic.keys())[0]
        for d in empathetic[split_name]:
            if isinstance(d, dict):
                context = d.get("context", "")
                utter = d.get("utterance", "")
                dialogs.append(f"{context}\n{utter}")
    else:
        print("‚ö†Ô∏è  No EmpatheticDialogues data loaded.")

    print(f"üß© Combining {len(dialogs)} dialogues...")
    text = "\n\n".join(dialogs)

    os.makedirs("data", exist_ok=True)
    with open("data/human_chat.txt", "w", encoding="utf-8") as f:
        f.write(text)

    print("‚úÖ Dataset saved to data/human_chat.txt")
    print(f"üì¶ Total characters: {len(text):,}")

if __name__ == "__main__":
    main()


üîπ Loading dataset: RoskoN/dailydialog


Repo card metadata block was not found. Setting CardData to empty.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 11118 examples [00:00, 18977.21 examples/s]
Generating validation split: 1000 examples [00:00, 15632.13 examples/s]
Generating test split: 1000 examples [00:00, 15906.37 examples/s]


üîπ Loading dataset: KoTfi/empathetic_dialogues_parquet
‚ö†Ô∏è  Could not load KoTfi/empathetic_dialogues_parquet: Dataset 'KoTfi/empathetic_dialogues_parquet' doesn't exist on the Hub or cannot be accessed.
üîÅ Trying alternate dataset: empathetic_dialogues


Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28.0M/28.0M [00:06<00:00, 4.53MB/s]
Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 76673/76673 [00:03<00:00, 22042.36 examples/s]
Generating validation split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12030/12030 [00:01<00:00, 10572.44 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10943/10943 [00:00<00:00, 11779.14 examples/s]


‚úÖ Processing DailyDialog data...
‚úÖ Processing EmpatheticDialogues data...
üß© Combining 87791 dialogues...
‚úÖ Dataset saved to data/human_chat.txt
üì¶ Total characters: 12,950,920


In [2]:
from datasets import load_dataset
import re, os

# ----------------------- Config -----------------------
os.makedirs("data", exist_ok=True)
dialogs = []
total_chars = 0

datasets_to_load = [
    ("RoskoN/dailydialog", None),
    ("facebook/empathetic_dialogues", None),
    ("allenai/blended_skill_talk", None),
    ("bavard/personachat_truecased", None),
    ("open_subtitles", "en-hi"),
    ("multi_woz_v22", None),
]

def clean(text):
    text = text.replace("\t", " ").replace("\r", "")
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# ----------------------- Merge Loop -----------------------
for name, subset in datasets_to_load:
    try:
        print(f"\nüì• Loading {name} ...")
        ds = load_dataset(name, subset, trust_remote_code=True)
        count_before = len(dialogs)

        for split in ds.keys():
            for d in ds[split]:
                text = None
                if "dialog" in d:
                    dialog_data = d["dialog"]
                    if isinstance(dialog_data, list):
                        text = "\n".join(dialog_data)
                    elif isinstance(dialog_data, str):
                        text = dialog_data

                elif "utterances" in d:
                    text = "\n".join(
                        u["text"] for u in d["utterances"] if "text" in u
                    )

                elif "context" in d and "utterance" in d:
                    text = f"{d['context']}\n{d['utterance']}"

                elif "text" in d:
                    text = d["text"]

                elif "response" in d:
                    text = d["response"]

                if text and isinstance(text, str):
                    t = clean(text)
                    if len(t) > 10:
                        dialogs.append(t)
                        total_chars += len(t)

        added = len(dialogs) - count_before
        print(f"‚úÖ {name}: added {added:,} dialogues, total ~{total_chars/1e6:.2f} MB")

    except Exception as e:
        print(f"‚ö†Ô∏è Skipped {name} due to error:\n{e}\n")

# ----------------------- Save -----------------------
merged_text = "\n\n".join(dialogs)
output_path = "data/human_chat_1gb.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write(merged_text)

print(f"\n‚úÖ Saved merged dataset: {output_path}")
print(f"üì¶ Final size: {len(merged_text.encode('utf-8'))/1e6:.2f} MB ({len(dialogs):,} dialogues)")
print("\nüéâ Done! You can now train your Human Communication GPT.")


`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'RoskoN/dailydialog' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üì• Loading RoskoN/dailydialog ...


Using the latest cached version of the dataset since RoskoN/dailydialog couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'full' at C:\Users\aman\.cache\huggingface\datasets\RoskoN___dailydialog\full\1.0.0\7d96d5a6afcb95cf518611d5147758f4a5991bab51dc97c3a8131b6fb7811b76 (last modified on Sat Nov  1 09:40:46 2025).
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'facebook/empathetic_dialogues' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


‚ö†Ô∏è Skipped RoskoN/dailydialog due to error:
string indices must be integers


üì• Loading facebook/empathetic_dialogues ...


Using the latest cached version of the dataset since facebook/empathetic_dialogues couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\aman\.cache\huggingface\datasets\facebook___empathetic_dialogues\default\0.1.0\09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf (last modified on Sun Nov  2 18:15:39 2025).
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'allenai/blended_skill_talk' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


‚úÖ facebook/empathetic_dialogues: added 99,530 dialogues, total ~9.71 MB

üì• Loading allenai/blended_skill_talk ...


`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'bavard/personachat_truecased' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


‚ö†Ô∏è Skipped allenai/blended_skill_talk due to error:
Dataset 'allenai/blended_skill_talk' doesn't exist on the Hub or cannot be accessed.


üì• Loading bavard/personachat_truecased ...


Using the latest cached version of the dataset since bavard/personachat_truecased couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'full' at C:\Users\aman\.cache\huggingface\datasets\bavard___personachat_truecased\full\1.0.0\73ee8f1a0d9e42255af5a8301877a2f3ac638e55b1cd9cbccca5ab7e23d2b638 (last modified on Sun Nov  2 18:16:30 2025).
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'open_subtitles' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


‚úÖ bavard/personachat_truecased: added 0 dialogues, total ~9.71 MB

üì• Loading open_subtitles ...


Using the latest cached version of the dataset since open_subtitles couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'en-hi' at C:\Users\aman\.cache\huggingface\datasets\open_subtitles\en-hi\2018.0.0\c1ec973ca4b6e588740d8f167cc0e24ea3f626e70bc7ffe467e944730500e198 (last modified on Sun Nov  2 18:23:30 2025).
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'multi_woz_v22' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


‚úÖ open_subtitles: added 0 dialogues, total ~9.71 MB

üì• Loading multi_woz_v22 ...


Using the latest cached version of the dataset since multi_woz_v22 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'v2.2_active_only' at C:\Users\aman\.cache\huggingface\datasets\multi_woz_v22\v2.2_active_only\2.2.0\6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5 (last modified on Sun Nov  2 18:18:13 2025).


‚úÖ multi_woz_v22: added 0 dialogues, total ~9.71 MB

‚úÖ Saved merged dataset: data/human_chat_1gb.txt
üì¶ Final size: 9.91 MB (99,530 dialogues)

üéâ Done! You can now train your Human Communication GPT.


In [7]:
# ============================================================
# üß† Build a 1 GB Human-Communication Dataset Automatically
# ============================================================

import os, re, shutil
from huggingface_hub import login, hf_hub_download
import pandas as pd

# ----------------------- 1Ô∏è‚É£  LOGIN --------------------------
# üëá Paste your HF token between the quotes ‚Üì‚Üì‚Üì‚Üì‚Üì
HF_TOKEN = "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"
login(token=HF_TOKEN)

os.makedirs("data", exist_ok=True)

# ----------------------- 2Ô∏è‚É£  DOWNLOAD -----------------------
downloads = [
    ("yhavinga/dailydialog-parquet", "train.parquet", "dailydialog-parquet/train.parquet"),
    ("facebook/empathetic_dialogues", "train.json", "empathetic_dialogues/train.json"),
    ("lightchen/personachat_cleaned", "train.parquet", "personachat_cleaned/train.parquet"),
    ("Helsinki-NLP/OpenSubtitles", "en_train.parquet", "opensubtitles/en_train.parquet"),
]

for repo, filename, outpath in downloads:
    try:
        print(f"\nüì• Downloading {repo}/{filename} ...")
        os.makedirs(os.path.dirname(f"data/{outpath}"), exist_ok=True)
        local_path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)
        shutil.copy(local_path, f"data/{outpath}")
        print(f"‚úÖ Saved to data/{outpath}")
    except Exception as e:
        print(f"‚ö†Ô∏è Failed {repo}: {e}")

# ----------------------- 3Ô∏è‚É£  MERGE --------------------------
dialogs, total_chars = [], 0

def clean(t):
    t = re.sub(r"\s+", " ", str(t))
    return t.strip()

def add_texts(path, columns):
    global total_chars
    if not os.path.exists(path):
        print(f"‚ö†Ô∏è {path} not found, skipping")
        return
    print(f"üìÇ Loading {path}")
    if path.endswith(".parquet"):
        df = pd.read_parquet(path)
    else:
        df = pd.read_json(path, lines=True)
    count = 0
    for col in columns:
        if col in df.columns:
            for t in df[col].dropna():
                txt = clean(t)
                if len(txt) > 10:
                    dialogs.append(txt)
                    total_chars += len(txt)
                    count += 1
    print(f"‚úÖ Added {count:,} texts, total ~{total_chars/1e6:.1f} MB")

# Each dataset‚Äôs useful columns
add_texts("data/dailydialog-parquet/train.parquet", ["dialog"])
add_texts("data/empathetic_dialogues/train.json", ["utterance", "context"])
add_texts("data/personachat_cleaned/train.parquet", ["text"])
add_texts("data/opensubtitles/en_train.parquet", ["translation", "text"])

# ----------------------- 4Ô∏è‚É£  SAVE ---------------------------
merged = "\n\n".join(dialogs)
out_path = "data/human_chat_1gb.txt"
open(out_path, "w", encoding="utf-8").write(merged)

size_mb = len(merged.encode("utf-8")) / 1e6
print(f"\n‚úÖ Saved {out_path}  ({len(dialogs):,} dialogs, ~{size_mb:.1f} MB)")
print("\nüéâ Done ‚Äî you can now train your Human Communication GPT!")



üì• Downloading yhavinga/dailydialog-parquet/train.parquet ...
‚ö†Ô∏è Failed yhavinga/dailydialog-parquet: 404 Client Error. (Request ID: Root=1-69076e45-2a8e00714e3fd219406fcbc9;b2161432-631d-4775-aab6-5f350762f554)

Repository Not Found for url: https://huggingface.co/yhavinga/dailydialog-parquet/resolve/main/train.parquet.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication

üì• Downloading facebook/empathetic_dialogues/train.json ...
‚ö†Ô∏è Failed facebook/empathetic_dialogues: 404 Client Error. (Request ID: Root=1-69076e46-1c5fd6593149b5796c96567b;dcd56eb7-9940-49b7-b17b-5eda56b85cac)

Repository Not Found for url: https://huggingface.co/facebook/empathetic_dialogues/resolve/main/train.json.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a privat

In [9]:
# ============================================================
# Build a 1-GB Human-Communication Dataset (Current Hugging Face)
# ============================================================

from datasets import load_dataset
import os, re

os.makedirs("data", exist_ok=True)
dialogs, total_chars = [], 0

datasets_to_load = [
    "daily_dialog",
    "empathetic_dialogues",
    "blended_skill_talk",
    "bavard/personachat_truecased",
    "open_subtitles",
    "multi_woz_v22",
]

def clean(txt):
    txt = re.sub(r"\s+", " ", str(txt))
    return txt.strip()

for name in datasets_to_load:
    try:
        print(f"\nüì• Loading {name} ...")
        ds = load_dataset(name)
        count_before = len(dialogs)

        for split in ds.keys():
            for d in ds[split]:
                text = None
                for key in ["dialog", "dialogue", "utterances", "text",
                            "response", "context", "utterance"]:
                    if key in d:
                        val = d[key]
                        if isinstance(val, list):
                            val = "\n".join(map(str, val))
                        text = str(val)
                        break
                if text and len(text) > 10:
                    dialogs.append(clean(text))
                    total_chars += len(text)

        added = len(dialogs) - count_before
        print(f"‚úÖ {name}: +{added:,} samples, total ~{total_chars/1e6:.1f} MB")

    except Exception as e:
        print(f"‚ö†Ô∏è  Skipped {name}: {e}")

# Save merged text
out_path = "data/human_chat_1gb.txt"
open(out_path, "w", encoding="utf-8").write("\n\n".join(dialogs))
size = len(open(out_path, "rb").read()) / 1e6
print(f"\n‚úÖ Saved {out_path}  ({len(dialogs):,} dialogs, ~{size:.1f} MB)")



üì• Loading daily_dialog ...
‚ö†Ô∏è  Skipped daily_dialog: Dataset scripts are no longer supported, but found daily_dialog.py

üì• Loading empathetic_dialogues ...


Using the latest cached version of the dataset since empathetic_dialogues couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\aman\.cache\huggingface\datasets\empathetic_dialogues\default\0.1.0\09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf (last modified on Sat Nov  1 09:41:03 2025).


‚úÖ empathetic_dialogues: +14,426 samples, total ~0.2 MB

üì• Loading blended_skill_talk ...
‚úÖ blended_skill_talk: +4,507 samples, total ~0.3 MB

üì• Loading bavard/personachat_truecased ...


Using the latest cached version of the dataset since bavard/personachat_truecased couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'full' at C:\Users\aman\.cache\huggingface\datasets\bavard___personachat_truecased\full\1.0.0\73ee8f1a0d9e42255af5a8301877a2f3ac638e55b1cd9cbccca5ab7e23d2b638 (last modified on Sun Nov  2 18:16:30 2025).


‚úÖ bavard/personachat_truecased: +0 samples, total ~0.3 MB

üì• Loading open_subtitles ...


Using the latest cached version of the dataset since open_subtitles couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'en-hi' at C:\Users\aman\.cache\huggingface\datasets\open_subtitles\en-hi\2018.0.0\c1ec973ca4b6e588740d8f167cc0e24ea3f626e70bc7ffe467e944730500e198 (last modified on Sun Nov  2 18:23:30 2025).


‚úÖ open_subtitles: +0 samples, total ~0.3 MB

üì• Loading multi_woz_v22 ...


Using the latest cached version of the dataset since multi_woz_v22 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'v2.2_active_only' at C:\Users\aman\.cache\huggingface\datasets\multi_woz_v22\v2.2_active_only\2.2.0\6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5 (last modified on Sun Nov  2 18:18:13 2025).


‚úÖ multi_woz_v22: +0 samples, total ~0.3 MB

‚úÖ Saved data/human_chat_1gb.txt  (18,933 dialogs, ~0.3 MB)


In [4]:
import os

path = "data/dailydialog-parquet/train/dialogues_train.txt"

if os.path.exists(path):
    size_mb = os.path.getsize(path) / 1e6
    print(f"‚úÖ File found: {path} ({size_mb:.2f} MB)")

    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        print(f"üìÑ Total lines: {len(lines)}")
        print("üß© Sample content:")
        print("\n".join(lines[:5]))
else:
    print("‚ùå File missing ‚Äî check your download path.")


‚úÖ File found: data/dailydialog-parquet/train/dialogues_train.txt (6.04 MB)
üìÑ Total lines: 11118
üß© Sample content:
Say , Jim , how about going for a few beers after dinner ? __eou__ You know that is tempting but is really not good for our fitness . __eou__ What do you mean ? It will help us to relax . __eou__ Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? __eou__ I guess you are right.But what shall we do ? I don't feel like sitting at home . __eou__ I suggest a walk over to the gym where we can play singsong and meet some of our friends . __eou__ That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . __eou__ Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . __eou__ Good.Let ' s go now . __eou__ All right . __eou__

Can you do push-ups ? __eou__ Of course I can . It's a piece of cake ! Believe