In [None]:
"""
prepare_dataset.py
-------------------
Downloads and merges DailyDialog + EmpatheticDialogues datasets
into a single human_chat.txt file for training Human Communication GPT.
Now supports Hugging Face's new `trust_remote_code=True` requirement.
"""

import os
from datasets import load_dataset

def safe_load_dataset(name, alt_name=None):
    """Try to load dataset with trust_remote_code=True, fallback if needed."""
    try:
        print(f"üîπ Loading dataset: {name}")
        return load_dataset(name, trust_remote_code=True)
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not load {name}: {e}")
        if alt_name:
            print(f"üîÅ Trying alternate dataset: {alt_name}")
            try:
                return load_dataset(alt_name, trust_remote_code=True)
            except Exception as e2:
                print(f"‚ö†Ô∏è  Alternate failed too: {e2}")
        return None


def main():
    daily = safe_load_dataset("RoskoN/dailydialog", alt_name="daily_dialog")
    empathetic = safe_load_dataset("KoTfi/empathetic_dialogues_parquet", alt_name="empathetic_dialogues")

    dialogs = []

    if daily:
        print("‚úÖ Processing DailyDialog data...")
        split_name = "train" if "train" in daily else list(daily.keys())[0]
        for d in daily[split_name]:
            if isinstance(d, dict):
                if "dialog" in d:
                    dialogs.append("\n".join(d["dialog"]))
                elif "utterances" in d:
                    dialogs.append("\n".join(d["utterances"]))
    else:
        print("‚ö†Ô∏è  No DailyDialog data loaded.")

    if empathetic:
        print("‚úÖ Processing EmpatheticDialogues data...")
        split_name = "train" if "train" in empathetic else list(empathetic.keys())[0]
        for d in empathetic[split_name]:
            if isinstance(d, dict):
                context = d.get("context", "")
                utter = d.get("utterance", "")
                dialogs.append(f"{context}\n{utter}")
    else:
        print("‚ö†Ô∏è  No EmpatheticDialogues data loaded.")

    print(f"üß© Combining {len(dialogs)} dialogues...")
    text = "\n\n".join(dialogs)

    os.makedirs("data", exist_ok=True)
    with open("data/human_chat.txt", "w", encoding="utf-8") as f:
        f.write(text)

    print("‚úÖ Dataset saved to data/human_chat.txt")
    print(f"üì¶ Total characters: {len(text):,}")

if __name__ == "__main__":
    main()


üîπ Loading dataset: RoskoN/dailydialog


Repo card metadata block was not found. Setting CardData to empty.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 11118 examples [00:00, 18977.21 examples/s]
Generating validation split: 1000 examples [00:00, 15632.13 examples/s]
Generating test split: 1000 examples [00:00, 15906.37 examples/s]


üîπ Loading dataset: KoTfi/empathetic_dialogues_parquet
‚ö†Ô∏è  Could not load KoTfi/empathetic_dialogues_parquet: Dataset 'KoTfi/empathetic_dialogues_parquet' doesn't exist on the Hub or cannot be accessed.
üîÅ Trying alternate dataset: empathetic_dialogues


Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28.0M/28.0M [00:06<00:00, 4.53MB/s]
Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 76673/76673 [00:03<00:00, 22042.36 examples/s]
Generating validation split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12030/12030 [00:01<00:00, 10572.44 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10943/10943 [00:00<00:00, 11779.14 examples/s]


‚úÖ Processing DailyDialog data...
‚úÖ Processing EmpatheticDialogues data...
üß© Combining 87791 dialogues...
‚úÖ Dataset saved to data/human_chat.txt
üì¶ Total characters: 12,950,920


In [7]:
from datasets import load_dataset
import re, os

# ----------------------- Config -----------------------
os.makedirs("data", exist_ok=True)
dialogs = []
total_chars = 0

datasets_to_load = [
    ("RoskoN/dailydialog", None),
    ("facebook/empathetic_dialogues", None),
    ("allenai/blended_skill_talk", None),
    ("bavard/personachat_truecased", None),
    ("open_subtitles", "en-hi"),
    ("multi_woz_v22", None),
]

def clean(text):
    text = text.replace("\t", " ").replace("\r", "")
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# ----------------------- Merge Loop -----------------------
for name, subset in datasets_to_load:
    try:
        print(f"\nüì• Loading {name} ...")
        ds = load_dataset(name, subset, trust_remote_code=True)
        count_before = len(dialogs)

        for split in ds.keys():
            for d in ds[split]:
                text = None
                if "dialog" in d:
                    dialog_data = d["dialog"]
                    if isinstance(dialog_data, list):
                        text = "\n".join(dialog_data)
                    elif isinstance(dialog_data, str):
                        text = dialog_data

                elif "utterances" in d:
                    text = "\n".join(
                        u["text"] for u in d["utterances"] if "text" in u
                    )

                elif "context" in d and "utterance" in d:
                    text = f"{d['context']}\n{d['utterance']}"

                elif "text" in d:
                    text = d["text"]

                elif "response" in d:
                    text = d["response"]

                if text and isinstance(text, str):
                    t = clean(text)
                    if len(t) > 10:
                        dialogs.append(t)
                        total_chars += len(t)

        added = len(dialogs) - count_before
        print(f"‚úÖ {name}: added {added:,} dialogues, total ~{total_chars/1e6:.2f} MB")

    except Exception as e:
        print(f"‚ö†Ô∏è Skipped {name} due to error:\n{e}\n")

# ----------------------- Save -----------------------
merged_text = "\n\n".join(dialogs)
output_path = "data/human_chat_1gb.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write(merged_text)

print(f"\n‚úÖ Saved merged dataset: {output_path}")
print(f"üì¶ Final size: {len(merged_text.encode('utf-8'))/1e6:.2f} MB ({len(dialogs):,} dialogues)")
print("\nüéâ Done! You can now train your Human Communication GPT.")



üì• Loading RoskoN/dailydialog ...


Repo card metadata block was not found. Setting CardData to empty.


‚ö†Ô∏è Skipped RoskoN/dailydialog due to error:
string indices must be integers


üì• Loading facebook/empathetic_dialogues ...
‚úÖ facebook/empathetic_dialogues: added 99,530 dialogues, total ~9.71 MB

üì• Loading allenai/blended_skill_talk ...
‚ö†Ô∏è Skipped allenai/blended_skill_talk due to error:
Dataset 'allenai/blended_skill_talk' doesn't exist on the Hub or cannot be accessed.


üì• Loading bavard/personachat_truecased ...


Repo card metadata block was not found. Setting CardData to empty.


‚úÖ bavard/personachat_truecased: added 0 dialogues, total ~9.71 MB

üì• Loading open_subtitles ...


Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.97M/2.97M [00:01<00:00, 1.56MB/s]
Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 93016/93016 [00:05<00:00, 16130.09 examples/s]


‚úÖ open_subtitles: added 0 dialogues, total ~9.71 MB

üì• Loading multi_woz_v22 ...
‚úÖ multi_woz_v22: added 0 dialogues, total ~9.71 MB

‚úÖ Saved merged dataset: data/human_chat_1gb.txt
üì¶ Final size: 9.91 MB (99,530 dialogues)

üéâ Done! You can now train your Human Communication GPT.
