In [2]:
# scripts/rebuild_from_csv.py
import csv, json, os
from tqdm import tqdm

os.makedirs("data", exist_ok=True)

def csv_to_jsonl(csv_path, out_path, user_col="context", reply_col="utterance", max_reply_words=80):
    written = 0
    with open(csv_path, newline='', encoding='utf-8') as f_in, open(out_path, 'w', encoding='utf-8') as f_out:
        reader = csv.DictReader(f_in)
        for row in tqdm(reader, desc=f"Processing {os.path.basename(csv_path)}"):
            user = row.get(user_col, "").strip()
            reply = row.get(reply_col, "").strip()
            if not user or not reply:
                continue
            # basic normalization of whitespace
            user = " ".join(user.split())
            reply = " ".join(reply.split())
            # filter extremely long replies (optional)
            if len(reply.split()) > max_reply_words:
                continue
            j = {"user": user, "compatika": reply}
            f_out.write(json.dumps(j, ensure_ascii=False) + "\n")
            written += 1
    print(f"Saved {written} examples to {out_path}")

# adjust paths to your CSV files
csv_to_jsonl("datas/empathetic_dialogues/train.csv", "data/train.jsonl")
csv_to_jsonl("datas/empathetic_dialogues/valid.csv", "data/val.jsonl")
csv_to_jsonl("datas/empathetic_dialogues/test.csv", "data/test.jsonl")


Processing train.csv: 76673it [00:00, 84456.22it/s]


Saved 76600 examples to data/train.jsonl


Processing valid.csv: 12030it [00:00, 86447.45it/s]


Saved 12019 examples to data/val.jsonl


Processing test.csv: 10943it [00:00, 81132.05it/s]

Saved 10932 examples to data/test.jsonl





In [4]:
# scripts/clean_jsonl.py
import json, os, re
from tqdm import tqdm

os.makedirs("data/cleaned", exist_ok=True)

REPLACEMENTS = [
    (r"_comma_", ","),
    (r"_apostrophe_", "'"),
    (r"_quote_", '"'),
    (r" +,", ","),        # spaces before commas
    (r"\s+([?.!,])", r"\1"), # fix spacing before punctuation
]

def clean_text(s):
    s = s.strip()
    for pat, repl in REPLACEMENTS:
        s = re.sub(pat, repl, s)
    # fix multiple spaces
    s = re.sub(r"\s+", " ", s)
    # fix spacing around parentheses and stray underscores
    s = s.replace(" _comma_", ",").replace("_", "")
    return s.strip()

def clean_file(infile, outfile, max_reply_words=80):
    kept = 0
    with open(infile, 'r', encoding='utf-8') as fi, open(outfile, 'w', encoding='utf-8') as fo:
        for line in tqdm(fi):
            try:
                j = json.loads(line)
                user = clean_text(j.get("user",""))
                reply = clean_text(j.get("compatika",""))
                if not user or not reply:
                    continue
                # enforce reply length limit
                if len(reply.split()) > max_reply_words:
                    # shorten by taking first 40 words + "..."
                    reply = " ".join(reply.split()[:40]).strip() + "..."
                fo.write(json.dumps({"user": user, "compatika": reply}, ensure_ascii=False) + "\n")
                kept += 1
            except Exception:
                continue
    print(f"Saved {kept} cleaned examples to {outfile}")

# clean all rebuilt files
clean_file("data/train.jsonl", "data/cleaned/train.jsonl")
clean_file("data/val.jsonl", "data/cleaned/val.jsonl")
clean_file("data/test.jsonl", "data/cleaned/test.jsonl")


0it [00:00, ?it/s]

76600it [00:02, 37178.01it/s]


Saved 76600 cleaned examples to data/cleaned/train.jsonl


12019it [00:00, 37222.04it/s]


Saved 12019 cleaned examples to data/cleaned/val.jsonl


10932it [00:00, 35804.55it/s]

Saved 10932 cleaned examples to data/cleaned/test.jsonl





In [5]:
# scripts/dedupe.py
import json
from tqdm import tqdm
seen = set()
outf = open("data/cleaned/train_dedup.jsonl","w",encoding="utf-8")
count=0
with open("data/cleaned/train.jsonl","r",encoding="utf-8") as f:
    for line in tqdm(f):
        j = json.loads(line)
        key = (j["user"], j["compatika"])
        if key in seen: continue
        seen.add(key)
        outf.write(line)
        count+=1
outf.close()
print("deduped count:", count)


76600it [00:00, 248655.23it/s]

deduped count: 76066





In [6]:
import json
for i, path in enumerate(["data/cleaned/train.jsonl","data/cleaned/val.jsonl"]):
    print("---- preview", path, "----")
    with open(path,'r',encoding='utf-8') as f:
        for n,line in enumerate(f):
            if n>=5: break
            j=json.loads(line)
            print(f"Sample {n+1}:\n USER: {j['user']}\n COMPATIKA: {j['compatika']}\n")


---- preview data/cleaned/train.jsonl ----
Sample 1:
 USER: sentimental
 COMPATIKA: I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.

Sample 2:
 USER: sentimental
 COMPATIKA: Was this a friend you were in love with, or just a best friend?

Sample 3:
 USER: sentimental
 COMPATIKA: This was a best friend. I miss her.

Sample 4:
 USER: sentimental
 COMPATIKA: Where has she gone?

Sample 5:
 USER: sentimental
 COMPATIKA: We no longer talk.

---- preview data/cleaned/val.jsonl ----
Sample 1:
 USER: terrified
 COMPATIKA: Today,as i was leaving for work in the morning,i had a tire burst in the middle of a busy road. That scared the hell out of me!

Sample 2:
 USER: terrified
 COMPATIKA: Are you fine now?

Sample 3:
 USER: terrified
 COMPATIKA: Yeah,i'm doing alright now, but with minor injuries.

Sample 4:
 USER: terrified
 COMPATIKA: Cool :) Is you

In [7]:
import csv
with open("datas/empathetic_dialogues/train.csv", newline='', encoding='utf-8') as f:
    r = csv.reader(f)
    headers = next(r)
    print("Headers:", headers)
    first = next(r)
    print("First row example:", first)


Headers: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags']
First row example: ['hit:0_conv:1', '1', 'sentimental', 'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.', '1', 'I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people_comma_ we felt like the only people in the world.', '5|5|5_2|2|5', '']


In [8]:
import csv, json, os
from tqdm import tqdm

os.makedirs("data", exist_ok=True)

def csv_to_jsonl(csv_path, out_path, user_col="prompt", reply_col="utterance", max_reply_words=80):
    written = 0
    with open(csv_path, newline='', encoding='utf-8') as f_in, open(out_path, 'w', encoding='utf-8') as f_out:
        reader = csv.DictReader(f_in)
        for row in tqdm(reader, desc=f"Processing {os.path.basename(csv_path)}"):
            user = row.get(user_col, "").strip()
            reply = row.get(reply_col, "").strip()
            if not user or not reply:
                continue
            # remove _comma_ etc
            user = user.replace("_comma_", ",")
            reply = reply.replace("_comma_", ",")
            if len(reply.split()) > max_reply_words:
                continue
            j = {"user": user, "compatika": reply}
            f_out.write(json.dumps(j, ensure_ascii=False) + "\n")
            written += 1
    print(f"‚úÖ Saved {written} examples to {out_path}")

# update paths as needed
csv_to_jsonl("datas/empathetic_dialogues/train.csv", "data/train.jsonl")
csv_to_jsonl("datas/empathetic_dialogues/valid.csv", "data/val.jsonl")
csv_to_jsonl("datas/empathetic_dialogues/test.csv", "data/test.jsonl")


Processing train.csv: 76673it [00:00, 89291.26it/s]


‚úÖ Saved 76600 examples to data/train.jsonl


Processing valid.csv: 12030it [00:00, 59126.55it/s]


‚úÖ Saved 12019 examples to data/val.jsonl


Processing test.csv: 10943it [00:00, 81206.27it/s]

‚úÖ Saved 10932 examples to data/test.jsonl





In [9]:
import json

with open("data/train.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        j = json.loads(line)
        print(f"Sample {i+1}:\n USER: {j['user']}\n COMPATIKA: {j['compatika']}\n")
        if i >= 4:
            break


Sample 1:
 USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
 COMPATIKA: I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.

Sample 2:
 USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
 COMPATIKA: Was this a friend you were in love with, or just a best friend?

Sample 3:
 USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
 COMPATIKA: This was a best friend. I miss her.

Sample 4:
 USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
 COMPATIKA: Where has she gone?

Sample 5:
 USER: I remember going to the fireworks with my best friend. There was

In [10]:
# scripts/make_text_for_tokenizer.py
import json
import os
from tqdm import tqdm
import re

# Input and output paths
input_files = [
    "data/train.jsonl",
    "data/val.jsonl"
]
os.makedirs("data", exist_ok=True)
output_path = "data/text_for_tok.txt"

def clean_text(text):
    """Basic cleaning and normalization for tokenizer data"""
    text = text.strip()
    text = text.replace("_comma_", ",")
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s([?.!,])", r"\1", text)
    text = text.replace(" ,", ",")
    text = text.replace(" .", ".")
    return text.strip()

lines = []
for path in input_files:
    print(f"üìñ Reading {path}")
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc=f"Processing {os.path.basename(path)}"):
            try:
                j = json.loads(line)
                user = clean_text(j.get("user", ""))
                compatika = clean_text(j.get("compatika", ""))
                if not user or not compatika:
                    continue
                # Merge user + compatika into one line
                merged = f"{user} {compatika}"
                lines.append(merged)
            except Exception:
                continue

# Remove duplicates
print(f"üßπ Removing duplicates...")
unique_lines = list(set(lines))
print(f"üßæ Writing {len(unique_lines)} unique lines to {output_path}")

with open(output_path, "w", encoding="utf-8") as out:
    for line in unique_lines:
        out.write(line.strip() + "\n")

print("‚úÖ Done! Your tokenizer training file is ready at:", output_path)


üìñ Reading data/train.jsonl


Processing train.jsonl: 76600it [00:01, 48746.57it/s]


üìñ Reading data/val.jsonl


Processing val.jsonl: 12019it [00:00, 46605.40it/s]


üßπ Removing duplicates...
üßæ Writing 88357 unique lines to data/text_for_tok.txt
‚úÖ Done! Your tokenizer training file is ready at: data/text_for_tok.txt


In [12]:
with open("data/text_for_tok.txt", "r", encoding="utf-8") as f:
    for i in range(5):
        print(f.readline().strip())


I broked something at home and everyonw blamed my brother. I didn't say anything. I feel bad about it. That is terrible! Why did you blame him?
i like person who are honestly with me That's a good way of saying live a good life.
I stole from my parents as a kid and got caught. I felt so bad afterwards. I would feel guilty too if I was in your shoes. Did you feel bad because you stole or bad because you got caught?
Doing a race with the wrong shoes. Good thing you found someone who had as big of feet as you do!
I am so scary that my manager doesn't keep his promise for my promotion. No, no reason to he will do it.


In [13]:
# scripts/make_text_for_tokenizer.py (clean single-line version)
import json, os, re
from tqdm import tqdm

os.makedirs("data", exist_ok=True)
input_files = ["data/train.jsonl", "data/val.jsonl"]
output_path = "data/text_for_tok.txt"

def clean_text(text):
    text = text.replace("_comma_", ",")
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s([?.!,])", r"\1", text)
    text = text.replace(" ,", ",").replace(" .", ".")
    return text.strip()

lines = set()
for path in input_files:
    print(f"üìñ Reading {path}")
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc=f"Processing {os.path.basename(path)}"):
            try:
                j = json.loads(line)
                user = clean_text(j.get("user", ""))
                compatika = clean_text(j.get("compatika", ""))
                if not user or not compatika:
                    continue
                # add each sentence separately (not merged)
                lines.add(user)
                lines.add(compatika)
            except Exception:
                continue

print(f"üßæ Writing {len(lines)} unique lines to {output_path}")
with open(output_path, "w", encoding="utf-8") as f:
    for line in lines:
        f.write(line + "\n")

print("‚úÖ Clean tokenizer file ready:", output_path)


üìñ Reading data/train.jsonl


Processing train.jsonl: 76600it [00:01, 51290.87it/s]


üìñ Reading data/val.jsonl


Processing val.jsonl: 12019it [00:00, 52296.52it/s]

üßæ Writing 102044 unique lines to data/text_for_tok.txt
‚úÖ Clean tokenizer file ready: data/text_for_tok.txt





In [14]:
with open("data/text_for_tok.txt", "r", encoding="utf-8") as f:
    for i in range(10):
        print(f.readline().strip())


I want them to be able to go to the store and buy everything with no worries.
I was worried driving home tonight. People were swerving all over
I had an interview for a hospital I want to volunteer at but I felt a little under-qualified
Sorry about caps lock i didn't realize. And yea i think i'm going to propose here shortly.
I was sad when my cat died. He was so sick
The barber shop is totally worth it. They do such a good job.
Wow what kind of toys did you find
i was pissed when i saw someone left my gate open
Got seasons pass to an amusement park.
Yeah, that sounds like a really good idea!


In [15]:
import json
from tqdm import tqdm

def verify_jsonl(path):
    issues = 0
    total = 0
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(tqdm(f, desc=f"Checking {path}")):
            total += 1
            try:
                j = json.loads(line)
                if not isinstance(j.get("user"), str) or not isinstance(j.get("compatika"), str):
                    print(f"‚ùå Line {i+1}: non-string fields")
                    issues += 1
                if not j["user"].strip() or not j["compatika"].strip():
                    print(f"‚ö†Ô∏è Line {i+1}: empty text")
                    issues += 1
                if "_comma_" in j["user"] or "_comma_" in j["compatika"]:
                    print(f"‚ö†Ô∏è Line {i+1}: found _comma_ artifact")
                    issues += 1
            except Exception as e:
                print(f"‚ùå Line {i+1}: invalid JSON ({e})")
                issues += 1
    print(f"\n‚úÖ Checked {total} lines. Issues found: {issues}")

verify_jsonl("data/train.jsonl")
verify_jsonl("data/val.jsonl")


Checking data/train.jsonl: 76600it [00:00, 260384.87it/s]



‚úÖ Checked 76600 lines. Issues found: 0


Checking data/val.jsonl: 12019it [00:00, 207128.46it/s]


‚úÖ Checked 12019 lines. Issues found: 0





In [16]:
import json
with open("data/train.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        j = json.loads(line)
        print(f"USER: {j['user']}")
        print(f"COMPATIKA: {j['compatika']}\n")
        if i >= 4: break


USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
COMPATIKA: I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.

USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
COMPATIKA: Was this a friend you were in love with, or just a best friend?

USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
COMPATIKA: This was a best friend. I miss her.

USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
COMPATIKA: Where has she gone?

USER: I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
CO

In [17]:
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tok/compatika_sp.model")

print("‚úÖ Vocab size:", sp.GetPieceSize())
sample = "I feel a bit anxious today."
ids = sp.EncodeAsIds(sample)
print("Encoded IDs:", ids)
print("Decoded text:", sp.DecodeIds(ids))


‚úÖ Vocab size: 10000
Encoded IDs: [5, 87, 9, 364, 112, 204, 4]
Decoded text: I feel a bit anxious today.


In [18]:
with open("data/text_for_tok.txt", "r", encoding="utf-8") as f:
    for i in range(10):
        print(f.readline().strip())


I want them to be able to go to the store and buy everything with no worries.
I was worried driving home tonight. People were swerving all over
I had an interview for a hospital I want to volunteer at but I felt a little under-qualified
Sorry about caps lock i didn't realize. And yea i think i'm going to propose here shortly.
I was sad when my cat died. He was so sick
The barber shop is totally worth it. They do such a good job.
Wow what kind of toys did you find
i was pissed when i saw someone left my gate open
Got seasons pass to an amusement park.
Yeah, that sounds like a really good idea!


In [4]:
import json
from tqdm import tqdm

def verify_jsonl(path):
    total = 0
    errors = 0
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(tqdm(f, desc=f"Checking {path}")):
            total += 1
            try:
                j = json.loads(line)
                if not isinstance(j.get("user"), str) or not isinstance(j.get("compatika"), str):
                    print(f"‚ùå Line {i+1}: missing or invalid fields")
                    errors += 1
            except Exception as e:
                print(f"‚ùå Line {i+1}: invalid JSON ({e})")
                errors += 1
    print(f"\n‚úÖ Checked {total} lines. Issues: {errors}")

verify_jsonl("data/val.jsonl")

Checking data/val.jsonl: 0it [00:00, ?it/s]

Checking data/val.jsonl: 12019it [00:00, 200608.61it/s]


‚úÖ Checked 12019 lines. Issues: 0





In [6]:
with open("data/text_for_tok.txt", "r", encoding="utf-8") as f:
    for i in range(10):
        print(f.readline().strip())

I want them to be able to go to the store and buy everything with no worries.
I was worried driving home tonight. People were swerving all over
I had an interview for a hospital I want to volunteer at but I felt a little under-qualified
Sorry about caps lock i didn't realize. And yea i think i'm going to propose here shortly.
I was sad when my cat died. He was so sick
The barber shop is totally worth it. They do such a good job.
Wow what kind of toys did you find
i was pissed when i saw someone left my gate open
Got seasons pass to an amusement park.
Yeah, that sounds like a really good idea!


In [7]:
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="tok/compatika_sp.model")

print("‚úÖ Vocab size:", sp.get_piece_size())
print("Sample tokens:", [sp.id_to_piece(i) for i in range(10)])
print("Encode/Decode test:")
ids = sp.encode("I feel a bit anxious today.", out_type=int)
print("Encoded:", ids)
print("Decoded:", sp.decode(ids))

‚úÖ Vocab size: 10000
Sample tokens: ['<pad>', '<unk>', '<s>', '</s>', '.', '‚ñÅI', '_', '‚ñÅto', "'", '‚ñÅa']
Encode/Decode test:
Encoded: [5, 87, 9, 364, 112, 204, 4]
Decoded: I feel a bit anxious today.


In [1]:
# quick example cleaning step
import re, json
def clean_text(t):
    t = t.replace("_comma", ",")
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"[0-9]+\|[0-9]+\|[0-9]+[_0-9|]*", "", t)  # remove rating patterns
    t = re.sub(r":\d+", "", t)  # remove stray numeric tokens
    return t.strip()

with open("data/train.jsonl","r",encoding="utf-8") as f, open("data/train_clean.jsonl","w",encoding="utf-8") as out:
    for line in f:
        s = json.loads(line)
        s["user"] = clean_text(s["user"])
        s["compatika"] = clean_text(s["compatika"])
        out.write(json.dumps(s, ensure_ascii=False) + "\n")


In [2]:
input_text = f"USER: {s['user']}\nCOMPATIKA:"
target_text = s['compatika']


In [4]:
import torch
state_dict = torch.load("compatika_v1alpha_scratch.pt")
model.load_state_dict(state_dict)
train_more_epochs(...)


  state_dict = torch.load("compatika_v1alpha_scratch.pt")


NameError: name 'model' is not defined

In [5]:
import json

unique = set()
cleaned = []
with open("data/train.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        d = json.loads(line)
        pair = (d["user"].strip().lower(), d["compatika"].strip().lower())
        if pair not in unique:
            unique.add(pair)
            cleaned.append(d)

with open("data/train_clean.jsonl", "w", encoding="utf-8") as f:
    for d in cleaned:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

print(f"‚úÖ Cleaned dataset saved with {len(cleaned)} unique pairs.")


‚úÖ Cleaned dataset saved with 76338 unique pairs.


In [11]:
import re

def clean_text(t):
    t = t.replace("_comma", ",")
    t = re.sub(r"\s+", " ", t).strip()
    return t

# Apply cleaning to both fields before saving


In [12]:
import random
lines = open("data/train_clean.jsonl", "r", encoding="utf-8").readlines()
random.shuffle(lines)
open("data/train_shuffled.jsonl", "w", encoding="utf-8").writelines(lines)


In [13]:
import json

with open("data/train_clean.jsonl", "r", encoding="utf-8") as f, \
     open("data/text_for_tok.txt", "w", encoding="utf-8") as out:
    for line in f:
        sample = json.loads(line)
        out.write(sample["user"] + "\n")
        out.write(sample["compatika"] + "\n")


In [14]:
import sentencepiece as spm
spm.SentencePieceTrainer.Train(
    input='data/text_for_tok.txt',
    model_prefix='tok/compatika_sp',
    vocab_size=10000,
    model_type='unigram',
    character_coverage=1.0,
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3
)
print("‚úÖ Tokenizer retrained and saved as tok/compatika_sp.model")


RuntimeError: Permission denied: "tok/compatika_sp.model": No such file or directory Error #2

In [None]:
import sentencepiece as spm
sp = spm.SentencePieceProcessor(model_file="tok/compatika_sp.model")
text = "I feel anxious about tomorrow."
print("Encoded:", sp.encode(text, out_type=int))
print("Decoded:", sp.decode(sp.encode(text, out_type=int)))


Encoded: [4, 72, 771, 45, 254, 5]
Decoded: I feel anxious about tomorrow.


In [15]:
import sentencepiece as spm
import os

# Create output folder if not exists
os.makedirs("tok", exist_ok=True)

# Train tokenizer
spm.SentencePieceTrainer.Train(
    input='data/text_for_tok.txt',          # your dataset text
    model_prefix='tok/compatika_sp',        # output files prefix
    vocab_size=10000,                       # vocabulary size
    model_type='unigram',                   # good for dialogue data
    character_coverage=1.0,                 # cover all characters
    pad_id=0, unk_id=1, bos_id=2, eos_id=3  # reserved IDs
)

print("‚úÖ Tokenizer trained successfully!")
print("üìÑ Files generated: tok/compatika_sp.model and tok/compatika_sp.vocab")


‚úÖ Tokenizer trained successfully!
üìÑ Files generated: tok/compatika_sp.model and tok/compatika_sp.vocab


In [16]:
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="tok/compatika_sp.model")
print("‚úÖ Vocab size:", sp.get_piece_size())

sample_text = "I feel anxious about tomorrow."
encoded = sp.encode(sample_text, out_type=int)
decoded = sp.decode(encoded)

print("Encoded IDs:", encoded)
print("Decoded text:", decoded)


‚úÖ Vocab size: 10000
Encoded IDs: [4, 72, 771, 45, 254, 5]
Decoded text: I feel anxious about tomorrow.


In [18]:
import json
import random

input_path = "data/train_clean.jsonl"   # or your main dataset file
train_out = "data/train.jsonl"
val_out = "data/val.jsonl"

# 1Ô∏è‚É£ Read all samples
with open(input_path, "r", encoding="utf-8") as f:
    samples = [json.loads(line) for line in f]

# 2Ô∏è‚É£ Shuffle for randomness
random.shuffle(samples)

# 3Ô∏è‚É£ Split 90% train / 10% val (you can change ratio)
split_idx = int(0.9 * len(samples))
train_samples = samples[:split_idx]
val_samples = samples[split_idx:]

# 4Ô∏è‚É£ Write to new files
with open(train_out, "w", encoding="utf-8") as f:
    for s in train_samples:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

with open(val_out, "w", encoding="utf-8") as f:
    for s in val_samples:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

print(f"‚úÖ Split complete!")
print(f"Train samples: {len(train_samples)}")
print(f"Val samples:   {len(val_samples)}")


‚úÖ Split complete!
Train samples: 68940
Val samples:   7660
