In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import json
import random
import gzip

# path setting
input_path = "./ultrafeedback_binarized_llamafactory.tar.gz"
output_path = "output_path.jsonl"

open_fn = gzip.open if input_path.endswith(".gz") else open

data = []
with open_fn(input_path, "rt", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue 
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            print("parsing error", line[:50])
            continue

print(f"✅ total jsonl line: {len(data)}")

sample_size = 10000
sampled_data = random.sample(data, sample_size)

with open(output_path, "w", encoding="utf-8") as f:
    for item in sampled_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"🎯 {sample_size} samples saved: {output_path}")

parsing error ultrafeedback_binarized_llamafactory.jsonl        


parsing error                                                   
✅ total jsonl line: 64247
🎯 10000 samples saved: output_path.jsonl


In [None]:
from google.colab import files
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
import random
import json

# 1. 기존 샘플링된 10,000개 로드 (이미 메모리에 있다면 이 단계 생략 가능)
input_path = "/data/dataset_cartography/data/ultrafeedback_sample_10000.jsonl"
output_path = "/data/dataset_cartography/data/ultrafeedback_sample_swapped_5000.jsonl"

with open(input_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# 2. 2000개 무작위 선택 후 chosen <-> rejected 스왑
swap_indices = set(random.sample(range(len(data)), 5000))

for idx in swap_indices:
    data[idx]["chosen"], data[idx]["rejected"] = data[idx]["rejected"], data[idx]["chosen"]

# 3. 저장
with open(output_path, "w", encoding="utf-8") as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"🔄 2000개 스왑 완료! 저장 경로: {output_path}")

🔄 2000개 스왑 완료! 저장 경로: /data/dataset_cartography/data/ultrafeedback_sample_swapped_5000.jsonl


In [None]:
from google.colab import files

output_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_2000.jsonl"

# 다운로드 요청
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!mv "/content/drive/MyDrive/Colab Notebooks/ultrafeedback_sampling.ipynb" "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sampling.ipynb"

In [None]:
# 스왑된 항목 저장 (2000개)
swapped_only_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_2000.jsonl"
swapped_data = [data[idx] for idx in swap_indices]

with open(swapped_only_path, "w", encoding="utf-8") as f:
    for item in swapped_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 스왑된 2000개 샘플 저장 완료: {swapped_only_path}")

# 스왑되지 않은 항목 저장 (8000개)
not_swapped_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_8000.jsonl"
not_swapped_indices = set(range(len(data))) - swap_indices
not_swapped_data = [data[idx] for idx in not_swapped_indices]

with open(not_swapped_path, "w", encoding="utf-8") as f:
    for item in not_swapped_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 스왑되지 않은 8000개 샘플 저장 완료: {not_swapped_path}")

✅ 스왑된 2000개 샘플 저장 완료: /content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_2000.jsonl
✅ 스왑되지 않은 8000개 샘플 저장 완료: /content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_8000.jsonl


In [None]:
from google.colab import files

# 파일 경로
swapped_only_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_2000.jsonl"
not_swapped_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_8000.jsonl"

# 먼저 Colab 작업 디렉토리로 복사 (Google Drive에서 바로 다운로드는 불안정할 수 있음)
!cp "{swapped_only_path}" /content/swapped_only_2000.jsonl
!cp "{not_swapped_path}" /content/not_swapped_8000.jsonl

# 다운로드
files.download("/content/swapped_only_2000.jsonl")
files.download("/content/not_swapped_8000.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import random
import json

# 1. 기존 샘플링된 10,000개 로드 (이미 메모리에 있다면 이 단계 생략 가능)
input_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_10000.jsonl"
output_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_500.jsonl"

with open(input_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# 2. 2000개 무작위 선택 후 chosen <-> rejected 스왑
swap_indices = set(random.sample(range(len(data)), 500))

for idx in swap_indices:
    data[idx]["chosen"], data[idx]["rejected"] = data[idx]["rejected"], data[idx]["chosen"]

# 3. 저장
with open(output_path, "w", encoding="utf-8") as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"🔄 500개 스왑 완료! 저장 경로: {output_path}")

🔄 500개 스왑 완료! 저장 경로: /content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_500.jsonl


In [None]:
from google.colab import files

output_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_500.jsonl"

# 다운로드 요청
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# 스왑된 항목 저장 (2000개)
swapped_only_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_500.jsonl"
swapped_data = [data[idx] for idx in swap_indices]

with open(swapped_only_path, "w", encoding="utf-8") as f:
    for item in swapped_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 스왑된 500개 샘플 저장 완료: {swapped_only_path}")

# 스왑되지 않은 항목 저장 (8000개)
not_swapped_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_9500.jsonl"
not_swapped_indices = set(range(len(data))) - swap_indices
not_swapped_data = [data[idx] for idx in not_swapped_indices]

with open(not_swapped_path, "w", encoding="utf-8") as f:
    for item in not_swapped_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 스왑되지 않은 9500개 샘플 저장 완료: {not_swapped_path}")

✅ 스왑된 500개 샘플 저장 완료: /content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_500.jsonl
✅ 스왑되지 않은 9500개 샘플 저장 완료: /content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_9500.jsonl


In [5]:
from google.colab import files

# 파일 경로
swapped_only_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_500.jsonl"
not_swapped_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_9500.jsonl"

# 먼저 Colab 작업 디렉토리로 복사 (Google Drive에서 바로 다운로드는 불안정할 수 있음)
!cp "{swapped_only_path}" /content/swapped_only_500.jsonl
!cp "{not_swapped_path}" /content/not_swapped_9500.jsonl

# 다운로드
files.download("/content/swapped_only_500.jsonl")
files.download("/content/not_swapped_9500.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import random
import json

# 1. 기존 샘플링된 10,000개 로드 (이미 메모리에 있다면 이 단계 생략 가능)
input_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_10000.jsonl"
output_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_1000.jsonl"

with open(input_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# 2. 2000개 무작위 선택 후 chosen <-> rejected 스왑
swap_indices = set(random.sample(range(len(data)), 1000))

for idx in swap_indices:
    data[idx]["chosen"], data[idx]["rejected"] = data[idx]["rejected"], data[idx]["chosen"]

# 3. 저장
with open(output_path, "w", encoding="utf-8") as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"🔄 1000개 스왑 완료! 저장 경로: {output_path}")

🔄 1000개 스왑 완료! 저장 경로: /content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_1000.jsonl


In [7]:
from google.colab import files

output_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_1000.jsonl"

# 다운로드 요청
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
# 스왑된 항목 저장 (2000개)
swapped_only_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_1000.jsonl"
swapped_data = [data[idx] for idx in swap_indices]

with open(swapped_only_path, "w", encoding="utf-8") as f:
    for item in swapped_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 스왑된 1000개 샘플 저장 완료: {swapped_only_path}")

# 스왑되지 않은 항목 저장 (8000개)
not_swapped_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_9000.jsonl"
not_swapped_indices = set(range(len(data))) - swap_indices
not_swapped_data = [data[idx] for idx in not_swapped_indices]

with open(not_swapped_path, "w", encoding="utf-8") as f:
    for item in not_swapped_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 스왑되지 않은 9000개 샘플 저장 완료: {not_swapped_path}")

✅ 스왑된 1000개 샘플 저장 완료: /content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_1000.jsonl
✅ 스왑되지 않은 9000개 샘플 저장 완료: /content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_9000.jsonl


In [9]:
from google.colab import files

# 파일 경로
swapped_only_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_1000.jsonl"
not_swapped_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_9000.jsonl"

# 먼저 Colab 작업 디렉토리로 복사 (Google Drive에서 바로 다운로드는 불안정할 수 있음)
!cp "{swapped_only_path}" /content/swapped_only_1000.jsonl
!cp "{not_swapped_path}" /content/not_swapped_9000.jsonl

# 다운로드
files.download("/content/swapped_only_1000.jsonl")
files.download("/content/not_swapped_9000.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
import random
import json

# 1. 기존 샘플링된 10,000개 로드 (이미 메모리에 있다면 이 단계 생략 가능)
input_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_10000.jsonl"
output_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_3000.jsonl"

with open(input_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# 2. 2000개 무작위 선택 후 chosen <-> rejected 스왑
swap_indices = set(random.sample(range(len(data)), 3000))

for idx in swap_indices:
    data[idx]["chosen"], data[idx]["rejected"] = data[idx]["rejected"], data[idx]["chosen"]

# 3. 저장
with open(output_path, "w", encoding="utf-8") as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"🔄 3000개 스왑 완료! 저장 경로: {output_path}")

🔄 3000개 스왑 완료! 저장 경로: /content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_3000.jsonl


In [11]:
from google.colab import files

output_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_sample_swapped_3000.jsonl"

# 다운로드 요청
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# 스왑된 항목 저장 (2000개)
swapped_only_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_3000.jsonl"
swapped_data = [data[idx] for idx in swap_indices]

with open(swapped_only_path, "w", encoding="utf-8") as f:
    for item in swapped_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 스왑된 3000개 샘플 저장 완료: {swapped_only_path}")

# 스왑되지 않은 항목 저장 (8000개)
not_swapped_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_7000.jsonl"
not_swapped_indices = set(range(len(data))) - swap_indices
not_swapped_data = [data[idx] for idx in not_swapped_indices]

with open(not_swapped_path, "w", encoding="utf-8") as f:
    for item in not_swapped_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 스왑되지 않은 7000개 샘플 저장 완료: {not_swapped_path}")

✅ 스왑된 3000개 샘플 저장 완료: /content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_3000.jsonl
✅ 스왑되지 않은 7000개 샘플 저장 완료: /content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_7000.jsonl


In [13]:
from google.colab import files

# 파일 경로
swapped_only_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_swapped_only_3000.jsonl"
not_swapped_path = "/content/drive/MyDrive/ultrafeedback/ultrafeedback_not_swapped_7000.jsonl"

# 먼저 Colab 작업 디렉토리로 복사 (Google Drive에서 바로 다운로드는 불안정할 수 있음)
!cp "{swapped_only_path}" /content/swapped_only_3000.jsonl
!cp "{not_swapped_path}" /content/not_swapped_7000.jsonl

# 다운로드
files.download("/content/swapped_only_3000.jsonl")
files.download("/content/not_swapped_7000.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>