In [2]:
import subprocess, sys, importlib

def ensure(pkg, version=None):
    try:
        importlib.import_module(pkg)
    except ImportError:
        v = f"{pkg}=={version}" if version else pkg
        subprocess.check_call([sys.executable, "-m", "pip", "install", v])

ensure("datasets", "2.19.0")
ensure("huggingface_hub", "0.23.0")   # provides login() helper


In [3]:
from huggingface_hub import login
import getpass, os

if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    # prompt once per session; the token is kept only in memory
    token = getpass.getpass("Paste your HuggingFace access token: ")


Paste your HuggingFace access token:  ········


In [4]:
import subprocess, sys, importlib, textwrap

def ensure(pkg, version=None):
    try:
        importlib.import_module(pkg)
        print(f"✔ {pkg} already installed")
    except ImportError:
        v = f"{pkg}=={version}" if version else pkg
        print(f"⏳ Installing {v} …")
        subprocess.check_call([sys.executable, "-m", "pip", "install", v])

ensure("datasets", "2.19.0")
ensure("huggingface_hub", "0.23.0")     # provides `login` helper


✔ datasets already installed
✔ huggingface_hub already installed


In [5]:
from huggingface_hub import login
import getpass, os, textwrap, json

if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    token = getpass.getpass("🔑 Paste your Hugging Face access token → ")
    login(token=token, write_permission=False)
else:
    print("✔ Already logged in")


🔑 Paste your Hugging Face access token →  ········


In [7]:
from datasets import load_dataset
from pathlib import Path

RAW_DIR = Path("../data/raw")
CACHE_DIR = RAW_DIR / "hf_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

sharegpt = load_dataset(
    "HuggingFaceH4/ShareGPTV4",
    split="train",
    cache_dir=str(CACHE_DIR),
    streaming=False,        # downloads to disk
    trust_remote_code=True  # suppresses safety prompt
)

print(f"✅ Downloaded. Total conversations: {len(sharegpt):,}")


DatasetNotFoundError: Dataset 'HuggingFaceH4/ShareGPTV4' doesn't exist on the Hub or cannot be accessed.

In [8]:
from datasets import load_dataset
from pathlib import Path

# point the Hugging Face datasets cache into your project tree
RAW_DIR = Path("../data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

sharegpt = load_dataset(
    "HuggingFaceH4/ShareGPT_Vicuna_unfiltered",
    split="train",
    cache_dir=str(RAW_DIR / "hf_cache"),   # keeps everything inside data/raw/
    streaming=False                       # downloads to disk; safer than streaming for large evals
)

print(f"Total conversations: {len(sharegpt):,}")


DatasetNotFoundError: Dataset 'HuggingFaceH4/ShareGPT_Vicuna_unfiltered' doesn't exist on the Hub or cannot be accessed.

In [9]:
from datasets import load_dataset
from pathlib import Path

RAW_DIR   = Path("../data/raw")
CACHE_DIR = RAW_DIR / "hf_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

sharegpt = load_dataset(
    "anon8231489123/ShareGPT_Vicuna_unfiltered",
    split="train",
    cache_dir=str(CACHE_DIR),
    streaming=False        # <‑‑ downloads ~4 GB once
)

print(f"✅  Conversations loaded: {len(sharegpt):,}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


DataFilesNotFoundError: No (supported) data files found in anon8231489123/ShareGPT_Vicuna_unfiltered

In [10]:
from datasets import load_dataset

ds_stream = load_dataset(
    "anon8231489123/ShareGPT_Vicuna_unfiltered",
    split="train",
    streaming=True          # <‑‑ zero‑download index check
)

first = next(iter(ds_stream))
print(first.keys())         # -> dict with 'conversations', 'id', etc.
print(first["conversations"][:2])


DataFilesNotFoundError: No (supported) data files found in anon8231489123/ShareGPT_Vicuna_unfiltered

In [11]:
dataset = load_dataset("anon8231489123/ShareGPT_Vicuna_unfiltered")

DataFilesNotFoundError: No (supported) data files found in anon8231489123/ShareGPT_Vicuna_unfiltered

In [12]:
from pathlib import Path
RAW_DIR = Path("../data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)


In [13]:
import requests, tqdm, shutil

url   = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
fname = RAW_DIR / "ShareGPT_V3_unfiltered_cleaned_split.json"

if not fname.exists():
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total = int(r.headers.get("content-length", 0))
        with tqdm.tqdm.wrapattr(r.raw, "read", total=total, desc=fname.name) as raw, open(fname, "wb") as f:
            shutil.copyfileobj(raw, f)
    print("✔ Downloaded to", fname)
else:
    print("✔ File already exists:", fname)


ShareGPT_V3_unfiltered_cleaned_split.json: 100%|████████████████████████████████████| 642M/642M [02:08<00:00, 5.25MB/s]

✔ Downloaded to ..\data\raw\ShareGPT_V3_unfiltered_cleaned_split.json





In [17]:
from pathlib import Path
import gzip, json

fname = Path("../data/raw/ShareGPT_V3_unfiltered_cleaned_split.json")

print("Size on disk:", round(fname.stat().st_size / 1e6, 1), "MB")

# Peek at the first 200 bytes to see the format
with open(fname, "rb") as f:
    head = f.read(200)

print("First bytes:", head[:30])


Size on disk: 672.8 MB
First bytes: b'[\n  {\n    "id": "QWJhYvA_0",\n '


In [18]:
with open(fname, "r", encoding="utf‑8") as f:
    data = json.load(f)
import pandas as pd
df = pd.DataFrame(data)


In [19]:
print("Rows:", len(df))
print("Columns:", df.columns.tolist())
df.head(2)


Rows: 94145
Columns: ['id', 'conversations']


Unnamed: 0,id,conversations
0,QWJhYvA_0,"[{'from': 'human', 'value': 'Summarize the mai..."
1,i6IyJda_0,"[{'from': 'human', 'value': 'How to tell if a ..."


In [20]:
def flatten(row):
    turns = [f"{m['from']}: {m['value'].strip()}" for m in row["conversations"]]
    return "\n<eot>\n".join(turns)

df["transcript"] = df.apply(flatten, axis=1)
df["transcript"].str[:300].head()


0    human: Summarize the main ideas of Jeff Walker...
1    human: How to tell if a customer segment is we...
2    human: In Java, I want to replace string like ...
3    human: Metaphorical language is also used to d...
4    gpt: Lo and behold! By the grace of divine int...
Name: transcript, dtype: object

In [21]:
arrow_path = Path("../data/raw/sharegpt_v3.arrow")
if not arrow_path.exists():
    import pyarrow as pa, pyarrow.parquet as pq
    table = pa.Table.from_pandas(df[["id", "transcript"]])
    pq.write_table(table, arrow_path)
    print("✔ Arrow file saved:", arrow_path)
else:
    print("Arrow file already exists:", arrow_path)


✔ Arrow file saved: ..\data\raw\sharegpt_v3.arrow


In [22]:
#Next time you can load instantly:
import pyarrow.parquet as pq
df = pq.read_table("../data/raw/sharegpt_v3.arrow").to_pandas()


In [23]:
### Cell ⑨ — (optional) track the raw JSON & Arrow with DVC + Git

!dvc add ../data/raw/ShareGPT_V3_unfiltered_cleaned_split.json
!dvc add ../data/raw/sharegpt_v3.arrow
!git add ../data/raw/*.dvc notebooks/01_download_sharegpt.ipynb
!git commit -m "feat: processed ShareGPT_V3 to Arrow + transcripts"
!dvc push && git push


'dvc' is not recognized as an internal or external command,
operable program or batch file.
'dvc' is not recognized as an internal or external command,
operable program or batch file.
fatal: pathspec '../data/raw/*.dvc' did not match any files


On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	../data/raw/ShareGPT_V3_unfiltered_cleaned_split.json
	../data/raw/sharegpt_v3.arrow
	./

nothing added to commit but untracked files present (use "git add" to track)


black................................................(no files to check)Skipped
ruff.................................................(no files to check)Skipped
fix end of files.....................................(no files to check)Skipped
trim trailing whitespace.............................(no files to check)Skipped
'dvc' is not recognized as an internal or external command,
operable program or batch file.
