In [None]:

# ✅ STEP 1: Mount Google Drive
# Later steps write their output under /content/drive/MyDrive, so Drive has to
# be attached first. force_remount=True is passed so that re-running this cell
# requests a remount instead of reusing an existing one.
from google.colab import drive

drive.mount(mountpoint="/content/drive", force_remount=True)

# ✅ STEP 2: Install Hugging Face datasets
!pip install datasets -q

# ✅ STEP 3: Create folders in Drive
import os

# Single output root on the mounted Drive; every later step builds its
# destination path from this value.
base_path = "/content/drive/MyDrive/AI_Reasoning_Datasets"

# exist_ok=True makes this a no-op when the folder is already present, so the
# cell can be re-run safely.
os.makedirs(name=base_path, exist_ok=True)

# ✅ STEP 4: Load and Save GSM8K Dataset
from datasets import load_dataset
import json

# Load the "main" configuration of GSM8K.
gsm8k = load_dataset("gsm8k", "main")
gsm8k_path = f"{base_path}/gsm8k_sample.json"

# Take the sample BEFORE opening the output file: open(..., "w") truncates the
# target immediately, so a failure while slicing (e.g. a missing split) would
# otherwise wipe a sample saved by an earlier run.
gsm8k_sample = gsm8k["train"][:100]  # at most the first 100 "train" rows

# json.dump emits ASCII-only text by default, so pinning UTF-8 does not change
# the bytes written; it only removes the dependence on the platform default.
with open(gsm8k_path, "w", encoding="utf-8") as f:
    json.dump(gsm8k_sample, f)
print(f"GSM8K sample saved to: {gsm8k_path}")

# ✅ STEP 5: Load and Save MATH Dataset
# NOTE(review): the repo id "openai/math" (config "algebra") has not been
# verified here — confirm it resolves on the Hugging Face Hub before relying
# on this step.
math_ds = load_dataset("openai/math", "algebra")
math_path = f"{base_path}/math_sample.json"

# Slice first, open second: opening with "w" truncates the file, so doing the
# lookup up front keeps an existing sample intact if the "test" split is absent.
math_sample = math_ds["test"][:100]  # at most the first 100 "test" rows

# Explicit UTF-8 keeps the write independent of the platform default encoding
# (json.dump's default output is ASCII-only, so the bytes are unchanged).
with open(math_path, "w", encoding="utf-8") as f:
    json.dump(math_sample, f)
print(f"MATH dataset sample saved to: {math_path}")

# ✅ STEP 6: Clone ARC and copy to Drive
!git clone https://github.com/fchollet/ARC.git
!cp -r ARC/data {base_path}/ARC_data
print("ARC data copied to Drive.")

# ✅ STEP 7: Load and Save Lean Dojo Dataset
# NOTE(review): the repo id "lean-dojo/benchmarks" and its "train" split have
# not been verified here — confirm they exist on the Hugging Face Hub.
lean = load_dataset("lean-dojo/benchmarks", split="train")
lean_path = f"{base_path}/lean_dojo_sample.json"

# split="train" was requested above, so `lean` is sliced directly. The sample
# is taken before the file is opened because mode "w" truncates it at once.
lean_sample = lean[:100]  # at most the first 100 rows

# UTF-8 is pinned explicitly; json.dump's default ASCII-only output means the
# written bytes are the same as before.
with open(lean_path, "w", encoding="utf-8") as f:
    json.dump(lean_sample, f)
print(f"Lean Dojo sample saved to: {lean_path}")

# ✅ STEP 8: Load and Save HumanEval
humaneval = load_dataset("openai_humaneval")
he_path = f"{base_path}/humaneval_sample.json"

# Materialise the sample before touching the output file: open(..., "w")
# truncates it, and a failed split lookup should not destroy an earlier save.
he_sample = humaneval["test"][:100]  # at most the first 100 "test" rows

# Explicit encoding avoids relying on the platform default; the output bytes
# are unchanged because json.dump escapes non-ASCII characters by default.
with open(he_path, "w", encoding="utf-8") as f:
    json.dump(he_sample, f)
print(f"HumanEval sample saved to: {he_path}")

# ✅ STEP 9: Load and Save MBPP
mbpp = load_dataset("mbpp")
mbpp_path = f"{base_path}/mbpp_sample.json"

# The slice is computed up front so that an error here cannot leave behind a
# file that open(..., "w") has already truncated.
mbpp_sample = mbpp["train"][:100]  # at most the first 100 "train" rows

# UTF-8 is stated explicitly instead of inheriting the platform default;
# json.dump's default ASCII-only output keeps the written bytes identical.
with open(mbpp_path, "w", encoding="utf-8") as f:
    json.dump(mbpp_sample, f)
print(f"MBPP sample saved to: {mbpp_path}")

# ✅ STEP 10: Load and Save Code Instructions
# NOTE(review): the repo id "codeparrot/code_instructions_filtered" has not
# been verified here — confirm it resolves on the Hugging Face Hub.
code_instruct = load_dataset("codeparrot/code_instructions_filtered")
ci_path = f"{base_path}/code_instruct_sample.json"

# Pull the rows before opening the destination: mode "w" truncates the file
# immediately, so a missing "train" split must fail before that point.
ci_sample = code_instruct["train"][:100]  # at most the first 100 "train" rows

# Encoding is pinned to UTF-8; with json.dump's default ASCII-only output the
# bytes on disk do not change.
with open(ci_path, "w", encoding="utf-8") as f:
    json.dump(ci_sample, f)
print(f"Code Instructions sample saved to: {ci_path}")

# ✅ STEP 11: Stream The Stack (Python) and Save
from itertools import islice

# streaming=True requests an iterable dataset restricted to data/python, which
# is consumed row by row below.
# NOTE(review): access to this dataset may require Hub authentication or
# accepting its terms — confirm before running.
stack = load_dataset("bigcode/the-stack", data_dir="data/python", split="train", streaming=True)

# islice takes at most the first 10 rows and then stops reading the stream;
# it replaces the zip(range(10), ...) trick with the standard idiom.
stack_samples = list(islice(stack, 10))
stack_path = f"{base_path}/stack_python_sample.json"

# Explicit UTF-8 avoids depending on the platform default encoding; json.dump's
# default ASCII-only output means the written bytes stay the same.
with open(stack_path, "w", encoding="utf-8") as f:
    json.dump(stack_samples, f)
print(f"The Stack (Python) sample saved to: {stack_path}")
