In [8]:
!pip install datasets pycocotools huggingface_hub pillow matplotlib pandas seaborn tqdm ultralytics




In [18]:
# Colab only: attach the user's Google Drive to the VM file system so that
# files written below survive a runtime reset.
from google.colab import drive
drive.mount(mountpoint='/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
import os
# Root folder (inside the Drive mount at /content/drive) under which the later
# cells store annotations, sampled images and split files.
# NOTE(review): the recorded outputs further down mention /content/M2-Dataset, so
# they appear to come from a run with a different WORKDIR - re-run the notebook
# top to bottom to refresh them.
WORKDIR = '/content/drive/MyDrive/M2-Dataset'
# exist_ok=True makes re-running this cell harmless once the folder exists.
os.makedirs(WORKDIR, exist_ok=True)


In [28]:
%%sh
# List the contents of the current working directory.
ls

drive
DSAI-M2
project_dataset_sample
sample_data


In [22]:
# create_coco_sample.py
import json, os, random, shutil, requests, zipfile, io
from tqdm import tqdm
# All paths used by this cell are derived from WORKDIR; make sure it exists first.
os.makedirs(WORKDIR, exist_ok=True)
# Downloaded COCO 2017 annotation archive and the folder it is unpacked into.
ANNOT_ZIP = os.path.join(WORKDIR, 'annotations_trainval2017.zip')
ANNOT_DIR = os.path.join(WORKDIR, 'coco_annotations')
# Annotation file written for the sampled subset.
SAMPLE_OUT_JSON = os.path.join(WORKDIR, 'instances_coco_sample.json')
# Folder that receives the image files of the sampled subset.
IMG_OUT_DIR = os.path.join(WORKDIR, 'images', 'coco_sample')
os.makedirs(IMG_OUT_DIR, exist_ok=True)

def download_annotations():
    """Download and unpack the COCO 2017 train/val annotation archive if missing.

    The archive is stored at ``ANNOT_ZIP`` and extracted into ``ANNOT_DIR``.
    Each step is skipped when its target already exists, so the function can be
    called repeatedly.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    if not os.path.exists(ANNOT_ZIP):
        url = 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip'
        print("Downloading COCO annotations (240MB)...")
        # Stream into a temporary file and rename only after a complete download,
        # so an interrupted transfer is never mistaken for a finished archive.
        tmp_path = ANNOT_ZIP + '.part'
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                # Fail loudly instead of saving an HTML error page as the zip.
                r.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(tmp_path, ANNOT_ZIP)
        finally:
            # Only still present when the download failed part-way.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    if not os.path.exists(ANNOT_DIR):
        shutil.unpack_archive(ANNOT_ZIP, ANNOT_DIR)

def sample_coco(coco_json_path, out_json_path, n_images=1000, seed=42, ensure_cat_ids=None):
    """Write a random subset of a COCO annotation file and return it.

    Args:
        coco_json_path: Path to a COCO-style JSON file with ``images``,
            ``annotations`` and ``categories`` keys.
        out_json_path: Where the sampled JSON is written (indented by 2).
        n_images: Target number of images; fewer are returned when the input
            contains fewer images.
        seed: Seed for the private random generator, making the sample repeatable.
        ensure_cat_ids: Optional collection of category ids. For each of them up
            to ``n_images // len(ensure_cat_ids)`` (at least 1) images containing
            that category are picked before the sample is topped up at random.

    Returns:
        dict with keys ``images``, ``annotations`` and ``categories``; only the
        categories referenced by the selected annotations are kept.
    """
    # A private generator yields the same sequence as random.seed(seed) would,
    # without clobbering the global RNG state used by other cells.
    rng = random.Random(seed)

    with open(coco_json_path, 'r') as f:
        coco = json.load(f)
    images = coco['images']
    anns = coco['annotations']
    cats = coco['categories']

    img_id_to_img = {img['id']: img for img in images}
    all_image_ids = list(img_id_to_img.keys())
    selected = set()

    if ensure_cat_ids:
        wanted_cats = set(ensure_cat_ids)
        cat_to_imgs = {}
        for a in anns:
            if a['category_id'] in wanted_cats:
                cat_to_imgs.setdefault(a['category_id'], set()).add(a['image_id'])
        per_cat_quota = max(1, n_images // max(1, len(ensure_cat_ids)))
        for imgs in cat_to_imgs.values():
            imgs = list(imgs)
            rng.shuffle(imgs)
            selected.update(imgs[:per_cat_quota])

    # Top up with random images that were not already picked.
    remaining = [i for i in all_image_ids if i not in selected]
    rng.shuffle(remaining)
    to_add = n_images - len(selected)
    if to_add > 0:
        selected.update(remaining[:to_add])

    sel_images = [img_id_to_img[i] for i in selected]
    sel_anns = [a for a in anns if a['image_id'] in selected]
    used_cat_ids = {a['category_id'] for a in sel_anns}
    sel_cats = [c for c in cats if c['id'] in used_cat_ids]

    out = {'images': sel_images, 'annotations': sel_anns, 'categories': sel_cats}
    # Context manager guarantees the file is flushed and closed before returning.
    with open(out_json_path, 'w') as f:
        json.dump(out, f, indent=2)
    print(f"Saved sample JSON with {len(sel_images)} images and {len(sel_anns)} annotations to {out_json_path}")
    return out

def download_images_for_coco_sample(coco_sample_json, out_dir, retries=3):
    """Download every image listed in a COCO-style JSON file into ``out_dir``.

    Args:
        coco_sample_json: Path to a JSON file whose ``images`` entries carry a
            ``file_name`` and optionally a ``coco_url``.
        out_dir: Destination folder; created when missing.
        retries: Number of attempts per image.

    Images already present in ``out_dir`` are skipped. A failure is reported with
    ``print`` (including the last error seen) and does not abort the loop.
    """
    with open(coco_sample_json, 'r') as f:
        coco = json.load(f)
    os.makedirs(out_dir, exist_ok=True)
    for img in tqdm(coco['images']):
        # Fall back to the train2017 URL when the record has no coco_url.
        url = img.get('coco_url') or ("http://images.cocodataset.org/train2017/" + img['file_name'])
        outp = os.path.join(out_dir, img['file_name'])
        if os.path.exists(outp):
            continue
        # Download to a side file and rename on success, so a broken transfer
        # never leaves a truncated image that later runs would skip as "done".
        tmp_path = outp + '.part'
        last_error = None
        ok = False
        for attempt in range(retries):
            try:
                with requests.get(url, stream=True, timeout=20) as r:
                    if r.status_code != 200:
                        last_error = f"HTTP {r.status_code}"
                        continue
                    with open(tmp_path, 'wb') as f:
                        for chunk in r.iter_content(1024):
                            if chunk:
                                f.write(chunk)
                os.replace(tmp_path, outp)
                ok = True
                break
            except Exception as e:
                # Best effort: remember why this attempt failed and retry.
                last_error = e
        if not ok:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            print("Failed to download", url, "-", last_error)

# Pipeline: fetch the annotations, draw a 1000-image sample, then download
# the image files that belong to the sample.
download_annotations()
COCO_JSON = os.path.join(ANNOT_DIR, 'annotations', 'instances_train2017.json')
sample_coco(
    coco_json_path=COCO_JSON,
    out_json_path=SAMPLE_OUT_JSON,
    n_images=1000,
    seed=42,
)
download_images_for_coco_sample(
    coco_sample_json=SAMPLE_OUT_JSON,
    out_dir=IMG_OUT_DIR,
)
print("COCO sample created at:", SAMPLE_OUT_JSON)


Downloading COCO annotations (240MB)...
Saved sample JSON with 1000 images and 7011 annotations to /content/M2-Dataset/instances_coco_sample.json


100%|██████████| 1000/1000 [04:29<00:00,  3.72it/s]

COCO sample created at: /content/M2-Dataset/instances_coco_sample.json





In [23]:
# create_splits.py
import json, os, random
def create_splits(coco_path, out_dir, seed=42, ratios=(0.8,0.1,0.1)):
    """Split a COCO-style annotation file into train/val/test files.

    Writes ``instances_train.json``, ``instances_val.json``,
    ``instances_test.json`` and a ``data.yaml`` into ``out_dir``.

    Args:
        coco_path: Path to a JSON file with ``images``, ``annotations`` and
            ``categories`` keys.
        out_dir: Output folder; created when missing.
        seed: Seed for the private random generator used to shuffle image ids.
        ratios: ``(train, val, test)`` fractions. Only the first two are used to
            size the splits; the test split receives all remaining images.

    Each split keeps only the categories referenced by its own annotations,
    while ``data.yaml`` lists the names of all categories of the input file.
    The ``train``/``val``/``test`` entries of ``data.yaml`` are placeholder
    paths that have to be edited by hand.
    """
    # Imported first so a missing PyYAML fails before any file is written.
    import yaml

    os.makedirs(out_dir, exist_ok=True)
    with open(coco_path, 'r') as f:
        coco = json.load(f)
    images = coco['images']
    anns = coco['annotations']
    cats = coco['categories']

    # Private generator: same shuffle as random.seed(seed) would give, but the
    # global RNG state is left untouched.
    rng = random.Random(seed)
    ids = [im['id'] for im in images]
    rng.shuffle(ids)
    n = len(ids)
    n_train = int(ratios[0]*n)
    n_val = int(ratios[1]*n)
    split_ids = {
        'train': set(ids[:n_train]),
        'val': set(ids[n_train:n_train+n_val]),
        'test': set(ids[n_train+n_val:]),
    }

    def subset(idsset):
        """Return the COCO dict restricted to the images whose id is in ``idsset``."""
        imgs = [i for i in images if i['id'] in idsset]
        anns_sub = [a for a in anns if a['image_id'] in idsset]
        used_cat_ids = {a['category_id'] for a in anns_sub}
        cats_sub = [c for c in cats if c['id'] in used_cat_ids]
        return {'images':imgs,'annotations':anns_sub,'categories':cats_sub}

    for split_name, idsset in split_ids.items():
        split_path = os.path.join(out_dir, f'instances_{split_name}.json')
        with open(split_path, 'w') as f:
            json.dump(subset(idsset), f, indent=2)

    # data.yaml
    names = [c['name'] for c in cats]
    data_yaml = {
      'train': 'path/to/images/train',
      'val': 'path/to/images/val',
      'test': 'path/to/images/test',
      'nc': len(names),
      'names': names
    }
    with open(os.path.join(out_dir,'data.yaml'),'w') as f:
        yaml.safe_dump(data_yaml, f)
    print("Splits created in", out_dir)

# Example: split the sampled annotation file and write the results under WORKDIR.
create_splits(
    coco_path=f'{WORKDIR}/instances_coco_sample.json',
    out_dir=f'{WORKDIR}/processed',
)


Splits created in /content/M2-Dataset/processed
