# HAM10000 Complete TFRecord Pipeline - Google Drive Edition
Pipeline complet pentru dataset-ul HAM10000 :
- Split stratificat deterministic (90% train, 10% test)
- Generare TFRecords shard-uite
- tf.data pipeline cu augmentare
- Preprocessing pentru ResNet152V2

## 1. Mount Google Drive

In [1]:
!jupyter nbconvert --to notebook --clear-output --inplace TFRecord_Pipeline_HAM10000.ipynb


This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [2]:
from google.colab import drive
drive.mount('/content/drive')

print("✓ Google Drive mounted successfully")

Mounted at /content/drive
✓ Google Drive mounted successfully


## 2. Setup și Import-uri

In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

TensorFlow version: 2.19.0
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## 3. Configurare Paths și Constante

In [4]:
DRIVE_ROOT = '/content/drive/MyDrive/HAM10000'
METADATA_PATH = os.path.join(DRIVE_ROOT, 'HAM10000_metadata.csv')
IMAGE_ROOTS = [ os.path.join(DRIVE_ROOT, 'HAM10000_images_part_1'), os.path.join(DRIVE_ROOT, 'HAM10000_images_part_2') ]

OUTPUT_DIR = '/content/ham10000_processed'
TFRECORD_DIR = os.path.join(OUTPUT_DIR, 'tfrecords')
METADATA_OUTPUT_DIR = os.path.join(OUTPUT_DIR, 'metadata')

# Create directories
os.makedirs(TFRECORD_DIR, exist_ok=True)
os.makedirs(METADATA_OUTPUT_DIR, exist_ok=True)

# Constante
IMG_SIZE = 224
EXAMPLES_PER_SHARD = 256
TRAIN_SPLIT = 0.90
TEST_SPLIT = 0.10

print("Checking paths...")
print(f"  Metadata exists: {os.path.exists(METADATA_PATH)}")
print(f"  Images part 1 exists: {os.path.exists(IMAGE_ROOTS[0])}")
print(f"  Images part 2 exists: {os.path.exists(IMAGE_ROOTS[1])}")
print(f"\nOutput directory: {OUTPUT_DIR}")
print(f"TFRecord directory: {TFRECORD_DIR}")

Checking paths...
  Metadata exists: True
  Images part 1 exists: True
  Images part 2 exists: True

Output directory: /content/ham10000_processed
TFRecord directory: /content/ham10000_processed/tfrecords


## 4. Citire Metadata și Construire Paths

In [5]:
df = pd.read_csv(METADATA_PATH)
print(f"Total samples in metadata: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nClass distribution:\n{df['dx'].value_counts()}")

def find_image_path(image_id, roots):
    """Caută imaginea în toate directoarele rădăcină"""

    for root in roots:
        path = os.path.join(root, f"{image_id}.jpg")
        if os.path.exists(path):
            return path
    return None

print("\nBuilding image paths...")
df['image_path'] = df['image_id'].apply(lambda x: find_image_path(x, IMAGE_ROOTS))

missing = df['image_path'].isna().sum()
if missing > 0:
    print(f"WARNING: {missing} images not found!")
    df = df[df['image_path'].notna()].reset_index(drop=True)
    print(f"Continuing with {len(df)} images")
else:
    print(f"✓ All {len(df)} images found successfully")

df.head()

Total samples in metadata: 10015
Columns: ['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization']

Class distribution:
dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64

Building image paths...
✓ All 10015 images found successfully


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,image_path
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/HAM10000/HAM10000_image...
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/HAM10000/HAM10000_image...
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/HAM10000/HAM10000_image...
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/HAM10000/HAM10000_image...
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/content/drive/MyDrive/HAM10000/HAM10000_image...


## 5. Creare Label Map (Stabil și Salvat)

In [6]:
unique_labels = sorted(df['dx'].unique())
label_map = {label: idx for idx, label in enumerate(unique_labels)}
inverse_label_map = {idx: label for label, idx in label_map.items()}

print("Label mapping:")
for label, idx in label_map.items():
    count = (df['dx'] == label).sum()
    print(f"  {idx}: {label} ({count} samples)")

label_map_path = os.path.join(METADATA_OUTPUT_DIR, 'label_map.json')
with open(label_map_path, 'w') as f:
    json.dump({
        'label_to_idx': label_map,
        'idx_to_label': inverse_label_map
    }, f, indent=2)
print(f"\n✓ Label map saved to: {label_map_path}")

df['label'] = df['dx'].map(label_map)
NUM_CLASSES = len(label_map)
print(f"\nTotal classes: {NUM_CLASSES}")

Label mapping:
  0: akiec (327 samples)
  1: bcc (514 samples)
  2: bkl (1099 samples)
  3: df (115 samples)
  4: mel (1113 samples)
  5: nv (6705 samples)
  6: vasc (142 samples)

✓ Label map saved to: /content/ham10000_processed/metadata/label_map.json

Total classes: 7


## 6. Split Stratificat (Train/Test) cu Seed Fix

In [7]:
train_df, test_df = train_test_split( df, test_size=TEST_SPLIT, stratify=df['label'], random_state=SEED )

train_df = train_df.copy()
test_df = test_df.copy()
train_df['split'] = 'train'
test_df['split'] = 'test'
splits_df = pd.concat([train_df, test_df], ignore_index=True)

print("Split distribution:")
print(f"  Train: {len(train_df)} samples ({len(train_df)/len(df)*100:.1f}%)")
print(f"  Test:  {len(test_df)} samples ({len(test_df)/len(df)*100:.1f}%)")

print("\nClass distribution per split:")
for split in ['train', 'test']:
    split_df = splits_df[splits_df['split'] == split]
    print(f"\n{split.upper()}:")
    dist = split_df['dx'].value_counts()
    for label, count in dist.items():
        pct = count / len(split_df) * 100
        print(f"  {label}: {count} ({pct:.1f}%)")

splits_path = os.path.join(METADATA_OUTPUT_DIR, 'splits.csv')
splits_df[['image_id', 'image_path', 'dx', 'label', 'split']].to_csv(splits_path, index=False)
print(f"\n✓ Splits saved to: {splits_path}")

Split distribution:
  Train: 9013 samples (90.0%)
  Test:  1002 samples (10.0%)

Class distribution per split:

TRAIN:
  nv: 6034 (66.9%)
  mel: 1002 (11.1%)
  bkl: 989 (11.0%)
  bcc: 463 (5.1%)
  akiec: 294 (3.3%)
  vasc: 128 (1.4%)
  df: 103 (1.1%)

TEST:
  nv: 671 (67.0%)
  mel: 111 (11.1%)
  bkl: 110 (11.0%)
  bcc: 51 (5.1%)
  akiec: 33 (3.3%)
  vasc: 14 (1.4%)
  df: 12 (1.2%)

✓ Splits saved to: /content/ham10000_processed/metadata/splits.csv


## 7. Funcții Helper pentru TFRecord

In [8]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""

    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    if isinstance(value, str):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""

    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def create_tf_example(image_id, image_path, label, dx):
    """
    Creează un tf.train.Example pentru un eșantion.

    Args:
        image_id: ID-ul imaginii (string)
        image_path: Path către imaginea JPEG
        label: Label numeric (int)
        dx: Label text (string)

    Returns:
        tf.train.Example serializat
    """
    with open(image_path, 'rb') as f:
        image_bytes = f.read()

    feature = { 'image_id': _bytes_feature(image_id), 'label': _int64_feature(label), 'image_bytes': _bytes_feature(image_bytes), 'dx': _bytes_feature(dx) }

    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()

print("✓ TFRecord helper functions defined")

✓ TFRecord helper functions defined


## 8. Generare TFRecords Shard-uite

In [9]:
def write_tfrecords(df, split_name, output_dir, examples_per_shard=256):
    """
    Scrie TFRecords shard-uite pentru un split.

    Args:
        df: DataFrame cu datele
        split_name: 'train' sau 'test'
        output_dir: Director pentru TFRecords
        examples_per_shard: Număr de exemple per shard

    Returns:
        Lista de paths către TFRecords create
    """
    num_examples = len(df)
    num_shards = (num_examples + examples_per_shard - 1) // examples_per_shard

    print(f"\n{'='*60}")
    print(f"Writing {split_name.upper()} TFRecords")
    print(f"  Total examples: {num_examples}")
    print(f"  Examples per shard: {examples_per_shard}")
    print(f"  Number of shards: {num_shards}")
    print(f"{'='*60}")

    tfrecord_paths = []

    for shard_idx in tqdm(range(num_shards), desc=f"Writing {split_name} shards"):
        # Compute the range for this shard
        start_idx = shard_idx * examples_per_shard
        end_idx = min(start_idx + examples_per_shard, num_examples)

        # Name for shard_file
        shard_filename = f"{split_name}-{shard_idx:04d}-of-{num_shards:04d}.tfrecord"
        shard_path = os.path.join(output_dir, shard_filename)
        tfrecord_paths.append(shard_path)

        # Write shard
        with tf.io.TFRecordWriter(shard_path) as writer:
            for idx in range(start_idx, end_idx):
                row = df.iloc[idx]
                tf_example = create_tf_example( image_id=row['image_id'], image_path=row['image_path'], label=row['label'], dx=row['dx'] )
                writer.write(tf_example)

    print(f"✓ Wrote {num_shards} shards with {num_examples} total examples")
    return tfrecord_paths

# Write TFRecords for each split
tfrecord_info = {}

for split_name in ['train', 'test']:
    split_df = splits_df[splits_df['split'] == split_name].reset_index(drop=True)
    paths = write_tfrecords(
        df=split_df,
        split_name=split_name,
        output_dir=TFRECORD_DIR,
        examples_per_shard=EXAMPLES_PER_SHARD
    )
    tfrecord_info[split_name] = {
        'num_examples': len(split_df),
        'num_shards': len(paths),
        'paths': paths
    }

# Save TFRecords informations
tfrecord_info_path = os.path.join(METADATA_OUTPUT_DIR, 'tfrecord_info.json')
with open(tfrecord_info_path, 'w') as f:
    # Convert paths to relative pentru portabilitate
    info_to_save = {}
    for split, data in tfrecord_info.items():
        info_to_save[split] = {
            'num_examples': data['num_examples'],
            'num_shards': data['num_shards'],
            'paths': [os.path.basename(p) for p in data['paths']]
        }
    json.dump(info_to_save, f, indent=2)

print(f"\n✓ TFRecord info saved to: {tfrecord_info_path}")


Writing TRAIN TFRecords
  Total examples: 9013
  Examples per shard: 256
  Number of shards: 36


Writing train shards:   0%|          | 0/36 [00:00<?, ?it/s]

KeyboardInterrupt: 