In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Record the installed TensorFlow version in the cell output.
print('TF version:', tf.__version__)

# Locations of the competition inputs on the Kaggle input mount.
TRAIN_DIR = Path('/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train')
TRAIN_CSV = Path('/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train.csv')
SAMPLE_SUB = Path('/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/sample_submission.csv')
TEST_DIR = Path('/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/test')

# Report, one line per input, whether each expected path is present.
print('Paths exist:')
for _label, _path in (
    ('TRAIN_DIR', TRAIN_DIR),
    ('TRAIN_CSV', TRAIN_CSV),
    ('SAMPLE_SUB', SAMPLE_SUB),
    ('TEST_DIR', TEST_DIR),
):
    print(_label, _path.exists())


TF version: 2.18.0
Paths exist:
TRAIN_DIR True
TRAIN_CSV True
SAMPLE_SUB True
TEST_DIR True


## Load labels and build train dataframe

In [3]:
# Read train.csv into a dataframe of image IDs and class labels.
train_df = pd.read_csv(str(TRAIN_CSV))
train_df.columns = [c.strip() for c in train_df.columns]

# The rest of the notebook addresses these columns as 'ID' and 'TARGET'.
# Match them case-insensitively so 'id', 'Id', 'target', ... are all accepted;
# a column that is already spelled canonically is left alone.
for _canonical in ('ID', 'TARGET'):
    if _canonical in train_df.columns:
        continue
    _matches = [c for c in train_df.columns if c.upper() == _canonical]
    if _matches:
        train_df = train_df.rename(columns={_matches[0]: _canonical})

# Fail here with a readable message instead of a bare KeyError further down.
_missing_cols = [c for c in ('ID', 'TARGET') if c not in train_df.columns]
if _missing_cols:
    raise KeyError(
        f'{TRAIN_CSV} is missing required column(s) {_missing_cols}; '
        f'found {train_df.columns.tolist()}'
    )

print('Columns:', train_df.columns.tolist())

# Helper: find image path for an ID
from functools import lru_cache
@lru_cache(maxsize=None)
def find_image_path(image_id):
    """Resolve an image ID to the path of a file under TRAIN_DIR.

    Lookup order:
      1. ``TRAIN_DIR / image_id`` directly;
      2. a recursive search of TRAIN_DIR for a file named ``image_id``;
      3. a recursive search for the ID's stem plus '.jpg', '.jpeg' or '.png'
         (covers IDs given without, or with a different, extension).

    Only regular files are accepted, so a directory that happens to share the
    ID's name is skipped. Results are memoised per ID by ``lru_cache``.

    Args:
        image_id: File name (or bare stem) of the image, as a string.

    Returns:
        The matching path as a ``str``, or ``None`` when the ID is empty or
        nothing matches.
    """
    # An empty ID would make `TRAIN_DIR / image_id` resolve to TRAIN_DIR
    # itself and is not a valid glob pattern, so reject it up front.
    if not image_id:
        return None

    # Fast path: the file sits directly inside TRAIN_DIR.
    cand = TRAIN_DIR / image_id
    if cand.is_file():
        return str(cand)

    # Slower but robust: search the tree for the exact name first, then for
    # the stem combined with each common image extension.
    stem = Path(image_id).stem
    patterns = [image_id] + [stem + ext for ext in ('.jpg', '.jpeg', '.png')]
    for pattern in patterns:
        hit = next((p for p in TRAIN_DIR.rglob(pattern) if p.is_file()), None)
        if hit is not None:
            return str(hit)
    return None

# Resolve every ID to a file path; IDs that cannot be resolved come back as None.
train_df['filepath'] = train_df['ID'].astype(str).map(find_image_path)

unresolved = train_df['filepath'].isna()
missing = unresolved.sum()
print('Missing file paths:', missing)
if missing > 0:
    # Show a few of the offending rows.
    display(train_df[unresolved].head())

# Keep only rows whose image was found and renumber the index from 0.
train_df = train_df.loc[~unresolved].reset_index(drop=True)
train_df.shape


Columns: ['ID', 'TARGET']
Missing file paths: 0


(6400, 3)

## Label encoding and class distribution

In [4]:
# Fit the encoder on the class names, then store the integer code of each row.
le = LabelEncoder().fit(train_df['TARGET'])
train_df['label_enc'] = le.transform(train_df['TARGET'])

print('Number of classes:', len(le.classes_))

# Images per class, most frequent first.
class_counts = train_df['TARGET'].value_counts()
class_counts.head(20)


Number of classes: 20


TARGET
BASMATI       400
SIIRT         400
KIRMIZI       400
BINADHAN25    400
BINADHAN16    400
BRRI67        400
BINADHAN7     400
BD30          400
JASMINE       400
BD95          400
ARBORIO       400
IPSALA        400
BD72          400
KARACADAG     400
BR22          400
AK             80
NAZLI          80
DIMNIT         80
BUZGULU        80
ALA_IDRIS      80
Name: count, dtype: int64

## Stratified K-Fold

In [5]:
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# split() yields *positional* row indices. Collect the assignment in a plain
# array and attach it as a column, so the result does not depend on train_df's
# index labels (a label-based `.loc[val_idx]` write is only correct for a
# default 0..n-1 index).
fold_ids = np.full(len(train_df), -1, dtype=np.int64)
for fold, (_, val_idx) in enumerate(skf.split(train_df, train_df['label_enc'])):
    fold_ids[val_idx] = fold
assert (fold_ids >= 0).all(), 'every row must belong to exactly one validation fold'
train_df['fold'] = fold_ids

# Rows per validation fold.
train_df.groupby('fold')['ID'].count()


fold
0    1280
1    1280
2    1280
3    1280
4    1280
Name: ID, dtype: int64

## TF Dataset pipeline

In [6]:
# Size every image is resized to before batching; the model input shape is
# built from the same two values.
IMG_SIZE = (224,224)
# Default number of images per batch in the tf.data pipelines.
BATCH_SIZE = 32
# Sentinel that lets tf.data pick map parallelism / prefetch depth itself.
AUTOTUNE = tf.data.AUTOTUNE


def read_and_preprocess(img_path, label=None, img_size=IMG_SIZE, is_training=False):
    """Load one image file and turn it into a float32 tensor for the model.

    Args:
        img_path: Scalar string tensor (or str) with the path of the image file.
        label: Optional label passed through unchanged. When ``None`` only the
            image is returned.
        img_size: Size passed to ``tf.image.resize``.
        is_training: When true, apply random augmentation (horizontal flip,
            vertical flip, brightness jitter).

    Returns:
        ``img`` when ``label`` is None, otherwise the tuple ``(img, label)``.
        ``img`` is float32 with 3 channels on the 0-255 scale.
    """
    img = tf.io.read_file(img_path)
    # NOTE(review): the inference cell also feeds '.png' files through here.
    # TensorFlow documents decode_jpeg as accepting PNG input too, but
    # tf.io.decode_image would be the explicit choice - confirm before switching.
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, img_size)
    img = tf.cast(img, tf.float32)
    if is_training:
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_flip_up_down(img)
        # Pixels are on the 0-255 scale here and random_brightness adds its
        # delta as-is to float images, so express the 10% jitter on that scale
        # (a max_delta of 0.1 would shift pixels by at most 0.1 grey levels).
        img = tf.image.random_brightness(img, 0.1 * 255.0)
        # Keep the jittered image inside the valid pixel range.
        img = tf.clip_by_value(img, 0.0, 255.0)
    # Documented by Keras as a pass-through for EfficientNet (the rescaling
    # lives inside the model); kept so the pipeline stays explicit.
    img = tf.keras.applications.efficientnet.preprocess_input(img)
    if label is None:
        return img
    else:
        return img, label


def make_dataset(filepaths, labels=None, batch_size=BATCH_SIZE, is_training=False):
    """Build a batched, prefetched ``tf.data.Dataset`` from image file paths.

    Args:
        filepaths: Sequence of image paths (strings).
        labels: Optional sequence of labels aligned with ``filepaths``. When
            ``None`` the dataset yields images only and ``is_training`` is
            ignored (no shuffling, no augmentation).
        batch_size: Number of elements per batch.
        is_training: When true (and labels are given) the examples are
            shuffled and augmented.

    Returns:
        A dataset of image batches, or of ``(image, label)`` batches when
        labels are given. Images are always resized to ``IMG_SIZE``.
    """
    # Explicit string dtype: tf.constant([]) would otherwise be float32.
    files = tf.constant(filepaths, dtype=tf.string)
    if labels is None:
        ds = tf.data.Dataset.from_tensor_slices(files)
        ds = ds.map(lambda x: read_and_preprocess(x, None, IMG_SIZE, False), num_parallel_calls=AUTOTUNE)
    else:
        ds = tf.data.Dataset.from_tensor_slices((files, labels))
        if is_training:
            # Shuffle the (path, label) pairs *before* decoding: the buffer
            # then holds short strings instead of decoded float images, so it
            # can cover the whole split and give a uniform shuffle each epoch.
            n_files = int(files.shape[0])
            ds = ds.shuffle(buffer_size=max(n_files, 1), reshuffle_each_iteration=True)
        ds = ds.map(lambda x,y: read_and_preprocess(x, y, IMG_SIZE, is_training), num_parallel_calls=AUTOTUNE)
    ds = ds.batch(batch_size).prefetch(AUTOTUNE)
    return ds

print('Dataset utilities ready')


Dataset utilities ready


## Model builder (EfficientNetB0)

In [7]:
from tensorflow.keras import layers, models, optimizers, callbacks

def build_model(n_classes, input_shape=(224,224,3), train_base=False):
    """Create and compile an EfficientNetB0-based image classifier.

    The network is: ImageNet-initialised EfficientNetB0 (no top, global
    average pooling) -> Dropout(0.4) -> softmax Dense with ``n_classes`` units.

    Args:
        n_classes: Number of output classes.
        input_shape: Shape of one input image, ``(height, width, channels)``.
        train_base: Value assigned to the backbone's ``trainable`` flag.

    Returns:
        A Keras model compiled with Adam (learning rate 1e-4), sparse
        categorical cross-entropy loss and an accuracy metric.
    """
    backbone = tf.keras.applications.EfficientNetB0(
        include_top=False,
        weights='imagenet',
        input_shape=input_shape,
        pooling='avg',
    )
    backbone.trainable = train_base

    image_in = tf.keras.Input(shape=input_shape)
    features = tf.keras.applications.efficientnet.preprocess_input(image_in)
    # The backbone is always called with training=False.
    features = backbone(features, training=False)
    features = layers.Dropout(0.4)(features)
    class_probs = layers.Dense(n_classes, activation='softmax')(features)

    classifier = models.Model(image_in, class_probs)
    classifier.compile(
        optimizer=optimizers.Adam(learning_rate=1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Smoke test: build one model with a frozen backbone and print its layer summary.
_input_shape = tuple(IMG_SIZE[:2]) + (3,)
model = build_model(len(le.classes_), input_shape=_input_shape, train_base=False)
model.summary()


I0000 00:00:1758889029.175365      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1758889029.176177      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


## Training loop (fold-wise)

In [8]:
# Training settings
# RUN_TRAIN gates the whole loop below; when False only a message is printed.
RUN_TRAIN = True
# Upper bound on epochs per fold (EarlyStopping below may end a fold sooner).
EPOCHS = 15
BATCH_SIZE = 32
# Directory the per-fold checkpoints are written to.
MODEL_DIR = Path('/kaggle/working')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): tqdm is imported here but not used anywhere in this cell.
from tqdm import tqdm

# fold_hist: fold index -> Keras history dict of that fold's fit() call.
fold_hist = {}
# all_val_scores: one validation micro-F1 per trained fold, in fold order.
all_val_scores = []
# models_for_inference: the trained model of each fold, kept for ensembling.
models_for_inference = []

if RUN_TRAIN:
    for fold in range(N_FOLDS):
        print('\n=== Fold', fold, '===')
        # Rows tagged with this fold are held out for validation; all other
        # rows form the training split.
        train_f = train_df[train_df['fold']!=fold]
        val_f = train_df[train_df['fold']==fold]
        train_ds = make_dataset(train_f['filepath'].tolist(), train_f['label_enc'].values, batch_size=BATCH_SIZE, is_training=True)
        # is_training=False: no shuffling, so predictions on val_ds come back
        # in the same row order as val_f.
        val_ds = make_dataset(val_f['filepath'].tolist(), val_f['label_enc'].values, batch_size=BATCH_SIZE, is_training=False)

        # A fresh model per fold, with the backbone frozen (train_base=False).
        model = build_model(len(le.classes_), input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3), train_base=False)
        ckpt_path = MODEL_DIR / f'best_fold_{fold}.h5'
        # All three callbacks watch val_loss: checkpoint the best epoch, halve
        # the learning rate after 2 epochs without improvement, and stop after 3.
        cb = [
            callbacks.ModelCheckpoint(str(ckpt_path), monitor='val_loss', save_best_only=True, verbose=1),
            callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1),
            callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
        ]
        history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, callbacks=cb)
        fold_hist[fold] = history.history
        # Reload the checkpointed best-val_loss weights before the model is
        # evaluated and stored for inference.
        model.load_weights(str(ckpt_path))
        models_for_inference.append(model)
        # Score the held-out fold: argmax over the class probabilities, then
        # micro-averaged F1 against the encoded labels.
        val_preds = model.predict(val_ds)
        val_pred_labels = val_preds.argmax(axis=1)
        f1 = f1_score(val_f['label_enc'].values, val_pred_labels, average='micro')
        print(f'Fold {fold} micro F1: {f1:.4f}')
        all_val_scores.append(f1)

    # Cross-validation summary: per-fold scores and their mean.
    print('\nAll folds micro F1:', all_val_scores)
    print('Mean:', np.mean(all_val_scores))
else:
    print('Training skipped. Set RUN_TRAIN=True to train models.')



=== Fold 0 ===
Epoch 1/15


I0000 00:00:1758889059.294289     103 service.cc:148] XLA service 0x7912e0003390 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1758889059.295779     103 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1758889059.295799     103 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1758889061.286609     103 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  3/160[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11s[0m 72ms/step - accuracy: 0.0122 - loss: 3.4025      

I0000 00:00:1758889072.172893     103 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.1074 - loss: 2.9408
Epoch 1: val_loss improved from inf to 1.98765, saving model to /kaggle/working/best_fold_0.h5
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 127ms/step - accuracy: 0.1080 - loss: 2.9388 - val_accuracy: 0.5664 - val_loss: 1.9877 - learning_rate: 1.0000e-04
Epoch 2/15
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.4460 - loss: 1.9339
Epoch 2: val_loss improved from 1.98765 to 1.46847, saving model to /kaggle/working/best_fold_0.h5
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 56ms/step - accuracy: 0.4463 - loss: 1.9331 - val_accuracy: 0.7039 - val_loss: 1.4685 - learning_rate: 1.0000e-04
Epoch 3/15
[1m159/160[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 42ms/step - accuracy: 0.5991 - loss: 1.4784
Epoch 3: val_loss improved from 1.46847 to 1.17962, saving model to /kaggle/working/best_fo

## Inference on test set and submission

In [37]:
# Extensions treated as test images. '.jpeg' is included so the set matches the
# extensions the training-path lookup tries.
TEST_IMAGE_EXTS = {'.jpg', '.jpeg', '.png'}

if TEST_DIR.exists():
    # Collect every image file under TEST_DIR. Sorting gives a stable order
    # that test_ids and test_paths share.
    test_files = sorted(
        p for p in TEST_DIR.rglob("*")
        if p.is_file() and p.suffix.lower() in TEST_IMAGE_EXTS
    )
    test_ids = [p.name for p in test_files]
    test_paths = [str(p) for p in test_files]
elif SAMPLE_SUB.exists():
    # TEST_DIR is absent, so there is nothing to search: take the IDs from
    # sample_submission and point each one at its expected location.
    sample = pd.read_csv(str(SAMPLE_SUB))
    test_ids = sample['ID'].astype(str).tolist()
    test_paths = [str(TEST_DIR / tid) for tid in test_ids]
else:
    # Last resort: synthesise the ID list. The IDs seen in this notebook's
    # submission preview start at 0000.jpg and there are 1600 test samples.
    # NOTE(review): assumes the IDs are contiguous 0000..1599 - confirm.
    test_ids = [f'{i:04d}.jpg' for i in range(1600)]
    test_paths = [str(TEST_DIR/tid) for tid in test_ids]

print('Test samples:', len(test_ids))


Test samples: 1600


In [38]:
# Make test dataset (unshuffled, so predictions line up with test_ids)
test_ds = make_dataset(test_paths, labels=None, batch_size=BATCH_SIZE, is_training=False)

# Prepare inference models: when none are held in memory (e.g. training was
# skipped in this session), rebuild the ensemble from every fold checkpoint
# found in MODEL_DIR rather than from fold 0 alone.
if len(models_for_inference)==0:
    for ckpt in sorted(MODEL_DIR.glob('best_fold_*.h5')):
        print('Loading', ckpt)
        m = build_model(len(le.classes_), input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3), train_base=False)
        m.load_weights(str(ckpt))
        models_for_inference.append(m)

if len(models_for_inference)==0:
    print('No model available for inference. Set RUN_TRAIN=True and run training, or upload a checkpoint to /kaggle/working.')
else:
    # Average the class probabilities of all models. The running sum is
    # rebound, not updated in place, so no model's own output is overwritten.
    preds = None
    for m in models_for_inference:
        p = m.predict(test_ds, verbose=1)
        preds = p if preds is None else preds + p
    preds = preds / len(models_for_inference)

    # Guard against a silent ID/prediction misalignment in the submission.
    if len(preds) != len(test_ids):
        raise ValueError(f'Got {len(preds)} predictions for {len(test_ids)} test IDs')

    # Most probable class index per image, mapped back to the class name.
    pred_labels_idx = preds.argmax(axis=1)
    pred_labels = le.inverse_transform(pred_labels_idx)

    submission_df = pd.DataFrame({'ID': test_ids, 'TARGET': pred_labels})
    out_path = Path('/kaggle/working/submission.csv')
    submission_df.to_csv(out_path, index=False)
    print('Saved submission to', out_path)
    display(submission_df.head())

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 91ms/step
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 39ms/step
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 39ms/step
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 40ms/step
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 39ms/step
Saved submission to /kaggle/working/submission.csv


Unnamed: 0,ID,TARGET
0,0000.jpg,KARACADAG
1,0001.jpg,BRRI67
2,0002.jpg,BINADHAN16
3,0003.jpg,BINADHAN16
4,0004.jpg,KARACADAG
