In [1]:
import os
import datasets
import torch
import numpy as np
import pandas as pd
from PIL import Image
from torchvision.transforms import Compose, Normalize, ToTensor, Resize
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

checkpoint = "microsoft/resnet-18"
folder_path = './images/'
csv_path = './scp_codes.csv'

2023-06-07 19:44:44.639554: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv(csv_path, delimiter=";")
images, labels = [], []

for _, row in df.iterrows():
    image_name = row['filename_hr']
    image_path = os.path.join(folder_path, f"{image_name}.jpg")
    if os.path.exists(image_path):
        images.append(Image.open(image_path))
        labels.append(row['normal'])

In [3]:
train_X, hold_X, train_y, hold_y = train_test_split(images[:1000], labels[:1000], test_size=0.2)
eval_X, test_X, eval_y, test_y = train_test_split(hold_X, hold_y, test_size=0.5)

train_dataset = datasets.Dataset.from_dict({"image": train_X, "label": train_y})
eval_dataset = datasets.Dataset.from_dict({"image": eval_X, "label": eval_y})
test_dataset = datasets.Dataset.from_dict({"image": test_X, "label": test_y})

In [4]:
extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
normalize = Normalize(mean=extractor.image_mean, std=extractor.image_std)
resize = Resize((extractor.size['shortest_edge'], extractor.size['shortest_edge']))
class DivideBy255:
    def __call__(self, image):
        return image / 255.0

transform = Compose([resize, ToTensor(), normalize])

def preprocess(example):
    example["pixel_values"] = [transform(image.convert('RGB')) for image in example["image"]]
    return example



In [5]:
train_dataset.set_transform(preprocess)
eval_dataset.set_transform(preprocess)
test_dataset.set_transform(preprocess)

In [6]:
train_dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2000x1200>,
 'label': 1,
 'pixel_values': tensor([[[1.3584, 1.6495, 1.6495,  ..., 1.6495, 1.6495, 1.6495],
          [1.8893, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2147],
          [1.9064, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2147],
          ...,
          [1.9064, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2147],
          [1.9064, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2147],
          [1.8893, 2.2147, 2.2147,  ..., 2.2147, 2.2147, 2.1975]],
 
         [[1.5182, 1.8158, 1.8158,  ..., 1.8158, 1.8158, 1.8158],
          [2.0609, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.3936],
          [2.0784, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.3936],
          ...,
          [2.0784, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.3936],
          [2.0784, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.3936],
          [2.0609, 2.3936, 2.3936,  ..., 2.3936, 2.3936, 2.3761]],
 
         [[1.7337, 2.0300, 2.0300,  ..., 2.0300, 2.0300, 2.0300],


In [7]:
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    label2id = {'Normal': 1, 'Abnormal': 0},
    id2label = {'1': 'Normal', '0': 'Abnormal'},
    ignore_mismatched_sizes = True,
    )

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-18 and are newly initialized because the shapes did not match:
- classifier.1.weight: found shape torch.Size([1000, 512]) in the checkpoint and torch.Size([2, 512]) in the model instantiated
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    output_dir="./output",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

metric = datasets.load_metric("accuracy")

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=extractor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)


  metric = datasets.load_metric("accuracy")


In [9]:
trainer.train()



  0%|          | 0/36 [00:00<?, ?it/s]

{'loss': 0.6445, 'learning_rate': 4.0625000000000005e-05, 'epoch': 0.8}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.48189041018486023, 'eval_accuracy': 0.75, 'eval_runtime': 8.2428, 'eval_samples_per_second': 12.132, 'eval_steps_per_second': 0.849, 'epoch': 0.96}
{'loss': 0.4221, 'learning_rate': 2.5e-05, 'epoch': 1.6}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.4240489900112152, 'eval_accuracy': 0.79, 'eval_runtime': 8.2148, 'eval_samples_per_second': 12.173, 'eval_steps_per_second': 0.852, 'epoch': 2.0}
{'loss': 0.3787, 'learning_rate': 9.375000000000001e-06, 'epoch': 2.4}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.41271865367889404, 'eval_accuracy': 0.78, 'eval_runtime': 8.1836, 'eval_samples_per_second': 12.22, 'eval_steps_per_second': 0.855, 'epoch': 2.88}
{'train_runtime': 395.2942, 'train_samples_per_second': 6.071, 'train_steps_per_second': 0.091, 'train_loss': 0.45118772983551025, 'epoch': 2.88}


TrainOutput(global_step=36, training_loss=0.45118772983551025, metrics={'train_runtime': 395.2942, 'train_samples_per_second': 6.071, 'train_steps_per_second': 0.091, 'train_loss': 0.45118772983551025, 'epoch': 2.88})

In [10]:
metrics = trainer.evaluate(test_dataset)

trainer.log_metrics("test", metrics)
trainer.save_metrics("test", metrics)

  0%|          | 0/7 [00:00<?, ?it/s]

***** test metrics *****
  epoch                   =       2.88
  eval_accuracy           =        0.8
  eval_loss               =     0.5051
  eval_runtime            = 0:00:07.88
  eval_samples_per_second =     12.689
  eval_steps_per_second   =      0.888
