In [1]:
import logging
import numpy as np
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score, classification_report, confusion_matrix
from transformers import AutoImageProcessor, DefaultDataCollator
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
from transformers import pipeline
from torchvision.transforms import ToPILImage
import matplotlib.pyplot as plt
import evaluate
import itertools
import sys
sys.path.append('../src')
import datafeed as feed

# Load Data

In [34]:
ls = feed.get_image_list()
#feed.collect_statistics(ls)
ds = feed.get_dataset_with_transform(ls, feed.transform_vit)

# Split Data

In [3]:
ds_split = feed.split_dataset_train_test_validate(ds)
ds_split

DatasetDict({
    validation: Dataset({
        features: ['label', 'image'],
        num_rows: 355
    })
    test: Dataset({
        features: ['label', 'image'],
        num_rows: 288
    })
    train: Dataset({
        features: ['label', 'image'],
        num_rows: 2902
    })
})

# Initialize Model

In [4]:
checkpoint = feed.CHECKPOINT_VIT_ONLINE
processor = feed.get_processor(checkpoint)

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(feed.id2label),
    id2label=feed.id2label,
    label2id=feed.label2id,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train Model

In [5]:
data_collator = DefaultDataCollator()
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1) # highest prediction per label index
    return accuracy.compute(predictions=predictions, references=labels)

Change the following cell to fine-tune the parameters. Note the first line which defines the name of the directoy under which the results will be saved after training. Check if this directory name is valid; e.g. check if saving the results would accidently overwrite previous models with different parameters.

In [6]:
MODEL_SAVE_DIR = "vit-32-6"   # ViT Model with batch size 32 and 6 epochs

training_args = TrainingArguments(
    output_dir=f"./local_checkpoints/",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_split["train"],
    eval_dataset=ds_split["test"],
    tokenizer=processor,
    compute_metrics=compute_metrics,
)


**This cell can take hours:**

In [7]:

train_results=trainer.train()

  0%|          | 0/132 [00:00<?, ?it/s]

{'loss': 0.6867, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.44}
{'loss': 0.6461, 'learning_rate': 4.745762711864407e-05, 'epoch': 0.88}


  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.6607353687286377, 'eval_accuracy': 0.625, 'eval_runtime': 24.2388, 'eval_samples_per_second': 11.882, 'eval_steps_per_second': 0.743, 'epoch': 0.97}
{'loss': 0.6064, 'learning_rate': 4.3220338983050854e-05, 'epoch': 1.32}
{'loss': 0.6048, 'learning_rate': 3.898305084745763e-05, 'epoch': 1.76}


  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.5971559286117554, 'eval_accuracy': 0.6770833333333334, 'eval_runtime': 23.6842, 'eval_samples_per_second': 12.16, 'eval_steps_per_second': 0.76, 'epoch': 1.98}
{'loss': 0.5894, 'learning_rate': 3.474576271186441e-05, 'epoch': 2.2}
{'loss': 0.5922, 'learning_rate': 3.050847457627119e-05, 'epoch': 2.64}


  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.5772931575775146, 'eval_accuracy': 0.7013888888888888, 'eval_runtime': 23.5318, 'eval_samples_per_second': 12.239, 'eval_steps_per_second': 0.765, 'epoch': 2.99}
{'loss': 0.5555, 'learning_rate': 2.627118644067797e-05, 'epoch': 3.08}
{'loss': 0.5599, 'learning_rate': 2.2033898305084748e-05, 'epoch': 3.52}
{'loss': 0.5784, 'learning_rate': 1.7796610169491526e-05, 'epoch': 3.96}


  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.5877248048782349, 'eval_accuracy': 0.6493055555555556, 'eval_runtime': 23.6413, 'eval_samples_per_second': 12.182, 'eval_steps_per_second': 0.761, 'epoch': 4.0}
{'loss': 0.5354, 'learning_rate': 1.3559322033898305e-05, 'epoch': 4.4}
{'loss': 0.5492, 'learning_rate': 9.322033898305085e-06, 'epoch': 4.84}


  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.5633439421653748, 'eval_accuracy': 0.7152777777777778, 'eval_runtime': 23.7163, 'eval_samples_per_second': 12.144, 'eval_steps_per_second': 0.759, 'epoch': 4.97}
{'loss': 0.5264, 'learning_rate': 5.084745762711865e-06, 'epoch': 5.27}
{'loss': 0.5491, 'learning_rate': 8.474576271186441e-07, 'epoch': 5.71}


  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.5665701627731323, 'eval_accuracy': 0.7152777777777778, 'eval_runtime': 23.7568, 'eval_samples_per_second': 12.123, 'eval_steps_per_second': 0.758, 'epoch': 5.8}
{'train_runtime': 4296.8054, 'train_samples_per_second': 4.052, 'train_steps_per_second': 0.031, 'train_loss': 0.5820643432212599, 'epoch': 5.8}


In [39]:
trainer.evaluate()

  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.5964696407318115,
 'eval_accuracy': 0.6527777777777778,
 'eval_runtime': 23.3199,
 'eval_samples_per_second': 12.35,
 'eval_steps_per_second': 0.772,
 'epoch': 5.8}

In [41]:
trainer.evaluate(eval_dataset=ds_split['validation'])

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.601269006729126,
 'eval_accuracy': 0.6816901408450704,
 'eval_runtime': 28.586,
 'eval_samples_per_second': 12.419,
 'eval_steps_per_second': 0.805,
 'epoch': 5.8}

In [42]:
trainer.predict(test_dataset=ds_split['validation'])

  0%|          | 0/23 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-0.45727688,  0.39374343],
       [ 0.19372267, -0.40982282],
       [-0.45725194,  0.3842692 ],
       [-0.33426622,  0.19398004],
       [-0.27707326,  0.25142244],
       [-0.07052396, -0.1930378 ],
       [ 0.04972164, -0.25155026],
       [-0.59427726,  0.5436399 ],
       [ 0.01345248, -0.26816756],
       [-0.18235205,  0.02675513],
       [-0.04115207, -0.19526823],
       [ 0.42058393, -0.67936426],
       [ 0.3364772 , -0.6116127 ],
       [-0.62883484,  0.72885936],
       [-0.7444541 ,  0.8014414 ],
       [ 0.35695732, -0.52664196],
       [-0.46527618,  0.4616736 ],
       [-0.6018098 ,  0.66557527],
       [ 0.18824115, -0.36064586],
       [-0.6143384 ,  0.34624022],
       [ 0.11123888, -0.3578469 ],
       [-0.6063022 ,  0.56417483],
       [-0.0026225 , -0.27700022],
       [ 0.11617779, -0.10700905],
       [ 0.6356916 , -0.6343152 ],
       [ 0.7863762 , -0.8869826 ],
       [ 0.39358395, -0.5815444 ],
       [ 0.19098374, -0.49

# Save Model

In [8]:
trainer.save_model(MODEL_SAVE_DIR)                   # Will save the model, so you can reload it using from_pretrained().
trainer.log_metrics("train", train_results.metrics)  
trainer.log_metrics("test", train_results.metrics)  
trainer.save_metrics("train", train_results.metrics) # save metrics into a .json file
trainer.save_state()

***** train metrics *****
  epoch                    =        5.8
  train_loss               =     0.5821
  train_runtime            = 1:11:36.80
  train_samples_per_second =      4.052
  train_steps_per_second   =      0.031
***** test metrics *****
  epoch                    =        5.8
  train_loss               =     0.5821
  train_runtime            = 1:11:36.80
  train_samples_per_second =      4.052
  train_steps_per_second   =      0.031


# Get Predictions for Evaluation Metrics

**Warning**: Untested code 

In [21]:
classifier = pipeline("image-classification", model=model, image_processor=processor)

def get_pred_and_probablities(ds):
    """Return all predicitions and probabilities for DS."""
    images = []
    y_pred = []
    y_prob = []
    for row in ds:
        images += [ ToPILImage()(row['pixel_values']) ]  # Convert tensor to PIL Image
    for img in images:
        pred = classifier(img)  # returns an array of dicts {'label', 'score'}
        y_pred.append(feed.label2id[pred[0]['label']]) # add first label as predicted label
        y_prob.append(pred[0]['score']) # add first score as predicted score

    return (np.array(y_pred), np.array(y_prob))

y_pred, y_prob = get_pred_and_probablities(ds_split['validation'])
y_true = np.array([row['label'] for row in ds_split['validation']])


In [33]:
import pandas as pd
from datetime import datetime

now = datetime.now().strftime("%H:%M")

# list(zip([1,2],[3,4],[5,6])) # -> [(1, 3, 5), (2, 4, 6)]
df = pd.DataFrame([ (true, pred, prob) for (true, pred, prob)  in zip(y_true, y_pred,y_prob)],
                  columns = ["true", "pred", "prob"])

df.to_csv(f"../data/{MODEL_SAVE_DIR}-{now}.csv", index=False, header=True)



In [22]:
confusion_matrix(y_pred, y_true)

array([[137,  54],
       [ 54, 110]])

In [23]:
accuracy_score(y_true, y_pred)

0.6957746478873239