In [None]:
!pip install huggingface_hub datasets detectors
!pip install mlflow
!pip install torch torchinfo torchvision pytorch-lightning optuna
!pip install timm
!pip install torchmetrics

In [None]:
#@title Common Imports
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import utils
from torch.utils.data import DataLoader
from torchsummary import summary
import pytorch_lightning as pl

from torchvision import transforms


In [None]:
# Load variables from a local .env file into os.environ, so that the auth cell
# below can find MLFLOW_TRACKING_URI there instead of asking for it.
from dotenv import load_dotenv
load_dotenv()

In [None]:
#@title MLFlow Tracking Auth
import mlflow
import os
from getpass import getpass

# MLflow auth.
# Credentials are resolved in this order:
#   1. already present in the environment (e.g. loaded from .env),
#   2. Colab secrets (google.colab.userdata),
#   3. interactive prompt.
MLFLOW_ENV_VARS = (
    'MLFLOW_TRACKING_URI',
    'MLFLOW_TRACKING_USERNAME',
    'MLFLOW_TRACKING_PASSWORD',
)


def _mlflow_credentials_from_colab():
    """Return the MLflow credentials stored as Colab secrets, or None if unavailable.

    The import and the secret lookup are guarded separately: if the import
    fails, `userdata` does not exist, so its SecretNotFoundError cannot be
    named in the same `except` clause.
    """
    try:
        from google.colab import userdata
    except ImportError:
        # Not running on Colab.
        return None
    try:
        return {name: userdata.get(name) for name in MLFLOW_ENV_VARS}
    except userdata.SecretNotFoundError:
        # At least one secret is missing; let the caller fall back to prompting.
        return None


def _mlflow_credentials_from_prompt():
    """Ask the user for the MLflow credentials (the password is not echoed)."""
    return {
        'MLFLOW_TRACKING_URI': input('Enter MLflow uri: '),
        'MLFLOW_TRACKING_USERNAME': input('Enter your MLflow username: '),
        'MLFLOW_TRACKING_PASSWORD': getpass('Enter your MLflow password: '),
    }


if 'MLFLOW_TRACKING_URI' not in os.environ:
    credentials = _mlflow_credentials_from_colab() or _mlflow_credentials_from_prompt()
    os.environ.update(credentials)

if 'MLFLOW_TRACKING_URI' in os.environ:
    mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

    experiment = mlflow.set_experiment("imagewoof_test")

    # NOTE(review): get_artifact_uri() appears to start a run implicitly when
    # none is active, which is why end_run() is called right after - confirm.
    print(mlflow.get_artifact_uri())

    mlflow.end_run()
    # Smoke test: open a run and show where its artifacts will be stored.
    with mlflow.start_run():
        print(mlflow.get_artifact_uri())
        print(experiment.artifact_location)

ModuleNotFoundError: No module named 'mlflow'

In [None]:
#@title HF auth
from huggingface_hub import login
import os
from getpass import getpass

# Resolve the Hugging Face token. The Colab secret is tried first (the original
# behaviour); outside Colab, or when the secret is missing, fall back to an
# existing HF_TOKEN environment variable (e.g. from .env) and finally to a prompt.
try:
    from google.colab import userdata
except ImportError:
    # Not running on Colab.
    hf_token = None
else:
    try:
        hf_token = userdata.get('HF_TOKEN')
    except userdata.SecretNotFoundError:
        hf_token = None

if not hf_token:
    hf_token = os.environ.get('HF_TOKEN') or getpass('Enter your Hugging Face token: ')

# Export the token for libraries that read HF_TOKEN, then log in explicitly.
os.environ['HF_TOKEN'] = hf_token
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
#@title Model download
import timm

# Create the pretrained ViT-L/16 (224px) model through timm with a 10-class output.
model_name = 'timm/vit_large_patch16_224'
model = timm.create_model(
    model_name,
    pretrained=True,
    num_classes=10,
)

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

In [None]:
#@title HF dataset load and datamodule def
import datasets
from datasets import load_dataset
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform


# TODO: expose pinned memory and other DataLoader optimizations as constructor params,
# and add a helper method that builds the three loaders.
# TODO: review the training optimization notes.
# Still unsure which of those optimizations Lightning applies for you; it does not seem to do anything at the dataset level.
class ImageWoofHGData(pl.LightningDataModule):
  """LightningDataModule for the Hugging Face `frgfm/imagewoof` (`full_size`) dataset.

  ImageWoof only ships `train` and `validation` splits, so the validation
  split is cut in half: the first 50% is used for validation and the second
  50% for testing.

  Args:
    batch_size: batch size used by all three dataloaders.
    model: timm model whose `pretrained_cfg` defines the preprocessing
      transform. Required by the time `setup` is called.
  """

  def __init__(self, batch_size: int = 32, model=None):
    super().__init__()
    # The model is excluded from the saved hyperparameters; only batch_size is stored.
    self.save_hyperparameters(ignore=['model'])
    self.batch_size = batch_size
    self.model = model
    # Filled by setup() as [train, validation, test].
    self.datasets = []

  # The datamodule guide suggests downloading in prepare_data() and splitting in
  # setup(); here load_dataset() does both at once, so everything lives in setup().
  def setup(self, stage: str):
    """Load, split and preprocess the dataset (all splits, whatever `stage` is).

    Raises:
      ValueError: if the datamodule was created without a model.
    """
    # setup() may be called manually and is called again by the Trainer;
    # the (expensive) load + map only needs to happen once.
    if self.datasets:
      return

    if self.model is None:
      raise ValueError("ImageWoofHGData needs a `model` to derive its preprocessing transform")

    # NOTE(review): trust_remote_code=True executes the dataset repository's
    # loading script - only keep it for sources you trust.
    splits = load_dataset("frgfm/imagewoof",
                          "full_size",
                          trust_remote_code=True,
                          split=['train', 'validation[:50%]', 'validation[50%:]'])

    # Build the transform from the model owned by this datamodule (not a global).
    vit_transform = create_transform(**resolve_data_config(self.model.pretrained_cfg, model=self.model))

    def apply_transform(examples):
      # With batched=True each column arrives as a list with one entry per example.
      examples["pixels"] = [vit_transform(image.convert("RGB")) for image in examples["image"]]
      return examples

    prepared = []
    for split in splits:
      split = split.map(apply_transform, remove_columns=["image"], batched=True)
      prepared.append(split.with_format("torch"))
    self.datasets = prepared

  def train_dataloader(self):
    """Shuffled loader over the train split."""
    return DataLoader(self.datasets[0], batch_size=self.batch_size, shuffle=True)

  def val_dataloader(self):
    """Loader over the first half of the original validation split."""
    return DataLoader(self.datasets[1], batch_size=self.batch_size)

  def test_dataloader(self):
    """Loader over the second half of the original validation split."""
    return DataLoader(self.datasets[2], batch_size=self.batch_size)

# apparently there is also a predict

# datamodule = ImageWoofHGData()
# datamodule.setup('fit')
# train_loader = datamodule.train_dataloader()
# for batch in train_loader:
#   print(batch)
#   break

In [None]:
# Resolve the preprocessing config from the model's pretrained config, build the
# matching transform, then print both the transform and the model for inspection.
data_config = resolve_data_config(model.pretrained_cfg, model=model)
vit_transform = create_transform(**data_config)
print(vit_transform)
print(model)

Compose(
    Resize(size=248, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    MaybeToTensor()
    Normalize(mean=tensor([0.5000, 0.5000, 0.5000]), std=tensor([0.5000, 0.5000, 0.5000]))
)
VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), 

In [None]:
#@title datamodule init
# Create the datamodule for the loaded model and prepare its splits up front.
datamodule = ImageWoofHGData(batch_size=16, model=model)
datamodule.setup(stage='fit')

NameError: name 'ImageWoofHGData' is not defined

In [None]:
#@title Model def and init(including lighting module def and loop)
import timm
import torch.optim as optim

class LightningVitL(pl.LightningModule):
  """LightningModule that trains a classification backbone with cross-entropy loss.

  Batches are expected to be dicts with a 'pixels' entry (model input) and a
  'label' entry (targets), as read by `compute_metrics`.

  Args:
    backbone: the model to train; called as `backbone(x)`.
    learning_rate: learning rate passed to the Adam optimizer.
  """

  def __init__(self, backbone, learning_rate=1e-4):
    super().__init__()
    # Saves the init args as hyperparameters, accessible as self.hparams.ARG.
    # The backbone is ignored: per the Lightning warning it is already saved
    # as part of checkpointing.
    self.save_hyperparameters(ignore=['backbone'])
    self.backbone = backbone
    self.loss = nn.CrossEntropyLoss()

  # Note: this does not take effect in practice, because the autologger overrides it.
  #
  # Done here because the hijacked tensorboard autologger does not log the params
  # otherwise, and this way the Trainer params can be included as well.
  def on_train_start(self):
    """Log module, datamodule and selected Trainer settings as hyperparameters."""
    # should really use the decision code from here:
    # https://github.com/Lightning-AI/pytorch-lightning/blob/5dea36c5e2969aa8823213d6602e058db093ec57/src/lightning/pytorch/loggers/utilities.py#L59
    # could also be done as a callback on the trainer itself

    # Guard against a missing logger (e.g. the Trainer was created with
    # logger=False) instead of crashing on `None.log_hyperparams`.
    if self.logger is None:
      return

    trainer = self.trainer
    params = {**self.hparams, **trainer.datamodule.hparams} if trainer.datamodule else self.hparams
    self.logger.log_hyperparams({
        **params,
        "precision": trainer.precision,
        "accumulate_grad_batches": trainer.accumulate_grad_batches,
        "gradient_clip_val": trainer.gradient_clip_val,
        "gradient_clip_algorithm": trainer.gradient_clip_algorithm,
        "lr_scheduler_configs": trainer.lr_scheduler_configs,
    })

  def forward(self, x):
    """Run the backbone on `x` and return its output."""
    return self.backbone(x)

  def compute_metrics(self, batch, batch_idx):
    """Return `(loss, accuracy)` for one batch.

    `loss` is the cross-entropy tensor; `accuracy` is a Python float: the
    fraction of examples whose arg-max score equals the label.
    """
    x, y = batch['pixels'], batch['label']
    scores = self.forward(x)
    loss = self.loss(scores, y)

    # Accuracy: predicted class is the arg-max over dim 1 of the scores.
    labels_hat = torch.argmax(scores, dim=1)
    accuracy = torch.sum(y == labels_hat).item() / (float(len(y)))
    return loss, accuracy

  def training_step(self, batch, batch_idx):
    """Log train metrics and return the loss for backpropagation."""
    loss, accuracy = self.compute_metrics(batch, batch_idx)
    self.log_dict({'train_loss': loss, 'train_acc': accuracy})
    return loss

  def validation_step(self, batch, batch_idx):
    """Log validation metrics."""
    loss, accuracy = self.compute_metrics(batch, batch_idx)
    self.log_dict({'val_loss': loss, 'val_acc': accuracy})

  def test_step(self, batch, batch_idx):
    """Log test metrics."""
    loss, accuracy = self.compute_metrics(batch, batch_idx)
    self.log_dict({'test_loss': loss, 'test_acc': accuracy})

  # MUST RETURN THE OPTIMIZER
  def configure_optimizers(self):
    """Return an Adam optimizer over all parameters, using the saved learning rate."""
    optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
    return optimizer
    # lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    # return {'optimizer': optimizer,
    #         'lr_scheduler': {
    #             'scheduler': lr_scheduler,
    #             'interval': 'step',
    #             'frequency': 1
    #             }
    #         }


# print(model)
# for name, param in model.named_parameters():
#   if param.requires_grad:
#     print(name)

# print(is_model_frozen_anywhere(model))
# Wrap the timm backbone in the Lightning module; learning_rate keeps its default value.
lightning_model = LightningVitL(backbone=model)
# Compile the model
# lightning_model = torch.compile(lightning_model)

In [None]:
#@title Training
from pytorch_lightning import Trainer
from pytorch_lightning.tuner import Tuner

import mlflow.pytorch
from mlflow import MlflowClient


class log_hyperparameters(pl.Callback):
    """Callback that logs module, datamodule and selected Trainer settings as hyperparameters."""

    def on_train_start(self, trainer, pl_module):
        """Send the merged hyperparameters to the module's logger when training starts."""
        logger = pl_module.logger
        # No logger attached (e.g. Trainer(logger=False)): nothing to log to.
        if logger is None:
            return

        print("logging hyperparameters")
        # Datamodule hparams override module hparams on key clashes.
        params = {**pl_module.hparams, **trainer.datamodule.hparams} if trainer.datamodule else pl_module.hparams
        logger.log_hyperparams({
            **params,
            "precision": trainer.precision,
            "accumulate_grad_batches": trainer.accumulate_grad_batches,
            "gradient_clip_val": trainer.gradient_clip_val,
            "gradient_clip_algorithm": trainer.gradient_clip_algorithm,
            "lr_scheduler_configs": trainer.lr_scheduler_configs,
        })

class nvidia_smi(pl.Callback):
    """Callback that runs a custom action once every `n_steps` training batches.

    Args:
        n_steps: number of training batches between two actions; must be positive.

    Raises:
        ValueError: if `n_steps` is not positive (it is used as a modulus).
    """

    def __init__(self, n_steps):
        super().__init__()
        if n_steps <= 0:
            raise ValueError("n_steps must be a positive integer")
        self.n_steps = n_steps
        # Batches seen since the last action, kept in [0, n_steps).
        self.current_step = 0

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        """Advance the batch counter and fire the action when it wraps to zero."""
        self.current_step = (self.current_step + 1) % self.n_steps
        if self.current_step == 0:
            # Your custom logic here
            # The global step lives on the Trainer; the Callback has no such attribute.
            print(f"Custom action at global step {trainer.global_step}")


# Even this callback is still being overridden by the autologger.
trainer = Trainer(
    max_epochs=10,
    callbacks=[log_hyperparameters()],
)

# Auto log all MLflow entities
mlflow.pytorch.autolog(
    log_every_n_step=10,
    checkpoint_save_freq='epoch',
    checkpoint_save_best_only=False,
)
# mlflow.pytorch.autolog(log_every_n_step=10)

# Train the model inside an explicit MLflow run; `run` is used after training
# to look the run up again.
with mlflow.start_run() as run:
  print(run.info)
  trainer.fit(lightning_model, datamodule=datamodule)


  # from pytorch_lightning.loggers import MLFlowLogger
  # mlf_logger = MLFlowLogger(experiment_name="lightning_logs", log_model='all', tracking_uri=userdata.get('MLFLOW_TRACKING_URI'))
  # trainer = Trainer(enable_checkpointing='all', log_every_n_steps=1, logger=mlf_logger, max_epochs=10)

  # trainer.fit(lightning_model, datamodule=datamodule)


  # checkpoint_callback = pl.callbacks.ModelCheckpoint(every_n_train_steps=1)
  # trainer = Trainer(log_every_n_steps=1, logger=mlf_logger, max_epochs=10, callbacks=[checkpoint_callback])

  # trainer = Trainer(log_every_n_steps=10, max_epochs=10, callbacks=[checkpoint_callback])

  # trainer = Trainer(log_every_n_steps=1,
  #                   max_epochs=10,
  #                   callbacks=[checkpoint_callback, nvidiasmi(10)])


  # datamodule = ImageWoofHGData(model=model, batch_size=16)
  # will crash after using all available ram
  # tuner = Tuner(trainer)
  # tuner.scale_batch_size(lightning_model, mode="binsearch", datamodule=datamodule) # will override LIGHTNING MODULES HPARAM (so trainer doesnt need result)

# at least I Know how to use these parts...
  # profiler = pl.profiler.PyTorchProfiler(profile_memory=False)
  # trainer.fit(lightning_model, datamodule=datamodule, profiler=profiler)

  # trainer.fit(lightning_model, datamodule=datamodule)

def print_auto_logged_info(r):
    """Print a summary of the MLflow run `r`.

    Shows the run id, the paths of the artifacts listed under "model", the
    logged params and metrics, and every tag whose key does not start with
    "mlflow.".
    """
    # Keep only tags outside the "mlflow." namespace.
    tags = {}
    for key, value in r.data.tags.items():
        if key.startswith("mlflow."):
            continue
        tags[key] = value

    artifacts = []
    for artifact in MlflowClient().list_artifacts(r.info.run_id, "model"):
        artifacts.append(artifact.path)

    print(f"run_id: {r.info.run_id}")
    print(f"artifacts: {artifacts}")
    print(f"params: {r.data.params}")
    print(f"metrics: {r.data.metrics}")
    print(f"tags: {tags}")

# Fetch the finished training run again and print what was logged for it.
print_auto_logged_info(mlflow.get_run(run.info.run_id))