In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
import os
import time

# Import PyTorch libraries
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import copy
import multiprocessing

# Import custom libraries
from src.classifier import ResNet, Bottleneck
from src.utils import train, test, seed_everything
from src.trainer import Trainer

## Model Instantiation

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Seed before building the model so the initial weights are repeatable
# (presumably seed_everything seeds all relevant RNGs -- see src.utils).
seed_everything(42)

# NOTE(review): [3, 4, 6, 3] looks like the per-stage block counts of a
# ResNet-50 layout -- confirm against src.classifier.ResNet.
resnet_model = ResNet(Bottleneck, [3, 4, 6, 3])
resnet_model = resnet_model.to(device)

# Independent copy for experiments (e.g. the LR range test), wrapped in
# DataParallel, so the model that is actually trained stays untouched.
resnet_model_exp = nn.DataParallel(copy.deepcopy(resnet_model)).to(device)

## LR Finder

In [None]:
# Training-time preprocessing / augmentation applied to every sample.
train_transformation = transforms.Compose(
    [
        # PIL image -> float tensor scaled to [0, 1]; done first so the
        # following transforms operate on tensors.
        transforms.ToTensor(),
        # Random scale/aspect crop resized to 224x224.
        transforms.RandomResizedCrop(
            224,
            interpolation=transforms.InterpolationMode.BILINEAR,
            antialias=True,
        ),
        transforms.RandomHorizontalFlip(0.5),
        # Normalize the R, G and B channels with the standard ImageNet
        # statistics. The green-channel mean is 0.456 (the previous value
        # 0.485 was a copy of the red-channel mean).
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        ),
    ]
)

# ImageNet training split, read from a class-per-folder directory tree and
# passed through the training transformation pipeline.
train_dataset = datasets.ImageFolder(
    root="/home/ec2-user/ebs/volumes/imagenet/ILSVRC/Data/CLS-LOC/train",
    transform=train_transformation,
)

# Draw samples in random order.
train_sampler = torch.utils.data.RandomSampler(train_dataset)

# One loader worker per CPU core; pinned host memory is requested for the
# batches handed to the training device.
train_loader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=128,
    pin_memory=True,
    num_workers=multiprocessing.cpu_count(),
)

In [None]:
from torch_lr_finder import LRFinder

# The range test runs on the experimental copy of the network so that the
# weights of the model used for real training are never modified.
optimizer = optim.SGD(resnet_model_exp.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)

# Loss used to score each learning rate during the sweep.
# NOTE(review): assumes cross-entropy matches the loss used inside
# src.trainer.Trainer -- confirm.
criterion = nn.CrossEntropyLoss()

# Mixed-precision settings forwarded to the finder (float16 autocast on CUDA).
amp_config = {
    'device_type': 'cuda',
    'dtype': torch.float16,
}
grad_scaler = torch.cuda.amp.GradScaler()

# Pass the same model the optimizer was built from; the previous code
# referenced undefined names `model` and `criterion` (NameError on a fresh
# kernel).
lr_finder = LRFinder(
    resnet_model_exp,
    optimizer,
    criterion,
    device='cuda',
    amp_backend='torch',
    amp_config=amp_config,
    grad_scaler=grad_scaler
)
# Exponentially sweep the learning rate from the optimizer's lr (0.01) up to
# 10 over 100 iterations, then plot loss against learning rate.
lr_finder.range_test(train_loader, end_lr=10, num_iter=100, step_mode='exp')
lr_finder.plot()
# Restore the model and optimizer to the state they had before the sweep.
lr_finder.reset()

## Training

In [2]:
# Number of CPU cores available on this machine; the same value is used as the
# DataLoader worker count. `multiprocessing` is already imported in the
# notebook's import cell, so it is not re-imported here.
multiprocessing.cpu_count()

4

In [None]:
# Settings for the training run: dataset locations, loader parameters, number
# of epochs, and the directory where artifacts are written.
config = dict(
    train_path="/home/ec2-user/ebs/volumes/imagenet/ILSVRC/Data/CLS-LOC/train",
    val_path="/home/ec2-user/ebs/volumes/imagenet/imagenet_validation",
    batch_size=128,
    num_workers=multiprocessing.cpu_count(),  # one loader worker per core
    epochs=2,
    artifact_path="/home/ec2-user/ebs/volumes/era_session9",
)

# Resolve the device and re-seed immediately before the training setup so the
# run does not depend on random state consumed by earlier cells.
device = ('cuda' if torch.cuda.is_available() else 'cpu')
seed_everything(42)

# Optimizer for the model that is actually trained (not the experimental copy).
optimizer = optim.SGD(resnet_model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)

# Cut the learning rate by 10x once the monitored metric has stopped
# improving for 5 epochs, never going below 1e-6.
# mode="max" means the monitored metric is expected to increase.
# NOTE(review): presumably Trainer steps this with validation accuracy --
# confirm in src.trainer.
# The deprecated `verbose` argument is no longer passed; its default already
# disables the per-update message.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode="max",
    factor=0.1,
    patience=5,
    threshold_mode='rel',
    threshold=0.0001,
    min_lr=1e-6,
)

In [4]:
# Build the trainer from the objects prepared above. The keys of `config`
# (train_path, val_path, batch_size, num_workers, epochs, artifact_path) are
# exactly the remaining keyword arguments, so the dict is unpacked directly.
training = Trainer(
    model=resnet_model,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    **config,
)

# Start the training run.
training.main()

********* Epoch = 1 *********


loss=6.7770 batch_id=436:   4%|▍         | 437/10010 [08:54<3:15:17,  1.22s/it]


KeyboardInterrupt: 