In [1]:
import os


In [2]:
# Move the working directory one level up from wherever the kernel started —
# presumably so relative paths and the `Skin_Cancer_Classifier` imports below
# resolve from the project root (TODO confirm the notebook's location).
# NOTE(review): not idempotent — re-running this cell climbs one more level each time.
os.chdir("../")

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    """Read-only bundle of the paths and hyper-parameters used by the training stage.

    ``frozen=True`` blocks attribute re-assignment after construction; it does
    not make mutable field values (e.g. the ``params_image_size`` list) immutable.
    Instances are produced by ``ConfigurationManager.get_training_config``.
    """
    root_dir: Path  # training output directory (config: training.root_dir)
    trained_model_path: Path  # root_dir / "model_<arch>.pth", target for the trained weights
    base_model_path: Path  # "model_<arch>.pth" under prepare_base_model.root_dir
    training_data: Path  # dataset directory (config: data_ingestion.unzip_dir)
    params_arch_name: str  # params: ARCH_NAME, also used in the weight file names
    params_num_classes: int  # params: CLASSES
    params_epochs: int  # params: EPOCHS
    params_batch_size: int  # params: BATCH_SIZE
    params_learning_rate: float  # params: LEARNING_RATE
    params_momentum: float  # params: MOMENTUM
    params_is_augmentation: bool  # params: AUGMENTATION
    params_image_size: list  # params: IMAGE_SIZE

In [4]:
from Skin_Cancer_Classifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from Skin_Cancer_Classifier.utils.common import read_yaml, create_directories


In [5]:
class ConfigurationManager:
    """Loads the project's config and params YAML files and derives stage configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Parameters
        ----------
        config_filepath :
            Path of the configuration YAML (defaults to ``CONFIG_FILE_PATH``).
        params_filepath :
            Path of the hyper-parameter YAML (defaults to ``PARAMS_FILE_PATH``).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        """Assemble the ``TrainingConfig`` for the training stage.

        Creates the training root directory as a side effect. Both the base and
        the trained weights files are named ``model_<ARCH_NAME>.pth``; they live
        under the prepare-base-model and training root directories respectively.
        """
        settings, hyper = self.config, self.params

        # Resolve every config section up front so a missing section fails
        # before any directory is created.
        train_section = settings.training
        base_section = settings.prepare_base_model
        dataset_dir = settings.data_ingestion.unzip_dir

        create_directories([Path(train_section.root_dir)])

        train_root = Path(train_section.root_dir)
        base_root = Path(base_section.root_dir)
        architecture = hyper.ARCH_NAME
        base_weights = base_root / f"model_{architecture}.pth"
        trained_weights = train_root / f"model_{architecture}.pth"

        return TrainingConfig(
            root_dir=train_root,
            trained_model_path=trained_weights,
            base_model_path=base_weights,
            training_data=Path(dataset_dir),
            params_arch_name=architecture,
            params_num_classes=hyper.CLASSES,
            params_epochs=hyper.EPOCHS,
            params_batch_size=hyper.BATCH_SIZE,
            params_learning_rate=hyper.LEARNING_RATE,
            params_momentum=hyper.MOMENTUM,
            params_is_augmentation=hyper.AUGMENTATION,
            params_image_size=hyper.IMAGE_SIZE,
        )

        

In [6]:
from Skin_Cancer_Classifier.components.prepare_base_model import Baseline
from Skin_Cancer_Classifier.components.data_ingestion import FedIsic2019, BaselineLoss, acc_metric
from torch.utils.data import ConcatDataset, DataLoader

In [None]:
import torch
from tqdm import tqdm

class Training:
    """Fine-tunes the prepared base model on the Fed-ISIC2019 training split.

    Expected call order: ``get_base_model()`` -> ``train_valid_generator()``
    -> ``train()``. ``train()`` writes the resulting weights to
    ``config.trained_model_path``.
    """

    def __init__(self, config: "TrainingConfig"):
        """Store the config and pick the first CUDA device when available, else CPU."""
        self.config = config
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def get_base_model(self):
        """Build the ``Baseline`` network and load the prepared base weights into it."""
        self.model = Baseline(True, self.config.params_arch_name, self.config.params_num_classes)
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
        state_dict = torch.load(self.config.base_model_path, map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)

    @staticmethod
    def _inverse_frequency_weights(targets, num_classes):
        """Return ``N / count_c`` per class as a float32 tensor.

        ``targets`` is any iterable of integer labels and ``N`` its length.
        Classes with no samples get their count clamped to 1 so the weight
        stays finite instead of dividing by zero.
        """
        labels = torch.as_tensor([int(t) for t in targets], dtype=torch.int64)
        counts = torch.bincount(labels, minlength=num_classes)
        return labels.numel() / counts.clamp(min=1).to(torch.float32)

    def train_valid_generator(self):
        """Create the train/validation loaders and the class weights for the loss."""
        trainset = FedIsic2019(train=True, data_path=self.config.training_data, centers=[0, 5])
        valset = FedIsic2019(train=False, data_path=self.config.training_data, centers=[0, 5])
        self.trainloader = DataLoader(trainset, batch_size=self.config.params_batch_size, shuffle=True)
        self.valloader = DataLoader(valset, batch_size=self.config.params_batch_size, shuffle=False)
        # Weights come from the label column rather than from iterating the
        # dataset, which would decode every image.
        # NOTE(review): assumes the imported FedIsic2019 exposes `.targets` — confirm.
        self.class_weights = self._inverse_frequency_weights(
            trainset.targets, self.config.params_num_classes
        )

    def train(self, criterion=None):
        """Run SGD for ``params_epochs`` epochs, then save the trained weights.

        Parameters
        ----------
        criterion : callable, optional
            Loss called as ``criterion(outputs, labels)``. Defaults to
            ``BaselineLoss`` weighted by the inverse class frequencies computed
            in ``train_valid_generator``.
        """
        if criterion is None:
            criterion = BaselineLoss(alpha=self.class_weights.to(self.device))

        # No-op when get_base_model already placed the model on the device.
        self.model.to(self.device)
        self.model.train()
        optimizer = torch.optim.SGD(
            self.model.parameters(),
            lr=self.config.params_learning_rate,
            momentum=self.config.params_momentum,
        )

        for _ in range(self.config.params_epochs):
            for batch in tqdm(self.trainloader):
                # Only the first two entries of a batch (images, labels) are used.
                images, labels = batch[0], batch[1]
                optimizer.zero_grad()
                loss = criterion(self.model(images.to(self.device)), labels.to(self.device))
                loss.backward()
                optimizer.step()

        self.save_model(
            path=self.config.trained_model_path,
            model=self.model
        )

    @staticmethod
    def save_model(path, model):
        """Write ``model``'s state dict to ``path``.

        A state dict (not the pickled module) is stored so the file can be
        read back with ``load_state_dict``, the same way ``get_base_model``
        reads the base weights.
        """
        torch.save(model.state_dict(), path)


        

In [28]:
import os, random
import numpy as np
import pandas as pd
import sklearn
import albumentations
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torchvision
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class Isic2019Raw(torch.utils.data.Dataset):
    """Pytorch dataset containing all the features, labels and datacenter
    information for Isic2019.

    Attributes
    ----------
    image_paths: list[str]
        the list with the path towards all features
    targets: pandas.Series
        the ``target`` column of the split file (classification labels)
    centers: pandas.Series
        the ``center`` column of the split file (datacenter of each sample)
    X_dtype: torch.dtype
        the dtype of the X features output
    y_dtype: torch.dtype
        the dtype of the y label output
    augmentations:
        image transform operations from the albumentations library,
        used for data augmentation
    input_path: str
        the dataset directory passed as ``data_path``
    dic: dict
        paths of the image folder (``input_preprocessed``) and of the split
        file (``train_test_split``) inside ``input_path``

    Parameters
    ----------
    X_dtype : torch.dtype, optional
        Default to torch.float32
    y_dtype : torch.dtype, optional
        Default to torch.int64
    augmentations : optional
        Callable invoked as ``augmentations(image=array)`` returning a mapping
        with an ``"image"`` entry. Default to None (no transform).
    data_path : str
        Existing directory holding the ``ISIC_2019_Training_Input_preprocessed``
        image folder and the ``train_test_split`` CSV file (columns ``image``,
        ``target`` and ``center`` are read). Required despite the None default.

    Raises
    ------
    ValueError
        If ``data_path`` is None or does not exist.
    """

    def __init__(self, X_dtype=torch.float32, y_dtype=torch.int64, augmentations=None, data_path=None):
        """
        Cf class docstring
        """
        # os.path.exists(None) raises TypeError, so None is rejected explicitly
        # to surface the intended ValueError.
        if data_path is None or not os.path.exists(data_path):
            raise ValueError(f"The string {data_path} is not a valid path.")

        self.input_path = data_path

        self.dic = {
            "input_preprocessed": os.path.join(self.input_path, "ISIC_2019_Training_Input_preprocessed"),
            "train_test_split": os.path.join(self.input_path, "train_test_split"),
        }
        self.X_dtype = X_dtype
        self.y_dtype = y_dtype

        split_df = pd.read_csv(self.dic["train_test_split"])
        self.image_paths = [
            os.path.join(self.dic["input_preprocessed"], image_name + ".jpg")
            for image_name in split_df.image.tolist()
        ]
        self.targets = split_df.target
        self.augmentations = augmentations
        self.centers = split_df.center

    def __len__(self):
        """Number of samples, i.e. number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, target)`` tensors for sample ``idx``.

        The image is loaded from disk, optionally augmented, and its last axis
        is moved first (H, W, C -> C, H, W for a 3-channel image).
        """
        # The context manager closes the file handle; np.array() forces the
        # pixel data to be read before that happens.
        with Image.open(self.image_paths[idx]) as img:
            image = np.array(img)
        target = self.targets[idx]

        # Image augmentations
        if self.augmentations is not None:
            image = self.augmentations(image=image)["image"]

        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        return (
            torch.tensor(image, dtype=self.X_dtype),
            torch.tensor(target, dtype=self.y_dtype),
        )


class FedIsic2019(Isic2019Raw):
    """
    Pytorch dataset containing the features and associated labels of one fold
    (train or test) of Isic2019, restricted to a chosen set of centers.
    The train/test split is fixed and given in the train_test_split file
    (``fold`` column); the center of each sample comes from its ``center`` column.

    Parameters
    ----------
    train : bool, optional
        Select the ``"train"`` fold (with random augmentations) when True, the
        ``"test"`` fold (center crop only) otherwise. Default to True
    X_dtype : torch.dtype, optional
        Default to torch.float32
    y_dtype : torch.dtype, optional
        Default to torch.int64
    data_path: str
        Directory containing the preprocessed images and the train_test_split
        file. Required despite the None default.
    centers : list[int] or int, optional
        Centers to keep. A single int is treated as a one-element list.
        Default to None, which keeps every center (pooled data).
    """

    def __init__(self, train: bool = True, X_dtype: torch.dtype = torch.float32, y_dtype: torch.dtype = torch.int64, data_path: str = None, centers: list = None):
        """Cf class docstring"""
        sz = 200  # side length of the square crop fed to the model
        if train:
            augmentations = albumentations.Compose(
                [
                    albumentations.RandomScale(0.07),
                    albumentations.Rotate(50),
                    albumentations.RandomBrightnessContrast(0.15, 0.1),
                    albumentations.Flip(p=0.5),
                    albumentations.Affine(shear=0.1),
                    albumentations.RandomCrop(sz, sz),
                    # random.randint is evaluated once, here, so the drawn value
                    # is fixed for the lifetime of this dataset instance.
                    albumentations.CoarseDropout(random.randint(1, 8), 16, 16),
                    albumentations.Normalize(always_apply=True),
                ]
            )
        else:
            augmentations = albumentations.Compose(
                [
                    albumentations.CenterCrop(sz, sz),
                    albumentations.Normalize(always_apply=True),
                ]
            )

        super().__init__(
            X_dtype=X_dtype,
            y_dtype=y_dtype,
            augmentations=augmentations,
            data_path=data_path,
        )

        # Series.isin only accepts list-likes, so promote a bare int.
        if isinstance(centers, int):
            centers = [centers]

        self.centers_list = centers
        self.train_test = "train" if train else "test"
        split_df = pd.read_csv(self.dic["train_test_split"])

        keep = split_df["fold"] == self.train_test
        if self.centers_list is not None:
            # None means "no center filter"; passing it to isin() would raise.
            keep = keep & split_df["center"].isin(self.centers_list)
        # Reset the index so positional access in __getitem__ lines up with
        # the filtered image_paths list.
        selected = split_df[keep].reset_index(drop=True)

        # Override the unfiltered lists built by the parent constructor.
        self.image_paths = [
            os.path.join(self.dic["input_preprocessed"], image_name + ".jpg")
            for image_name in selected.image.tolist()
        ]
        self.targets = selected.target
        self.centers = selected.center

In [29]:
# Directory with the ingested Fed-ISIC2019 data. Set the ISIC_DATA_DIR
# environment variable to run on another machine; the fallback is the path
# the notebook was originally executed with.
DATA_DIR = os.environ.get(
    "ISIC_DATA_DIR",
    '/home/fmlpc/Shashank/Course_Work/MLOPS/MLOPS_Project_Skin_Cancer_Detection/artifacts/data_ingestion',
)
# Centers whose samples are used for this experiment.
CENTERS = [0, 5]

trainset = FedIsic2019(train=True, data_path=DATA_DIR, centers=CENTERS)
testset = FedIsic2019(train=False, data_path=DATA_DIR, centers=CENTERS)

In [14]:
# Number of labels (= samples) in the train and test folds for centers 0 and 5.
len(trainset.targets), len(testset.targets)

(10281, 2571)

In [30]:
# Per-class sample counts of the training split (8 classes, index = label).
# Counted from the label column directly: iterating `trainset` would open and
# augment every image just to read its label.
# The name `weights` is kept because later cells use it; the values are counts.
weights = np.bincount(trainset.targets, minlength=8).tolist()

In [32]:
# Inverse-frequency class weights: total sample count divided by the count of
# each of the 8 classes, passed to the loss as its `alpha` argument.
N = len(trainset)
class_weights = torch.tensor(
    [N / weights[label] for label in range(8)],
    dtype=torch.float32,
)
lossfunc = BaselineLoss(alpha=class_weights)

In [34]:
# Total number of training samples (len(trainset)).
N

10281

In [33]:
# Per-class loss weights: N divided by each class's sample count.
class_weights

tensor([  4.3992,   2.8252,   4.5693,  16.9653,  11.0667, 101.7921, 115.5169,
         31.1545])

In [31]:
# Per-class sample counts of the training split (counts, despite the name).
weights

[2337, 3639, 2250, 606, 929, 101, 89, 330]