In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Dec 20 21:25:37 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.33                 Driver Version: 546.33       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060      WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   37C    P8              N/A / 115W |   1124MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install torch>=1.9
!pip install fastai>=2.7
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install torchvision
!pip install tqdm
!pip install wandb



In [3]:
# Weights & Biases project and entity passed to every wandb.init call in this notebook.
WANDB_PROJECT = "cad_dl"
ENTITY = "iconet" # set this to team name if working in a team

# Artifact names: RAW_DATA_AT receives the image folders and raw tables,
# PROCESSED_DATA_AT receives the CSV splits derived from them.
RAW_DATA_AT = 'skin_data'
PROCESSED_DATA_AT = 'skin_data_proc'

In [None]:
import wandb
from sklearn.metrics import ConfusionMatrixDisplay
from IPython.display import display, Markdown
from fastai.vision.all import *
import pandas as pd
from fastai.callback.wandb import WandbCallback
import cv2

# EDA (Exploratory Data Analysis)

In this notebook, we download the given dataset (training, validation, and test archives) and use W&B Artifacts and Tables to version and analyze the data.

In [None]:
DEBUG = False # set this flag to True to use a small subset of data for testing

In [None]:
URL_TRAINING = 'http://dixie.udg.edu/CAD2022/Chall1/train.tgz'

In [None]:
path_training = Path(untar_data(URL_TRAINING, force_download=True))

In [None]:
path_training.ls()

In [None]:
URL_VAL = 'http://dixie.udg.edu/CAD2022/Chall1/val.tgz'

In [None]:
path_val = Path(untar_data(URL_VAL, force_download=True))

In [None]:
path_val.ls()

In [None]:
URL_TEST = 'http://dixie.udg.edu/CAD2022/Chall1/test.tgz'

In [None]:
path_test = Path(untar_data(URL_TEST, force_download=True))

In [None]:
path_test.ls()

In [None]:
def label_func(fname):
    """Derive the class label from an image file's name.

    Args:
        fname: path-like object exposing a ``.name`` string (e.g. ``pathlib.Path``).

    Returns:
        ``0`` if the file name starts with ``"nev"``, the string ``"None"`` if it
        starts with ``"xxx"`` (presumably the unlabeled test images - verify against
        the dataset), and ``1`` for every other name.
    """
    basename = fname.name
    # Prefixes are checked in this order; the first match decides the label.
    for prefix, label in (("nev", 0), ("xxx", "None")):
        if basename.startswith(prefix):
            return label
    return 1

def _add_images_to_table(table, image_files):
    "Append one row (file name without extension, image, split placeholder, label) per file"
    for image_file in progress_bar(image_files, total=len(image_files)):
        table.add_data(
            os.path.splitext(image_file.name)[0],
            wandb.Image(Image.open(image_file)),
            "None",  # we don't have a dataset split yet
            label_func(image_file),
        )


def _create_table(image_files, image_files_test=None, image_files_val=None):
    """Create a W&B table with one row per image of the given file lists.

    Args:
        image_files: required collection of image paths (must support ``len``).
        image_files_test: optional second collection, appended after ``image_files``.
        image_files_val: optional third collection, appended last.

    The parameter names are kept for compatibility. The groups are all handled
    the same way, and the notebook also passes e.g. the nevus and "others" file
    lists of a single split as the first two arguments.

    Returns:
        A ``wandb.Table`` with columns ``File_Name``, ``Images``, ``Split`` and
        ``Label``. ``Split`` is always the string ``"None"`` and ``Label`` comes
        from ``label_func``.
    """
    table = wandb.Table(columns=["File_Name", "Images", "Split", "Label"])

    _add_images_to_table(table, image_files)
    for optional_files in (image_files_test, image_files_val):
        if optional_files is not None:
            _add_images_to_table(table, optional_files)

    return table

In [None]:
run = wandb.init(project=WANDB_PROJECT, entity=ENTITY, job_type="upload")

In [None]:
raw_data_at = wandb.Artifact(RAW_DATA_AT, type="raw_data")

In [None]:
# raw_at = run.use_artifact(f'{RAW_DATA_AT}:latest')
# path = Path(raw_at.download())

In [None]:
raw_data_at.add_dir(path_training/'nevus', name='nevus_train_images')
raw_data_at.add_dir(path_training/'others', name='others_train_images')


raw_data_at.add_dir(path_val/'nevus', name='nevus_val_images')
raw_data_at.add_dir(path_val/'others', name='others_val_images')



In [None]:
raw_data_at.add_dir(path_test, name='xxx_test_images')

In [None]:
image_files_nevus = get_image_files(path_training/'nevus', recurse=False)
image_files_others = get_image_files(path_training/'others', recurse=False)

In [None]:
image_files_val_nevus = get_image_files(path_val/'nevus', recurse=False)
image_files_val_others = get_image_files(path_val/'others', recurse=False)

In [None]:
image_files_test = get_image_files(path_test, recurse=False)

In [None]:
table_train = _create_table(image_files_nevus, image_files_others)

In [None]:
table_val = _create_table(image_files_val_nevus, image_files_val_others)

In [None]:
table_test = _create_table(image_files_test)

In [None]:
raw_data_at.add(table_train, "raw_train_table")
raw_data_at.add(table_val, "raw_val_table")

In [None]:
raw_data_at.add(table_test, "raw_test_table")

In [None]:
run.log_artifact(raw_data_at)

In [None]:
run.finish()

# Data preparation

In [None]:
import os, warnings
import wandb

import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import *


warnings.filterwarnings('ignore')

In [None]:
run = wandb.init(project=WANDB_PROJECT, entity=ENTITY, job_type="data_prep")

In [None]:
raw_data_at = run.use_artifact(f'{RAW_DATA_AT}:latest')
path = Path(raw_data_at.download())

In [None]:
path.ls()

In [None]:
fnames_nevus = os.listdir(path/'nevus_train_images')
fnames_others = os.listdir(path/'others_train_images')
fnames_nevus_v = os.listdir(path/'nevus_val_images')
fnames_others_v = os.listdir(path/'others_val_images')

fnames_test = os.listdir(path/'xxx_test_images')

#groups = [s.split('-')[0] for s in fnames]

In [None]:
train_table = raw_data_at.get("raw_train_table")

In [None]:
val_table = raw_data_at.get("raw_val_table")

In [None]:
test_table = raw_data_at.get("raw_test_table")

In [None]:
y = train_table.get_column('Label')

In [None]:
x = train_table.get_column('File_Name')

In [None]:
x_val = val_table.get_column('File_Name')

In [None]:
y_val = val_table.get_column('Label')

In [None]:
x_test = test_table.get_column('File_Name')

In [None]:
y_test = test_table.get_column('Label')

In [None]:
df = pd.DataFrame()
df['File_Name'] = x

In [None]:
df["image_fname"] = [f'{f}.jpg' for f in df.File_Name.values]
df["Label"] = y

In [None]:
df2 = pd.DataFrame()
df2['File_Name'] = x_val

In [None]:
df['Stage'] = 'train'

In [None]:
df2["image_fname"] = [f'{f}.jpg' for f in df2.File_Name.values]
df2["Label"] = y_val

In [None]:
df2['Stage'] = 'valid'

In [None]:
df3 = pd.DataFrame()
df3['File_Name'] = x_test

In [None]:
df3["image_fname"] = [f'{f}.jpg' for f in df3.File_Name.values]
df3["Label"] = y_test

In [None]:
df3['Stage'] = 'test'

In [None]:
# Stack the train, validation and test frames (in that order) into one DataFrame.
# NOTE(review): the next cell reassigns `result` to a train+valid-only frame,
# so this value is only visible until that cell runs.
frames = [df, df2, df3]

result = pd.concat(frames)

In [None]:
# Combine the train and validation frames and order the rows by file name.
# df2's columns are renamed position-by-position to df's column names so that
# both frames line up on the same columns when concatenated.
result = pd.concat([
   df,
   df2.rename(columns=dict(zip(df2.columns, df.columns)))
]).sort_values("File_Name")

In [None]:
result

In [None]:
# Append the test frame to the train+valid frame and order the rows by file name.
# `rename` expects {old name: new name}, so df3's columns are the keys and
# result's columns are the values, mapped position-by-position. This is the same
# direction used when df2 is aligned with df.
result_final = pd.concat([
   result,
   df3.rename(columns=dict(zip(df3.columns, result.columns)))
]).sort_values("File_Name")

In [None]:
result_final

In [None]:
result_final.to_csv('data.csv', index=False)

In [None]:
result.to_csv('train_val.csv', index=False)

In [None]:
df.to_csv('train.csv', index=False)

In [None]:
df2.to_csv('val.csv', index=False)

In [None]:
df3.to_csv('test.csv', index=False)

In [None]:
processed_data_at = wandb.Artifact(PROCESSED_DATA_AT, type="csv_data")

In [None]:
processed_data_at.add_file('data.csv')
processed_data_at.add_file('test.csv')
processed_data_at.add_file('train.csv')
processed_data_at.add_file('train_val.csv')
processed_data_at.add_file('val.csv')
processed_data_at.add(train_table, "raw_train_table")
processed_data_at.add(val_table, "raw_val_table")
processed_data_at.add(test_table, "raw_test_table")

In [None]:
result_table = wandb.Table(dataframe=result_final[['File_Name', 'image_fname', 'Stage']])

In [None]:
processed_data_at.add(result_table, "data_result_table")

In [None]:
run.log_artifact(processed_data_at)
run.finish()

# Baseline solution

In [4]:
import wandb
import pandas as pd
import torchvision.models as tvmodels
from fastai.vision.all import *
from fastai.callback.wandb import WandbCallback

In [5]:
# Hyperparameters of the baseline run; passed to wandb.init as the run config.
train_config = SimpleNamespace(
    framework="fastai",
    img_size=(128, 128),
    batch_size=32,
    augment=None, # falsy -> get_data skips aug_transforms(); set to True to use data augmentation
    epochs=10,
    lr=2e-3,
    arch="resnet50",  # attribute name looked up on torchvision.models in train()
    pretrained=True,  # whether to use pretrained encoder
    seed=42,
)

In [6]:
set_seed(train_config.seed, reproducible=True)

In [7]:
run = wandb.init(project=WANDB_PROJECT, entity=ENTITY, job_type="training", config=train_config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33menriquefvrc[0m ([33miconet[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# processed_data_at = run.use_artifact(f'{PROCESSED_DATA_AT}:latest')
# processed_dataset_dir = Path(processed_data_at.download())
# df = pd.read_csv(processed_dataset_dir / 'data.csv')

In [None]:
# raw_data_at = run.use_artifact(f'{RAW_DATA_AT}:latest')
# raw_dir = Path(raw_data_at.download())

In [8]:
processed_data_at = wandb.use_artifact(f'{PROCESSED_DATA_AT}:latest')
processed_dataset_dir = Path(processed_data_at.download())

[34m[1mwandb[0m:   9 of 9 files downloaded.  


In [19]:
raw_data_at = wandb.use_artifact(f'{RAW_DATA_AT}:latest')
path = Path(raw_data_at.download())

[34m[1mwandb[0m: Downloading large artifact skin_data:latest, 27383.12MB. 50615 files... 
[34m[1mwandb[0m:   50615 of 50615 files downloaded.  
Done. 0:2:23.5


In [20]:
def get_df(processed_dataset_dir, is_test = False):
    """Load ``data.csv`` from the processed-data directory and keep one phase of it.

    Args:
        processed_dataset_dir: directory containing ``data.csv``; must support the
            ``/`` path operator.
        is_test: when falsy (default), keep every row whose ``Stage`` is not
            ``'test'`` and add a boolean ``is_valid`` column that is True where
            ``Stage == 'valid'``. When truthy, keep only the ``'test'`` rows.

    Returns:
        The selected rows with a fresh 0..n-1 index and the ``Label`` column
        converted to strings.
    """
    everything = pd.read_csv(processed_dataset_dir / 'data.csv')

    if is_test:
        df = everything[everything.Stage == 'test'].reset_index(drop=True)
    else:
        df = everything[everything.Stage != 'test'].reset_index(drop=True)
        df['is_valid'] = df.Stage == 'valid'

    # format(v) is the same conversion an f-string placeholder applies to v.
    df["Label"] = [format(label) for label in df.Label.values]

    return df

In [21]:
df = get_df(processed_dataset_dir, is_test = False)

In [22]:
def find_image_path(file_name):
    """Locate ``<file_name>.jpg`` inside the train/validation image folders.

    Uses the notebook-level ``path`` as the root directory and probes the folders
    in the order listed below.

    Returns:
        The first candidate path that exists, or ``None`` when no folder contains
        the file.
    """
    image_name = f'{file_name}.jpg'
    search_order = ('nevus_train_images', 'others_train_images', 'nevus_val_images', 'others_val_images')
    candidates = (path / folder / image_name for folder in search_order)
    # next() stops at the first existing file; the default covers "not found anywhere".
    return next((candidate for candidate in candidates if candidate.exists()), None)

# Apply the function to create a new column with the correct paths
df["image_fname"] = df["File_Name"].apply(find_image_path)

In [20]:
def get_data(df, bs=4, img_size=(128, 128), augment=True):
    """Build fastai dataloaders for image classification from a DataFrame.

    Args:
        df: frame with an ``image_fname`` column (image input) and a ``Label``
            column (category target); the train/valid split comes from
            ``ColSplitter()`` with its default column (presumably ``is_valid`` - confirm).
        bs: batch size.
        img_size: size passed to the ``Resize`` item transform.
        augment: when truthy, ``aug_transforms()`` is used as the batch transforms;
            otherwise no batch transforms are passed.

    NOTE(review): a later cell defines ``get_data`` again, and whichever cell runs
    last is the one in effect.
    """
    if augment:
        batch_tfms = aug_transforms()
    else:
        batch_tfms = None

    datablock = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_x=ColReader("image_fname"),
        get_y=ColReader("Label"),
        splitter=ColSplitter(),
        item_tfms=Resize(img_size),
        batch_tfms=batch_tfms,
    )
    return datablock.dataloaders(df, bs=bs)

In [12]:
def get_data(df, bs=4, img_size=(128, 128), augment=True):
    """Build fastai dataloaders with ImageNet normalization and optional augmentation.

    Args:
        df: frame with an ``image_fname`` column (image input) and a ``Label``
            column (category target); the train/valid split comes from
            ``ColSplitter()`` with its default column (presumably ``is_valid`` - confirm).
        bs: batch size.
        img_size: size passed to the ``Resize`` item transform.
        augment: when truthy, the transforms from ``aug_transforms()`` are applied
            per batch before normalization.

    Returns:
        The dataloaders created by ``DataBlock.dataloaders``.
    """
    # aug_transforms() already returns a list, so it is unpacked into a flat list
    # of transforms instead of being nested (or leaving a None entry when
    # augmentation is off).
    batch_tfms = [*aug_transforms()] if augment else []
    batch_tfms.append(Normalize.from_stats(*imagenet_stats))

    block = DataBlock(blocks=(ImageBlock, CategoryBlock),
                  get_x=ColReader("image_fname"),
                  get_y=ColReader("Label"),
                  splitter=ColSplitter(),
                  item_tfms=Resize(img_size),
                  batch_tfms=batch_tfms,
                 )
    return block.dataloaders(df, bs=bs)

In [21]:
config = wandb.config

In [22]:
dls = get_data(df, bs=config.batch_size, img_size=config.img_size, augment=config.augment)

  return getattr(torch, 'has_mps', False)


Let's train our model!

In [23]:
def log_final_metrics(learn):
    """Run validation on `learn` and write the results to the W&B run summary.

    The values returned by ``learn.validate()`` are stored, by position, under
    ``final_loss`` followed by ``final_<name>`` for each entry of ``learn.metrics``.
    """
    scores = learn.validate()
    summary_keys = ['final_loss']
    summary_keys.extend(f'final_{metric.name}' for metric in learn.metrics)
    final_results = {summary_keys[idx]: score for idx, score in enumerate(scores)}
    for key in final_results:
        wandb.summary[key] = final_results[key]

In [24]:
def train(config=None):
    """Train one model and log it to W&B, either directly or as a `wandb.agent` target.

    Args:
        config: hyperparameters for a stand-alone run (e.g. ``train_config``).
            Leave as None when launched by ``wandb.agent``; the sweep controller
            then provides the values through ``wandb.config``.

    Reads the notebook-level ``df``, ``WANDB_PROJECT`` and ``ENTITY`` and calls
    ``get_data`` and ``log_final_metrics``. The config must provide ``seed``,
    ``batch_size``, ``img_size``, ``augment``, ``arch``, ``pretrained``,
    ``epochs`` and ``lr``.
    """
    # Open exactly one run per call, carrying project/entity/job_type. The context
    # manager finishes the run on exit, so no explicit wandb.finish() is needed.
    with wandb.init(project=WANDB_PROJECT, entity=ENTITY, job_type="training", config=config):
        # If called by wandb.agent, this config will be set by the Sweep Controller
        config = wandb.config

        set_seed(config.seed, reproducible=True)

        dls = get_data(df, bs=config.batch_size, img_size=config.img_size, augment=config.augment)

        metrics = [Precision(), Recall(), F1Score(), MatthewsCorrCoef()]

        # config.arch is the name of a torchvision.models constructor (e.g. "resnet50")
        learn = vision_learner(dls, arch=getattr(tvmodels, config.arch), pretrained=config.pretrained, metrics=metrics)

        cbs = [WandbCallback(log_preds=True, log_model=True),
               SaveModelCallback(fname=f'run-{wandb.run.id}-model', monitor='f1_score')]

        learn.fit_one_cycle(config.epochs, config.lr, cbs=cbs)

        log_final_metrics(learn)

# Run the training

In [25]:
train(train_config)

  return getattr(torch, 'has_mps', False)
Traceback (most recent call last):
  File "C:\Users\Enrique\AppData\Local\Temp\ipykernel_15272\2727373245.py", line 19, in train
    metrics = [Precision(), Recall(), Accuracy(), F1Score(), MatthewsCorrCoef()]
                                      ^^^^^^^^
NameError: name 'Accuracy' is not defined


NameError: name 'Accuracy' is not defined

# Sweep RUN

In [25]:
# W&B sweep definition: Bayesian search that maximizes the logged `f1_score`.
sweep_config = {
    'method': 'bayes',
    'name': 'sweep',
    'metric': {
        'goal': 'maximize',
        'name': 'f1_score'
        },
    'parameters': {
        'batch_size': {'values': [8, 32]},
        'epochs': {'values': [10, 15]},
        'lr': {
            'distribution': 'log_uniform_values',
            'max': 1e-2,
            'min': 1e-5
        },
        # Real booleans, not the string 'True': any non-empty string (even 'False')
        # is truthy, and train_config uses bool/None for these same keys.
        'augment': {'value': True},
        'pretrained': {'value': True},
        'img_size': {'value': (256, 256)},
        'seed': {'values': [42]},
        'arch': {
            'values': ['vgg11', 'vgg13', 'vgg16','vgg19']
            }
        },
    'early_terminate': {
        'type': 'hyperband',
        'min_iter': 3
    }
}

In [26]:
sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT)

Create sweep with ID: 5kfgxmpr
Sweep URL: https://wandb.ai/iconet/cad_dl/sweeps/5kfgxmpr


In [27]:
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: ol4mmhxx with config:
[34m[1mwandb[0m: 	arch: vgg16
[34m[1mwandb[0m: 	augment: True
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	img_size: [256, 256]
[34m[1mwandb[0m: 	lr: 2.2019094169865347e-05
[34m[1mwandb[0m: 	pretrained: True
[34m[1mwandb[0m: 	seed: 42




Exception in thread Exception in thread IntMsgThrNetStatThr:
:
Traceback (most recent call last):
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.1776.0_x64__qbz5n2kfra8p0\Lib\threading.py", line 1045, in _bootstrap_inner
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.1776.0_x64__qbz5n2kfra8p0\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.1776.0_x64__qbz5n2kfra8p0\Lib\threading.py", line 982, in run
    self.run()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.1776.0_x64__qbz5n2kfra8p0\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Enrique\Desktop\Udg\CAD\Project\cad_dl\.venv\Lib\site-packages\wandb\sdk\wandb_run.py", line 268, in check_network_status
    self._target(*self._args, **self._kwargs)
  File "c:\Use

Traceback (most recent call last):
  File "C:\Users\Enrique\AppData\Local\Temp\ipykernel_15884\4260176863.py", line 17, in train
    dls = get_data(df, bs=config.batch_size, img_size=config.img_size, augment=config.augment)
          ^^^^^^^^
NameError: name 'get_data' is not defined
[34m[1mwandb[0m: [32m[41mERROR[0m Run ol4mmhxx errored: NameError("name 'get_data' is not defined")
