In [None]:
# W&B project and entity the run is logged under
WANDB_PROJECT = "analysis3"
ENTITY = 'course'

# Names of the W&B dataset artifacts
RAW_DATA_ARTIFACT = 'dataset_23062023'
PROCESSED_DATA_ARTIFACT = 'dataset_23062023_split'

# Mapping from class index to class name
CLASSES = dict(enumerate([
    'displaced',
    'enucleated',
    'irregular',
    'micronucleus',
    'normal',
    'other',
]))

# Configs

In [None]:
# Installs
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.4-py3-none-any.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 20.8 MB/s eta 0:00:00
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 184.3/184.3 kB 14.8 MB/s eta 0:00:00
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.26.0-py2.py3-none-any.whl (209 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 kB 15.7 MB/s eta 0:00:00
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... done
Collecting

In [None]:
# Imports
from fastai.vision.all import *
import wandb
import shutil
import pandas as pd

# Functions

In [None]:
# Functions for data processing and table creation

def label_func(fname):
    "Label for `fname`: the value of `parent_label(fname)`, with 'notched', 'segmented' and 'tailed' remapped to 'irregular'."
    folder_label = parent_label(fname)
    # These three labels are merged into the single 'irregular' class
    if folder_label in ('notched', 'segmented', 'tailed'):
        return 'irregular'
    return folder_label

def _create_table(image_files, class_labels):
    """Create a `wandb.Table` with one row per file in `image_files`.

    Columns:
      - "file_name": parent folder name + '/' + file name
      - "original_file": second `_`-separated piece of the file name
      - "image": the image wrapped in `wandb.Image`
      - "label": class returned by `label_func`
      - "split": always None (no split has been assigned yet)

    `class_labels` is kept for interface compatibility; it is not used to
    build the table.

    Raises IndexError if a file name contains no underscore.
    """
    table = wandb.Table(columns=["file_name", "original_file", "image", "label", "split"])

    for image_file in progress_bar(image_files, total=len(image_files)):
        # Open through a context manager so the file handle is released
        # as soon as the row has been added.
        with Image.open(image_file) as image:
            table.add_data(
                str(image_file.parent.name+'/'+image_file.name),  # image folder + filename
                str(image_file.name).split('_')[1],                 # original image filename (without crop)
                wandb.Image(image),                                 # the image
                label_func(image_file),                             # image class
                None                                                # split, we don't have one yet
            )

    return table

In [None]:
import random

def _get_files(p, fs, extensions=None):
    p = Path(p)
    res = [p/f for f in fs if not f.startswith('.')
           and ((not extensions) or f'.{f.split(".")[-1].lower()}' in extensions)]
    return res

def get_limited_files(path, files_number=600, extensions=image_extensions, recurse=True, folders=None, followlinks=True, seed=18):
    """Get at most `files_number` files from each directory under `path`.

    Files are filtered by `extensions` (case-insensitive; hidden files are
    skipped). Within each directory the matching files are sorted, shuffled
    with `random.Random(seed)` and truncated to `files_number`, so the same
    directory contents always yield the same selection.

    - `recurse=True`: walk `path` with `os.walk` and apply the limit to every
      directory visited. If `folders` is given, only those first-level
      sub-folders are descended into, and files directly in `path` are skipped
      unless '.' is listed in `folders`.
    - `recurse=False`: only files directly in `path` are considered (the limit
      still applies).

    Returns an `L` of paths.
    """
    path = Path(path)
    folders = L(folders)
    extensions = setify(extensions)
    extensions = {e.lower() for e in extensions}

    def _limit(files):
        # Directory listing order is filesystem-dependent; sort first so the
        # seeded shuffle picks the same files on every run.
        files = sorted(files)
        random.Random(seed).shuffle(files)
        return files[:files_number]

    if recurse:
        res = []
        for i,(p,d,f) in enumerate(os.walk(path, followlinks=followlinks)): # returns (dirpath, dirnames, filenames)
            # Prune the walk in place: keep only the requested top-level folders,
            # or (everywhere else) drop hidden directories. Sorted for a stable order.
            if len(folders) !=0 and i==0: d[:] = sorted(o for o in d if o in folders)
            else:                         d[:] = sorted(o for o in d if not o.startswith('.'))
            # Skip files sitting directly in `path` unless '.' was requested
            if len(folders) !=0 and i==0 and '.' not in folders: continue
            res += _limit(_get_files(p, f, extensions))
    else:
        f = [o.name for o in os.scandir(path) if o.is_file()]
        res = _limit(_get_files(path, f, extensions))
    return L(res)

# Filter images

In [None]:
# Project root on the mounted drive; the images are stored in its `data` sub-folder
project_folder = Path('/content/drive/MyDrive/Eritrocitos/')
path = project_folder.joinpath('data')

In [None]:
# Print number of images for each class, one count per line
# (the third count pools the 'irregular', 'tailed', 'notched' and 'segmented' folders)
folders_per_class = [
    ['displaced'],
    ['enucleated'],
    ['irregular', 'tailed', 'notched', 'segmented'],
    ['micronucleus'],
    ['normal'],
    ['other'],
]
for class_folders in folders_per_class:
    print(sum(len(get_image_files(path/folder)) for folder in class_folders))

58
8
285
18
2871
296


In [None]:
# Get images: keep at most this many files from each folder
MAX_FILES_PER_FOLDER = 400
image_files = get_limited_files(path, files_number=MAX_FILES_PER_FOLDER)

In [None]:
# Create the local staging directory (safe to re-run: no error if it already exists)
images_dir = Path('/content/images')
images_dir.mkdir(parents=True, exist_ok=True)

# Add images: these class folders are copied in full from the data folder.
# A missing source folder raises instead of being silently skipped.
full_copy_folders = ['displaced', 'enucleated', 'irregular', 'micronucleus',
                     'notched', 'other', 'segmented', 'tailed']
for class_folder in full_copy_folders:
    shutil.copytree(path/class_folder, images_dir/class_folder, dirs_exist_ok=True)

# For mature erythrocytes ('normal'), only add the images that will be used
normal_dir = images_dir/'normal'
normal_dir.mkdir(exist_ok=True)
for image_file in image_files:
    if image_file.parent.name == 'normal':
        shutil.copy(image_file, normal_dir/image_file.name)

# W&B

In [None]:
# Start the W&B upload run and create the (still empty) raw-data artifact
run = wandb.init(
    project=WANDB_PROJECT,
    entity=ENTITY,
    job_type="upload",
    name="eda",
)
raw_data_artifact = wandb.Artifact(name=RAW_DATA_ARTIFACT, type="raw_data")

In [None]:
# Attach the staged image folder to the artifact under the name 'images'
raw_data_artifact.add_dir(
    local_path='/content/images',
    name='images',
)

[34m[1mwandb[0m: Adding directory to artifact (/content/images)... Done. 0.7s


In [None]:
# Build the EDA table from the selected images and attach it to the artifact as "eda_table"
table = _create_table(image_files=image_files, class_labels=CLASSES)
raw_data_artifact.add(obj=table, name="eda_table")

<wandb.sdk.artifacts.artifact_manifest_entry.ArtifactManifestEntry at 0x7fa9dc6681f0>

In [None]:
# Upload the artifact to W&B, then close the run
run.log_artifact(artifact_or_path=raw_data_artifact)
run.finish()