In [None]:
# Weights & Biases destination (project and owning entity) for this run
ENTITY = 'course'
WANDB_PROJECT = "analysis3"

# Names of the input (raw) artifact and the output (split) artifact
RAW_DATA_ARTIFACT = 'dataset_23062023'
PROCESSED_DATA_ARTIFACT = 'dataset_23062023_split'

# Integer index -> class name lookup, numbered in list order starting at 0
CLASSES = dict(enumerate([
    'displaced',
    'enucleated',
    'irregular',
    'micronucleus',
    'normal',
    'other',
]))

# Configs

In [None]:
# Installs
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.4-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.26.0-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... done
Collecting

In [None]:
# Imports
import os
import wandb
import shutil
import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold

# Split

In [None]:
# Open a W&B run for the data-split job
run = wandb.init(
    project=WANDB_PROJECT,
    entity=ENTITY,
    job_type="data_split",
    name='split_2',
)

# Declare the latest raw-dataset artifact as an input of this run and download
# its contents; `path` is the local directory returned by the download.
raw_data_artifact = run.use_artifact(artifact_or_name=f'{RAW_DATA_ARTIFACT}:latest')
path = Path(raw_data_artifact.download())

In [None]:
# Fetch the "eda_table" entry stored inside the raw artifact
orig_eda_table = raw_data_artifact.get("eda_table")

# Extract the three columns used for splitting:
#   fnames - value of 'file_name' for each row
#   groups - value of 'original_file' for each row
#   y      - value of 'label' for each row (the stratification target below)
fnames, groups, y = (
    orig_eda_table.get_column(column_name)
    for column_name in ('file_name', 'original_file', 'label')
)

[34m[1mwandb[0m:   2130 of 2130 files downloaded.  


In [None]:
# One row per file name. `fold` starts at -1 as a placeholder until the
# fold-assignment step fills it in.
df = pd.DataFrame({'file_name': fnames})
df['fold'] = -1

In [None]:
# Label-stratified 10-fold partition: each row is tagged with the index of the
# fold in which it is held out. No shuffling is requested, so the assignment
# is deterministic for a given ordering of `fnames` / `y`.
# NOTE(review): `groups` (the 'original_file' column) is not passed to the
# splitter, so rows that share an original file can land in different folds -
# confirm this is intended.
cv = StratifiedKFold(n_splits=10)
for fold_id, (_, held_out_rows) in enumerate(cv.split(fnames, y)):
    df.loc[held_out_rows, 'fold'] = fold_id



In [None]:
# Hold out folds 0 and 1 for validation (2 of the 10 folds, ~20%); every other
# fold goes to training.
df['split'] = df['fold'].isin([0, 1]).map({True: 'valid', False: 'train'})

# Drop the helper column so that only the file name and its split remain
del df['fold']

# Show how many rows ended up in each split
df['split'].value_counts()

train    851
valid    214
Name: split, dtype: int64

In [None]:
# Write the split assignment to disk, without the row index column
df.to_csv(path_or_buf='data_split.csv', index=False)

In [None]:
# Build the output artifact: the split CSV plus the full directory of raw
# files that was downloaded earlier (`path`).
processed_data_artifact = wandb.Artifact(
    name=PROCESSED_DATA_ARTIFACT,
    type="split_data",
)
processed_data_artifact.add_file(local_path='data_split.csv')
processed_data_artifact.add_dir(local_path=path)

[34m[1mwandb[0m: Adding directory to artifact (./artifacts/dataset_23062023:v0)... Done. 1.1s


In [None]:
# Wrap the file_name/split columns in a W&B table and join it to the original
# EDA table on the shared "file_name" key, then store the joined table in the
# output artifact under the name "eda_table_data_split".
split_columns = ['file_name', 'split']
data_split_table = wandb.Table(dataframe=df[split_columns])
join_table = wandb.JoinedTable(
    table1=orig_eda_table,
    table2=data_split_table,
    join_key="file_name",
)
processed_data_artifact.add(obj=join_table, name="eda_table_data_split")

<wandb.sdk.artifacts.artifact_manifest_entry.ArtifactManifestEntry at 0x7eff64575b10>

In [None]:
# Upload the processed artifact as an output of this run
run.log_artifact(artifact_or_path=processed_data_artifact)

# Close the run now that the artifact has been logged
run.finish()