In [1]:
import os, warnings
import wandb

import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import StratifiedGroupKFold

import params # local import
# Blanket filter: ignore all warnings raised after this point (no category restriction).
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open a W&B run for this notebook; the job type labels it as the data-splitting step.
run = wandb.init(
    project=params.WANDB_PROJECT,
    entity=params.ENTITY,
    job_type="data_split",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Register the ":latest" alias of the raw-data artifact as an input of this run,
# then download its files and keep the local directory as a Path.
raw_data_ref = f"{params.RAW_DATA_AT}:latest"
raw_data_artifact = run.use_artifact(raw_data_ref, type="raw_data")
download_dir = raw_data_artifact.download()
path = Path(download_dir)

[34m[1mwandb[0m: Downloading large artifact bdd_simple_1k:latest, 813.77MB. 4007 files... 
[34m[1mwandb[0m:   4007 of 4007 files downloaded.  
Done. 0:0:5.7


In [4]:
# Show what the downloaded artifact directory contains.
# NOTE(review): `ls` is not a stdlib pathlib.Path method — presumably added by the
# fastai star import; confirm.
path.ls()

(#5) [Path('artifacts/bdd_simple_1k-v0/eda_table.table.json'),Path('artifacts/bdd_simple_1k-v0/images'),Path('artifacts/bdd_simple_1k-v0/labels'),Path('artifacts/bdd_simple_1k-v0/LICENSE.txt'),Path('artifacts/bdd_simple_1k-v0/media')]

### Splitting the dataset using `StratifiedGroupKFold`

This process is similar to a regular train-test split or k-fold cross-validation, but it additionally takes groups and stratification into account. `StratifiedGroupKFold` is particularly useful when the samples fall into groups (here, images from the same geographical location) and/or the class distribution is imbalanced. It keeps every group entirely inside a single fold, so the same group never appears on both sides of a split, while keeping the distribution of target labels as similar as possible across folds. This reduces the risk of leakage between splits and of overfitting, and gives more accurate estimates of model performance.

In [5]:
# First, fetch the "eda_table" entry stored inside the raw-data artifact;
# its columns are read in the following cells.
eda_table = raw_data_artifact.get("eda_table")

[34m[1mwandb[0m: Downloading large artifact bdd_simple_1k:latest, 813.77MB. 4007 files... 
[34m[1mwandb[0m:   4007 of 4007 files downloaded.  
Done. 0:0:5.9


In [6]:
# Read the three columns that drive the split out of the EDA table:
#   "P1"        -> groups (geographical location)
#   "bicycle"   -> target label used for stratification
#   "File_name" -> file names
groups, target_label, fnames = (
    eda_table.get_column(column) for column in ("P1", "bicycle", "File_name")
)

In [7]:
# One row per file name; "fold" starts at the placeholder -1 until folds are assigned.
df = pd.DataFrame({"File_Name": fnames, "fold": -1})

In [8]:
# Peek at the first rows: every file still carries the placeholder fold -1.
df.head()

Unnamed: 0,File_Name,fold
0,0027eed2-09c90000,-1
1,0027eed2-09c90001,-1
2,00aad4a0-ee8135fe,-1
3,00d79c0a-23befe54,-1
4,00e69ee0-9656df95,-1


In [9]:
# Split into 10 stratified, group-aware folds and record, for each row,
# the index of the fold in which that row falls into the held-out part.
cv = StratifiedGroupKFold(n_splits=10)
fold_splits = cv.split(fnames, target_label, groups)
for fold_id, (_, holdout_rows) in enumerate(fold_splits):
    df.loc[holdout_rows, "fold"] = fold_id

In [10]:
# Check the first rows again: "fold" now holds the assigned fold index instead of -1.
df.head()

Unnamed: 0,File_Name,fold
0,0027eed2-09c90000,4
1,0027eed2-09c90001,4
2,00aad4a0-ee8135fe,5
3,00d79c0a-23befe54,6
4,00e69ee0-9656df95,7


In [11]:
# With 10 folds: fold 0 becomes the test set (10%), fold 1 the validation set (10%),
# and every other fold goes to training (80%).
stage_by_fold = {0: "test", 1: "valid"}
df["Stage"] = [stage_by_fold.get(fold_id, "train") for fold_id in df["fold"]]
del df["fold"]  # the fold index is no longer needed once the stage is set
df["Stage"].value_counts()

train    800
test     100
valid    100
Name: Stage, dtype: int64

In [12]:
# Write the File_Name -> Stage assignment to disk, without the index column.
df.to_csv("data_split.csv", index=False)

In [13]:
# Create the output artifact (type "split_data") that will carry the split and the data.
processed_data_artifact = wandb.Artifact(params.PROCESSED_DATA_AT, type="split_data")

In [14]:
# Attach the split CSV written earlier ...
processed_data_artifact.add_file("data_split.csv")
# ... and the whole downloaded raw-data directory.
processed_data_artifact.add_dir(path)

[34m[1mwandb[0m: Adding directory to artifact (.\artifacts\bdd_simple_1k-v0)... Done. 11.6s


In [15]:
# Wrap the File_Name / Stage columns in a W&B table.
data_split_table = wandb.Table(dataframe=df[["File_Name", "Stage"]])

In [16]:
# Join the EDA table with the split table on the file name. The key column is
# spelled "File_name" in eda_table (the name read from it via get_column earlier)
# but "File_Name" in data_split_table, so each table gets its own key; a single
# "File_Name" string would name a column that eda_table does not have.
join_table = wandb.JoinedTable(eda_table, data_split_table, ["File_name", "File_Name"])

In [17]:
# Store the joined table in the artifact under the given entry name.
# NOTE(review): "eda_table_data_slit" looks like a typo for "eda_table_data_split";
# left unchanged because consumers may look the entry up by this exact name —
# confirm before renaming.
processed_data_artifact.add(join_table, "eda_table_data_slit")

<ManifestEntry digest: qouFIb432UcqV3ybKdeR6A==>

In [18]:
# Upload the finished artifact as an output of this run, then close the run.
run.log_artifact(processed_data_artifact)
run.finish()