# Imports

In [None]:
import os
import wandb
import tempfile
import pandas as pd

from pathlib import Path
from sklearn.model_selection import StratifiedGroupKFold

import project_config as pc

# Retrieve artifact

In [None]:
# Init run: this notebook is tracked in W&B as a 'data_processing' job
run = wandb.init(
	project=pc.WANDB_PROJECT,
	entity=pc.WANDB_ENTITY,
	dir=pc.WANDB_LOCAL_LOGS_PATH,
	job_type='data_processing',
)

# Declare the latest dataset version as an input of this run
dataset_artifact = run.use_artifact(f'{pc.DATASET_ARTIFACT_NAME}:latest')

# Local folder for this artifact: <artifacts path>/<last component of wandb's default root>.
# - os.path.join (instead of '+') so the result does not depend on whether
#   pc.WANDB_LOCAL_ARTIFACTS_PATH ends with a path separator.
# - .name (instead of .stem) so a dot inside the folder name is not treated as a
#   file extension and cut off.
# NOTE(review): _default_root() is a private wandb method; presumably its last
# component is '<artifact name>:<version>' — confirm against the installed wandb version.
dataset_dir = os.path.join(
	pc.WANDB_LOCAL_ARTIFACTS_PATH,
	Path(dataset_artifact._default_root()).name,
)

# Download latest dataset version (if not already downloaded).
# NOTE(review): only the folder's existence is checked, so a folder left behind by an
# interrupted download is reused as-is; delete it manually to force a fresh download.
if not os.path.exists(dataset_dir):
	_ = dataset_artifact.download(root=dataset_dir)

[34m[1mwandb[0m: Currently logged in as: [33mfacuroffet99[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact oxford-iiit-pet:latest, 757.96MB. 7392 files... 
[34m[1mwandb[0m:   7392 of 7392 files downloaded.  
Done. 0:0:20.9 (36.3MB/s)


# Data processing (filtering, cleaning, etc.)

In [4]:
# Load the table shipped with the downloaded artifact
df = pd.read_csv(f'{dataset_dir}/data.csv')

# Keep every row whose breed is not 'Abyssinian', then renumber the rows
# so that index labels run 0..n-1 again
keep_row = df['label_breed'].ne('Abyssinian')
df = df.loc[keep_row].reset_index(drop=True)
df

Unnamed: 0,file_path,group,label_breed,label_animal
0,images/american_bulldog_10.jpg,10,american_bulldog,dog
1,images/american_bulldog_100.jpg,100,american_bulldog,dog
2,images/american_bulldog_101.jpg,101,american_bulldog,dog
3,images/american_bulldog_102.jpg,102,american_bulldog,dog
4,images/american_bulldog_103.jpg,103,american_bulldog,dog
...,...,...,...,...
7185,images/yorkshire_terrier_95.jpg,95,yorkshire_terrier,dog
7186,images/yorkshire_terrier_96.jpg,96,yorkshire_terrier,dog
7187,images/yorkshire_terrier_97.jpg,97,yorkshire_terrier,dog
7188,images/yorkshire_terrier_98.jpg,98,yorkshire_terrier,dog


# Data split

In [6]:
# Splits parameters: the data is cut into n_splits folds; the folds listed below
# become the validation / test sets, every other fold is training data
n_splits = 10
valid_splits_ids = [0, 1]
test_splits_ids = [2]

# Fail early on a misconfiguration instead of silently producing wrong sets
assert not set(valid_splits_ids) & set(test_splits_ids), \
	'a fold cannot be used for both validation and test'
assert all(0 <= fold_id < n_splits for fold_id in [*valid_splits_ids, *test_splits_ids]), \
	'fold ids must be in range(n_splits)'

# Select columns for splits
X = df['file_path'].values
y = df['label_breed'].values     # stratify on breed
groups = df['group'].values      # rows sharing a group value stay in the same fold

# Create splits (fixed seed so the fold assignment is reproducible)
cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=18)
df['split'] = -1
split_col_pos = df.columns.get_loc('split')
for i, (train_idxs, valid_idxs) in enumerate(cv.split(X, y, groups)):
	# cv.split yields *positional* indices, so write by position (.iloc) rather than
	# by label (.loc); only the held-out part of each fold is needed here
	df.iloc[valid_idxs, split_col_pos] = i
assert (df['split'] >= 0).all(), 'every row must belong to exactly one fold'

# Assign splits
df['is_valid'] = df['split'].isin(valid_splits_ids)
df['is_test'] = df['split'].isin(test_splits_ids)
df = df.drop(columns=['split'])

# Separate dataframe: the test rows go to their own table; the remaining rows keep
# the 'is_valid' flag to tell training and validation apart
df_train_valid = df[~df['is_test']].drop(columns=['is_test'])
df_test = df[df['is_test']].drop(columns=['is_test', 'is_valid'])
assert len(df_train_valid) + len(df_test) == len(df)

# Log new artifact version

In [7]:
# Create new artifact version (a draft based on the dataset artifact used above)
new_dataset_artifact = dataset_artifact.new_draft()

# Save dataframes in temporary files and add them to the artifact.
# Logging the artifact and finishing the run happen INSIDE the `with` block so the
# temporary files still exist while the artifact is logged and the run is closed;
# that way the upload never depends on whether add_file has already copied them.
with tempfile.TemporaryDirectory() as temp_dir:
	train_valid_csv = os.path.join(temp_dir, 'dataset.csv')
	df_train_valid.to_csv(train_valid_csv, index=False)
	new_dataset_artifact.add_file(train_valid_csv, 'dataset.csv')

	# The test table is optional: it is only written when it has rows
	if len(df_test) > 0:
		test_csv = os.path.join(temp_dir, 'dataset_test.csv')
		df_test.to_csv(test_csv, index=False)
		new_dataset_artifact.add_file(test_csv, 'dataset_test.csv')

	# Log artifact and finish run
	run.log_artifact(new_dataset_artifact)
	run.finish()