# Imports

In [None]:
import tempfile
from pathlib import Path

import pandas as pd
import wandb
from fastai.data.transforms import get_image_files

import project_config as pc

# Paths

In [2]:
# Local paths
# The `.fastai` folder is resolved under the current user's home directory rather
# than one machine's hard-coded profile path, so the notebook runs unchanged for
# other users. The values stay plain forward-slash strings ending in '/' because
# later cells build file paths from them by string concatenation.
root = Path.home().as_posix() + '/.fastai/'
data_folder = root + 'data/oxford-iiit-pet/'
images_folder = data_folder + 'images/'

# Data

In [3]:
# Gather data from the dataset
# Paths are stored relative to data_folder, with forward slashes
# (e.g. 'images/Abyssinian_1.jpg').
image_files = [f.relative_to(data_folder).as_posix() for f in get_image_files(images_folder)]

# File names look like '<breed>_<index>.<ext>'. Drop the directory and only the
# final extension, then split on the LAST underscore, because breed names can
# contain underscores themselves (e.g. 'yorkshire_terrier_95').
stems = [path.rsplit('/', 1)[-1].rsplit('.', 1)[0] for path in image_files]
name_parts = [stem.rpartition('_') for stem in stems]
labels_breeds = [breed for breed, _sep, _index in name_parts]
# 'group' is the numeric suffix of the file name, kept as a string.
groups = [index for _breed, _sep, index in name_parts]

# Dataset naming convention: cat breeds start with an uppercase letter, dog breeds
# with a lowercase one. Only the first character is inspected, so the result does
# not depend on how the rest of the breed name happens to be capitalised.
labels_animals = ['cat' if breed[:1].isupper() else 'dog' for breed in labels_breeds]

# Combine in a single dataframe (one row per image)
df = pd.DataFrame({'file_path': image_files,
                   'group': groups,
                   'label_breed': labels_breeds,
                   'label_animal': labels_animals})
df

Unnamed: 0,file_path,group,label_breed,label_animal
0,images/Abyssinian_1.jpg,1,Abyssinian,cat
1,images/Abyssinian_10.jpg,10,Abyssinian,cat
2,images/Abyssinian_100.jpg,100,Abyssinian,cat
3,images/Abyssinian_101.jpg,101,Abyssinian,cat
4,images/Abyssinian_102.jpg,102,Abyssinian,cat
...,...,...,...,...
7385,images/yorkshire_terrier_95.jpg,95,yorkshire_terrier,dog
7386,images/yorkshire_terrier_96.jpg,96,yorkshire_terrier,dog
7387,images/yorkshire_terrier_97.jpg,97,yorkshire_terrier,dog
7388,images/yorkshire_terrier_98.jpg,98,yorkshire_terrier,dog


# Log data and EDA

In [None]:
# Upload the dataset (images folder, data.csv and an EDA table) to W&B as one artifact.
#
# Both resources are managed by a single `with` statement. Context managers exit in
# reverse order, so the run is finished first and only then is the temporary
# directory holding data.csv removed. Using the run as a context manager also
# closes it when any step below raises, instead of leaving an open run behind in
# the kernel.
with tempfile.TemporaryDirectory() as temp_dir, \
        wandb.init(project=pc.WANDB_PROJECT,
                   entity=pc.WANDB_ENTITY,
                   dir=pc.WANDB_LOCAL_LOGS_PATH,
                   job_type='data_upload') as run:

    # Create artifact and link data
    dataset_artifact = wandb.Artifact(pc.DATASET_ARTIFACT_NAME, type='dataset')
    dataset_artifact.add_dir(images_folder, 'images')

    # Save dataframe to a temporary file and add it to the artifact
    csv_path = str(Path(temp_dir) / 'data.csv')
    df.to_csv(csv_path, index=False)
    dataset_artifact.add_file(csv_path, name='data.csv')

    # Create table referencing local files: the rendered image comes first,
    # followed by every dataframe column in its original order.
    table = wandb.Table(columns=['image'] + list(df.columns))
    for row in df.itertuples(index=False):
        table.add_data(wandb.Image(data_folder + row.file_path), *row)
    dataset_artifact.add(table, 'eda_table')

    # Log artifact; the run itself is finished when the `with` block exits
    run.log_artifact(dataset_artifact)

[34m[1mwandb[0m: Currently logged in as: [33mfacuroffet99[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (C:\Users\Facu\.fastai\data\oxford-iiit-pet\images)... Done. 6.6s
