In [None]:
# installation of Weights & Biases
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.35.0-py2.py3-none-any.whl (248 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.6/248.6 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [None]:
# Importing libraries
import logging
import tempfile
import pandas as pd
import os
import wandb
from sklearn.model_selection import train_test_split

In [None]:
# Authenticate this environment with Weights & Biases.
# `--relogin` forces a new login even when a user is already logged in.
# The command prompts for the API key interactively (so the key is not written
# in the notebook source) and appends it to the local netrc file.
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## 1.3 Data segregation


In [None]:
# Notebook parameters

# --- Weights & Biases artifacts ---

# fully qualified name of the input artifact that is fetched from wandb and
# then divided into training and testing sets
artifact_input_name = "flamigos/Murshroom-Kmeans/clean_data_dataset:latest"

# type assigned to the artifacts created for each split
artifact_type = "segregated_data"

# --- Train/test split ---

# fraction of the rows held out for testing: 20% testing, 80% training.
# A single hold-out division was chosen as the simplest starting point.
test_size = 0.2

# random seed, fixed so that the split is controlled and can be reproduced
seed = 41

# column used to stratify the data: its values keep an equal proportion
# in both the training and the testing sets
stratify = "class"

In [None]:
# Configure logging for the notebook.
# NOTE: `logging.basicConfig` does nothing when the root logger already has
# handlers, which can be the case inside hosted notebook kernels (e.g. Colab);
# the INFO messages below would then never be displayed. `force=True`
# (Python 3.8+) removes any pre-existing root handlers so that this
# configuration is actually applied.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(message)s",
                    datefmt='%d-%m-%Y %H:%M:%S',
                    force=True)

# reference to the root logger, used by the cells below
logger = logging.getLogger()

# initializing a new wandb run that records this data-splitting step
run = wandb.init(entity="flamigos", project="Murshroom-Kmeans", job_type="split_data")

# mark the input artifact as used by this run, fetch its file and load it
logger.info("Downloading and reading artifact")
artifact = run.use_artifact(artifact_input_name)
artifact_path = artifact.file()
df = pd.read_csv(artifact_path)


logger.info("Splitting data into train and test")
splits = {}

# divide the dataset into training and testing sets and store both in the
# `splits` dictionary; the split is seeded and stratified on the configured
# column so that its value proportions are preserved in both sets
splits["train"], splits["test"] = train_test_split(df,
                                                   test_size=test_size,
                                                   random_state=seed,
                                                   stratify=df[stratify])

[34m[1mwandb[0m: Currently logged in as: [33mfrancisco-valmir[0m ([33mflamigos[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Save each split as its own artifact. A temporary directory holds the
# intermediate CSV files so that no traces are left on disk afterwards.
with tempfile.TemporaryDirectory() as tmp_dir:

    # Distinct loop names (`split_df`, `split_artifact`) are used on purpose:
    # reusing `df` / `artifact` here would silently overwrite the full dataset
    # and the input artifact defined by the previous cell.
    for split, split_df in splits.items():

        # artifact (and file) name derived from the dictionary key
        artifact_name = f"{split}.csv"

        # path of the CSV file inside the temporary directory
        temp_path = os.path.join(tmp_dir, artifact_name)

        logger.info(f"Uploading the {split} dataset to {artifact_name}")

        # write the split to the temporary file, without the index column
        split_df.to_csv(temp_path, index=False)

        split_artifact = wandb.Artifact(name=artifact_name,
                                        type=artifact_type,
                                        description=f"{split} split of dataset {artifact_input_name}",
        )
        split_artifact.add_file(temp_path)

        logger.info("Logging artifact")
        run.log_artifact(split_artifact)

        # Wait until the artifact has been uploaded to W&B. Without this, the
        # temporary directory may be removed before the upload has finished,
        # and the upload may fail.
        split_artifact.wait()

In [None]:
# Close the wandb run that was opened with `wandb.init` earlier in the notebook.
run.finish()

VBox(children=(Label(value='0.358 MB of 0.367 MB uploaded\r'), FloatProgress(value=0.9763064100597669, max=1.0…