In [None]:
# installation of Weights & Biases
!pip install wandb

In [None]:
# Importing libraries
import wandb
import pandas as pd
import numpy as np
import tempfile
import logging
import os

## Preprocessing

In [None]:
# Authenticate with Weights & Biases from the shell.
# --relogin forces a new API-key prompt even if a login is already stored.
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Start the W&B run under which the raw dataset is fetched and preprocessed
run = wandb.init(
    entity="flamigos",
    project="Murshroom-Kmeans",
    job_type="process_data",
)

[34m[1mwandb[0m: Currently logged in as: [33mfrancisco-valmir[0m ([33mflamigos[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Declare the raw dataset artifact as an input of this run
artifact = run.use_artifact("Murshroom-Kmeans/raw_data_dataset:latest")

# Read the artifact's file into a dataframe and display it
raw_data_file = artifact.file()
df = pd.read_csv(raw_data_file)
df

### Checking NaN values

In [None]:
# column labels of the dataset
col = df.columns

# number of null entries in each column, in the same order as `col`
nulls = [df[name].isnull().sum() for name in col]

# summary table: one row per column with its null count
data = dict(columns=col, nulls=nulls)
murshroom_col_nulls = pd.DataFrame(data)
murshroom_col_nulls

Unnamed: 0,Colunas,Nulos
0,class,0
1,cap-shape,0
2,cap-surface,0
3,cap-color,0
4,bruises,0
5,odor,0
6,gill-attachment,0
7,gill-spacing,0
8,gill-size,0
9,gill-color,0


### Deleting duplicate rows from the dataset

In [None]:
# Drop duplicate rows, then renumber the index so it is contiguous again.
# The earlier bare `df.reset_index()` returned a new frame that was thrown away,
# so it never changed `df`. Rebinding `df` (rather than inplace=True) also keeps
# this cell safe to re-run.
df = df.drop_duplicates().reset_index(drop=True)

# Save the de-duplicated data locally as "preprocessed_data.csv"
df.to_csv("preprocessed_data.csv", index=False)
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


### Transforming categorical values into numeric values

In [None]:
# Inspect the dtype of every dataframe column before encoding
df.dtypes

class                       object
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                   object
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object

In [None]:
def Encoder(val):
  """Return the integer code for a categorical value.

  Codes are assigned in order of first appearance: the first distinct value
  seen gets 0, the next new one gets 1, and so on. A value that was seen
  before always gets the code it was given the first time.

  The mapping lives in the module-level dict `category`, which the caller
  must create before the first call (and reset to start a new encoding).

  val: the value to encode (must be hashable).
  Returns: the integer code associated with `val`.
  """
  # setdefault stores len(category) only when `val` is not yet a key;
  # the length is evaluated before the insertion, so new codes are 0, 1, 2, ...
  return category.setdefault(val, len(category))

# Encode each column independently. `category` is reset for every column, so the
# codes restart at 0 and follow the order of first appearance within that column.
for i in range(df.shape[1]):
    category = {}
    column_name = df.columns[i]
    # Assign by column label instead of `df.iloc[:, i] = ...`.
    # The positional form triggers a pandas warning and, in newer pandas versions,
    # writes into the existing object-dtype column, so the values would stay `object`.
    # Label assignment replaces the column, so it takes the integer dtype of the codes.
    df[column_name] = df.iloc[:, i].apply(Encoder)

df.head()

  df.iloc[:,i]=df.iloc[:,i].apply(Encoder)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
2,1,1,0,2,0,2,0,0,1,1,...,0,0,0,0,0,0,0,1,1,2
3,0,0,1,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,3,1,3,0,1,1,0,...,0,0,0,0,0,0,1,1,2,1


In [None]:
# Re-check the column dtypes after the encoding step
df.dtypes

### Send clean data to wandb

In [None]:
# Set up INFO-level logging; every message is prefixed with a day-month-year timestamp
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt='%d-%m-%Y %H:%M:%S',
)

# root logger, used for the progress message below
logger = logging.getLogger()

with tempfile.TemporaryDirectory() as tmp_dir:
    # write the processed dataframe to a CSV inside the temporary directory
    temp_path = os.path.join(tmp_dir, "clean_data_dataset.csv")
    df.to_csv(temp_path, index=False)

    # describe the dataset artifact and attach the CSV to it
    artifact = wandb.Artifact(
        name="clean_data_dataset",
        type="dataset",
        description="clean dataset for use of kmeans",
    )
    artifact.add_file(temp_path)

    logger.info("Logging artifact")
    run.log_artifact(artifact)

    # Wait for the artifact to be uploaded to W&B. Without this, the temporary
    # directory could be removed before W&B has had a chance to upload the
    # dataset, and the upload could fail.
    artifact.wait()

In [None]:
# Upload the artifact to Wandb
# NOTE(review): the previous cell already calls run.log_artifact(artifact) and then
# artifact.wait(); this second call looks redundant — confirm whether it can be removed.
run.log_artifact(artifact)

<Artifact QXJ0aWZhY3Q6NjQxNTc2OTk0>

In [None]:
# Finish the W&B run that was opened with wandb.init for this preprocessing job
run.finish()