In [None]:
# installation of Weights & Biases
!pip install wandb

In [None]:
# Import of libraries
import pandas as pd
import numpy as np
%matplotlib inline

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
import wandb

import logging

from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import pickle

In [None]:
# Authenticate this session with Weights & Biases through the wandb CLI.
# The recorded cell output shows the command prompting for an API key
# (available at https://wandb.ai/authorize) and appending it to the netrc file.
# NOTE(review): --relogin presumably forces a fresh prompt even when a key is
# already stored - confirm against the wandb CLI documentation.
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Open the W&B run that the rest of the notebook logs to
run = wandb.init(entity="flamigos", project="Murshroom-Kmeans", job_type="train")

# Resolve the training artifact to a file that pandas can read
train_csv_path = run.use_artifact("flamigos/Murshroom-Kmeans/train.csv:v0").file()

# `artifact` is the training DataFrame used by every following cell
artifact = pd.read_csv(train_csv_path)
artifact

# 1 - identify if there are outliers in the dataset and remove them


## 1.1 identify outliers

In [None]:
# Outlier detection with Local Outlier Factor

# Only the numeric columns are passed to the detector: the int64 columns come
# first and the float64 columns are joined next to them (matched on the index).
int_features = artifact.select_dtypes("int64").copy()
float_features = artifact.select_dtypes("float64").copy()
numeric_features = int_features.join(float_features)

# Rows labelled -1 by fit_predict are treated as outliers below;
# `mask` is True for every row that should be kept.
lof = LocalOutlierFactor()
outlier = lof.fit_predict(numeric_features)
mask = outlier != -1

## 1.2 remove outliers

In [None]:
# Keep only the rows that the outlier mask marks as inliers.
# `mask` was computed from the unfiltered frame, so re-run the notebook from the
# loading cell before running this cell a second time.
artifact = artifact[mask].copy()
artifact

# 2 - fill data
The check below shows that this dataset has no missing values, so no imputation is needed.

In [None]:
# Names of all columns in the training frame
col = artifact.columns

# Number of null values found in each column, in column order
nulls = [artifact[name].isnull().sum() for name in col]

# Summary table: one row per column with its null count
murshroom_col_nulls = pd.DataFrame({'Colunas': col, 'Nulos': nulls})
murshroom_col_nulls

Unnamed: 0,Colunas,Nulos
0,class,0
1,cap-shape,0
2,cap-surface,0
3,cap-color,0
4,bruises,0
5,odor,0
6,gill-attachment,0
7,gill-spacing,0
8,gill-size,0
9,gill-color,0


# 3 - separate the "class" column that contains the results (poisonous, edible) from the rest of the dataset

In [None]:
# labels -> column of the results we want to obtain from training
labels = artifact["class"]

# Every column except the target. The target is excluded by name rather than by
# position, so the split stays correct even if "class" is not the first column.
columns = artifact.columns.drop("class")

# Feature matrix (NumPy array) used to train and evaluate the model
data = artifact[columns].values

# 4 - Train the model

In [None]:
# Kmeans model training
# random_state fixes the k-means++ initialisation so that Restart & Run All
# produces the same clusters (and the same cluster ids) on every run.
kmean = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=42)
kmean.fit(data)

## 4.1 save a pickle (.plk) file with our trained model

In [None]:
# Serialise the fitted KMeans model to disk with pickle
with open("murshrooms_kmeans.plk", "wb") as model_file:
  pickle.dump(kmean, model_file)

In [None]:
# Read the model back to verify that the pickle round trip works.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) has written.
with open("murshrooms_kmeans.plk", "rb") as f:
  loaded_kmean = pickle.load(f)

# The reloaded model must carry exactly the same centroids as the one in memory;
# previously the loaded object was discarded, so nothing was actually checked.
assert np.array_equal(loaded_kmean.cluster_centers_, kmean.cluster_centers_)

## 4.2 check the accuracy of the model

In [None]:
# Assign every training row to one of the two clusters
test = kmean.predict(data)

# check training accuracy
# K-means cluster ids are arbitrary: the same clustering can come out with ids
# 0/1 swapped, which would turn an accuracy of p into 1 - p. With two clusters
# and two classes, the score under the best cluster-to-class mapping is the
# larger of the two.
# NOTE(review): this assumes "class" is encoded as 0/1 - confirm against the
# preprocessing that produced train.csv.
raw_accuracy = accuracy_score(labels.values, test)
accuracy = max(raw_accuracy, 1 - raw_accuracy)
accuracy

0.8400987806760303

# 5 - upload model to wandb

In [None]:
# Root logger, used to record the upload step
logger = logging.getLogger()

# Describe the W&B artifact that will hold the pickled model
artfc = wandb.Artifact(
    name="murshrooms_kmeans.plk",
    type="modelo",
    description="modelo treinado",
)

# Attach the pickled model file to the artifact
artfc.add_file("murshrooms_kmeans.plk")

# Log the artifact to the run opened earlier in the notebook
logger.info("Logging artifact")
run.log_artifact(artfc)

# Wait until the artifact has been logged to W&B before moving on;
# without this the upload may still be in flight and can fail.
artfc.wait()

<Artifact QXJ0aWZhY3Q6NjQxNTgyNjk1>

In [None]:
# Close the wandb run that was opened with wandb.init() earlier in the notebook.
# The recorded cell output shows an upload progress widget completing here.
run.finish()

VBox(children=(Label(value='0.035 MB of 0.035 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

# 6 - displaying the result of the groupings

In [None]:
from sklearn.decomposition import PCA

# Project the feature matrix the model was clustered on down to two dimensions.
# `data` is used instead of `artifact` because `artifact` still contains the
# target column "class", which would leak the true labels into the projection.
pca = PCA(n_components=2).fit(data)
pca_2d = pca.transform(data)

# One scatter series per predicted cluster.
# NOTE(review): the legend assumes cluster 0 = edible and cluster 1 = poisonous;
# cluster ids are assigned by K-means, so confirm this mapping against `labels`.
fig, ax = plt.subplots()
ax.scatter(pca_2d[test == 0, 0], pca_2d[test == 0, 1], s=100, c='green', label='edible')
ax.scatter(pca_2d[test == 1, 0], pca_2d[test == 1, 1], s=100, c='black', label='poisonous')
ax.set_title('clustering result')
ax.set_xlabel('principal component 1')
ax.set_ylabel('principal component 2')
ax.legend();

