In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw CSV (it has no header row) and name column 0 'y', the target.
# The remaining columns keep their integer positions as names.
data = pd.read_csv('data_tp1.csv', header=None).rename(columns={0: 'y'})
data.head(2)

Unnamed: 0,y,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
from sklearn.model_selection import train_test_split

# Separate the frame into a feature matrix and a label vector (NumPy arrays).
X = data.drop(columns='y').values
y = data['y'].values

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [3]:
# Sanity-check the split: shapes of both training arrays and of one sample,
# then that sample's label and (as the cell's displayed value) the label's shape.
idx = 0
print(X_train.shape, y_train.shape, X_train[idx, :].shape, sep="\n")
print(y_train[idx])
y_train[idx].shape

(3350, 784)
(3350,)
(784,)
7


()

In [4]:
from torch.utils.data import DataLoader, Dataset
import torch

class custom_mnist_dataset(Dataset):
    """Map-style dataset that pairs each feature row of ``X`` with its label in ``y``.

    It subclasses ``torch.utils.data.Dataset`` so that it follows the interface
    ``DataLoader`` documents for map-style datasets.

    Args:
        X: 2-D array indexed as ``X[idx, :]`` (one sample per row).
        y: Sequence of integer class labels indexed as ``y[idx]``, aligned
            with the rows of ``X``.
    """
    def __init__(self, X,y):
        self.X = X
        self.y = y

    def __len__(self):
        # One sample per row of the feature matrix.
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx`` as tensors.

        The features are cast to float32 so they can be multiplied by the
        layer weights. The label is cast to int64 (long), the dtype the
        cross-entropy loss expects for class-index targets, regardless of the
        integer width of the array stored in ``self.y``.
        """
        features = torch.tensor(self.X[idx,:], dtype=torch.float)
        label = torch.as_tensor(self.y[idx], dtype=torch.long)
        return features, label

# Wrap each split so a DataLoader can index individual samples.
training_data = custom_mnist_dataset(X = X_train, y = y_train)
test_data = custom_mnist_dataset(X = X_test, y = y_test)

# Mini-batch size used for this first check (other sizes of interest: 10, 50).
batch_size = 25

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Peek at one batch to confirm shapes and dtypes. The loop variables are
# deliberately not named X / y: reusing those names would overwrite the full
# NumPy arrays created by the train/test split with a single batch of tensors.
for X_batch, y_batch in test_dataloader:
    print(f"Shape, type: X: {X_batch.shape} {X_batch.dtype}")
    print(f"Shape, type: y: {y_batch.shape} {y_batch.dtype}")
    break
    

Shape of X [N, C, H, W]: torch.Size([25, 784]) torch.float32
Shape of y: torch.Size([25]) torch.int64


In [5]:
# Pick the compute device in order of preference: CUDA GPU, then Apple MPS,
# then plain CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [6]:
from torch import nn
import torch.nn.functional as F

"""
NeuralNetwork with  ...
"""
class NeuralNetwork(nn.Module):
    """Fully connected classifier built from three linear layers.

    Data flow: ``entrada`` (input_size -> hidden_dim), then ``oculta``
    (hidden_dim -> hidden_dim) followed by a sigmoid, then ``saida``
    (hidden_dim -> output_size). The output of ``saida`` is returned as-is,
    with no softmax applied.

    NOTE(review): no activation is applied between ``entrada`` and ``oculta``,
    so those two layers compose into a single affine map. Confirm this is
    the intended architecture.
    """

    def __init__(self, input_size, hidden_dim, output_size):
        super().__init__()

        # Layers are created in data-flow order: input, hidden, output.
        self.entrada = nn.Linear(in_features=input_size, out_features=hidden_dim)
        self.oculta = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.saida = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, input):
        """Map a batch of inputs to one raw score per output unit."""
        hidden = F.sigmoid(self.oculta(self.entrada(input)))
        return self.saida(hidden)
    
# Print the layer summary for a throwaway model with 25 hidden units
# (hidden sizes of interest: 25, 50, 100).
print(
    NeuralNetwork(
        input_size=X_train.shape[1],
        hidden_dim=25,
        output_size=10,
    ).to(device)
)


NeuralNetwork(
  (entrada): Linear(in_features=784, out_features=25, bias=True)
  (oculta): Linear(in_features=25, out_features=25, bias=True)
  (saida): Linear(in_features=25, out_features=10, bias=True)
)


In [7]:
# https://www.kaggle.com/code/shrutimechlearn/pytorch-custom-model-step-by-step

def train_step(model, loss_func, optimizer, dataloader):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is moved to the module-level ``device`` before the forward pass.

    Args:
        model: Module to train; it is switched to training mode.
        loss_func: Callable ``loss_func(predictions, targets)`` whose result
            supports ``.item()`` and ``.backward()``.
        optimizer: Optimizer holding the parameters of ``model``.
        dataloader: Iterable of ``(x_batch, y_batch)`` tensor pairs.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()

    running_loss = 0
    for features, targets in dataloader:
        # Keep the batch on the same device as the rest of the artifacts.
        features = features.to(device)
        targets = targets.to(device)

        # Forward pass, then record the scalar loss for this batch.
        batch_loss = loss_func(model(features), targets)
        running_loss += batch_loss.item()

        # Clear stale gradients, back-propagate this batch, update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    # Average over the number of batches, not the number of samples.
    return running_loss / len(dataloader)

In [8]:
def test_step(model, loss_func, test_dataloader):
    """Evaluate ``model`` on ``test_dataloader`` and return the mean batch loss.

    The model is put in eval mode and the loop runs under
    ``torch.inference_mode()``, so no gradients are tracked. Each batch is
    moved to the module-level ``device`` first.

    Args:
        model: Module to evaluate.
        loss_func: Callable ``loss_func(predictions, targets)`` whose result
            supports ``.item()``.
        test_dataloader: Iterable of ``(x_batch, y_batch)`` tensor pairs.

    Returns:
        Sum of the per-batch loss values divided by ``len(test_dataloader)``.
    """
    model.eval()

    accumulated = 0
    with torch.inference_mode():
        for inputs, targets in test_dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            accumulated += loss_func(model(inputs), targets).item()

    # Average over the number of batches, not the number of samples.
    return accumulated / len(test_dataloader)
            

In [33]:
import optuna
import mlflow

# Send MLflow tracking data to the server at http://localhost:8080 and turn on
# autologging (this cell's output reports it being enabled for sklearn).
# NOTE(review): presumably a tracking server must already be listening on that
# port for the logging calls further down to succeed; confirm before running.
mlflow.set_tracking_uri('http://localhost:8080')
mlflow.autolog()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def eval_step_with_mlflowlogging(model,test_dataloader):
    """Compute classification metrics for ``model`` over ``test_dataloader``.

    Despite the name, nothing is logged here; the caller is responsible for
    sending the returned values to MLflow.

    Args:
        model: Module whose forward pass returns one score per class along dim 1.
        test_dataloader: Iterable of ``(inputs, labels)`` tensor batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. The last three are
        support-weighted averages over all classes seen in the labels or
        the predictions.
    """
    model.eval()
    total_preds = []
    total_labels = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            # Move the batch to the module-level ``device`` like train_step and
            # test_step do; feeding CPU tensors to a model that lives on
            # cuda/mps raises a device-mismatch error.
            outputs = model(inputs.to(device))
            # The predicted class is the index of the largest score.
            _, preds = torch.max(outputs, dim=1)
            total_preds.extend(preds.cpu().numpy())
            total_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(total_labels, total_preds)
    # Average over every class, not only the ones the model predicted.
    # Restricting ``labels`` to the predicted classes drops classes the model
    # never outputs and inflates the weighted scores. ``zero_division=0``
    # scores those classes as 0 without emitting a warning.
    precision = precision_score(total_labels, total_preds, average='weighted', zero_division=0)
    recall = recall_score(total_labels, total_preds, average='weighted', zero_division=0)
    f1 = f1_score(total_labels, total_preds, average='weighted', zero_division=0)

    return accuracy, precision, recall, f1

def train_with_mlflowlogging(model, loss_func, optimizer, train_dataloader, test_dataloader, n_epochs):
    """Train for ``n_epochs`` and log per-epoch metrics to the active MLflow run.

    Each epoch runs ``train_step``, then ``test_step`` and
    ``eval_step_with_mlflowlogging`` on the test loader. All six metrics are
    logged once per epoch with ``step=epoch`` so MLflow can plot them as curves.

    Args:
        model: Module to train.
        loss_func: Loss callable passed through to ``train_step`` / ``test_step``.
        optimizer: Optimizer passed through to ``train_step``.
        train_dataloader: Batches used for the optimisation pass.
        test_dataloader: Batches used for the validation loss and metrics.
        n_epochs: Number of epochs to run.

    Returns:
        Tuple ``(train_loss, test_loss)`` of lists with one value per epoch.
    """
    train_loss = []
    test_loss = []

    for epoch in range(n_epochs):

        tr_loss = train_step(model, loss_func, optimizer, train_dataloader)
        train_loss.append(tr_loss)

        ts_loss = test_step(model, loss_func, test_dataloader)
        test_loss.append(ts_loss)

        accuracy, precision, recall, f1 = eval_step_with_mlflowlogging(model, test_dataloader)

        # One call per epoch: every metric is logged exactly once and shares
        # the same step index.
        mlflow.log_metrics(
            {
                "loss": tr_loss,
                "val_loss": ts_loss,
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1": f1,
            },
            step=epoch,
        )

    return train_loss, test_loss

# Example of starting an MLflow run and integrating Optuna for hyperparameter tuning
# Run the Optuna study inside one MLflow run, so every trial logs to that run.
with mlflow.start_run():
    def objective(trial):
        """Train one network with the fixed settings below and return its final test loss.

        NOTE(review): ``trial`` is never used to suggest values, so every trial
        repeats the same configuration with a freshly initialised model, and
        ``study.best_params`` stays empty. Switch to ``trial.suggest_*`` calls
        to actually search.
        """
        # Candidate batch sizes: 10, 50, len(training_data) (full batch).
        batch_size = len(training_data)

        # Candidate hidden sizes: 25, 50, 100.
        hidden_dim =25

        # Planned variants: full-batch GD, SGD with batch size 1, mini-batch SGD with 10 and 50.
        # Candidate learning rates: 0.5, 1, 10.
        lr = 0.5

        # Log params
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("hidden_dim", hidden_dim)
        mlflow.log_param("lr", lr)

        # Build loaders for this configuration; both use the same batch size.
        train_dataloader = DataLoader(training_data, batch_size=batch_size)
        test_dataloader = DataLoader(test_data, batch_size=batch_size)

        model = NeuralNetwork(input_size = X_train.shape[1], hidden_dim= hidden_dim, output_size=10).to(device)

        loss_func = nn.CrossEntropyLoss()

        # Referenced through ``torch`` because no bare ``optim`` name is
        # imported anywhere in the notebook (NameError on a fresh kernel).
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)

        train_loss, test_loss = train_with_mlflowlogging(
            model,
            loss_func,
            optimizer,
            train_dataloader,
            test_dataloader,
            n_epochs= 20)

        # The study minimises the test loss of the last epoch.
        return  test_loss[-1]

    study = optuna.create_study(direction="minimize", study_name = "testing_mlflow_optuna_2", load_if_exists = True)
    study.optimize(objective, n_trials=10)

    # Log the best parameters and other relevant information
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_loss", study.best_value)

2024/06/04 11:50:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
[I 2024-06-04 11:51:01,786] A new study created in memory with name: testing_mlflow_optuna_2
[I 2024-06-04 11:51:08,089] Trial 0 finished with value: 1.6329529285430908 and parameters: {}. Best is trial 0 with value: 1.6329529285430908.
[I 2024-06-04 11:51:14,478] Trial 1 finished with value: 1.3337547779083252 and parameters: {}. Best is trial 1 with value: 1.3337547779083252.
[I 2024-06-04 11:51:21,765] Trial 2 finished with value: 1.3339941501617432 and parameters: {}. Best is trial 1 with value: 1.3337547779083252.
[I 2024-06-04 11:51:29,473] Trial 3 finished with value: 1.3230894804000854 and parameters: {}. Best is trial 3 with value: 1.3230894804000854.
[I 2024-06-04 11:51:36,761] Trial 4 finished with value: 1.3764618635177612 and parameters: {}. Best is trial 3 with value: 1.3230894804000854.
[I 2024-06-04 11:51:44,148] Trial 5 finished with value: 1.2828726768493652 and parameters: 

In [34]:
import optuna
import mlflow

# Send MLflow tracking data to the server at http://localhost:8080 and turn on
# autologging (this cell's output reports it being enabled for sklearn).
# NOTE(review): presumably a tracking server must already be listening on that
# port for the logging calls further down to succeed; confirm before running.
mlflow.set_tracking_uri('http://localhost:8080')
mlflow.autolog()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def eval_step_with_mlflowlogging(model,test_dataloader):
    """Compute classification metrics for ``model`` over ``test_dataloader``.

    Despite the name, nothing is logged here; the caller is responsible for
    sending the returned values to MLflow.

    Args:
        model: Module whose forward pass returns one score per class along dim 1.
        test_dataloader: Iterable of ``(inputs, labels)`` tensor batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. The last three are
        support-weighted averages over all classes seen in the labels or
        the predictions.
    """
    model.eval()
    total_preds = []
    total_labels = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            # Move the batch to the module-level ``device`` like train_step and
            # test_step do; feeding CPU tensors to a model that lives on
            # cuda/mps raises a device-mismatch error.
            outputs = model(inputs.to(device))
            # The predicted class is the index of the largest score.
            _, preds = torch.max(outputs, dim=1)
            total_preds.extend(preds.cpu().numpy())
            total_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(total_labels, total_preds)
    # Average over every class, not only the ones the model predicted.
    # Restricting ``labels`` to the predicted classes drops classes the model
    # never outputs and inflates the weighted scores. ``zero_division=0``
    # scores those classes as 0 without emitting a warning.
    precision = precision_score(total_labels, total_preds, average='weighted', zero_division=0)
    recall = recall_score(total_labels, total_preds, average='weighted', zero_division=0)
    f1 = f1_score(total_labels, total_preds, average='weighted', zero_division=0)

    return accuracy, precision, recall, f1

def train_with_mlflowlogging(model, loss_func, optimizer, train_dataloader, test_dataloader, n_epochs):
    """Train for ``n_epochs`` and log per-epoch metrics to the active MLflow run.

    Each epoch runs ``train_step``, then ``test_step`` and
    ``eval_step_with_mlflowlogging`` on the test loader. All six metrics are
    logged once per epoch with ``step=epoch`` so MLflow can plot them as curves.

    Args:
        model: Module to train.
        loss_func: Loss callable passed through to ``train_step`` / ``test_step``.
        optimizer: Optimizer passed through to ``train_step``.
        train_dataloader: Batches used for the optimisation pass.
        test_dataloader: Batches used for the validation loss and metrics.
        n_epochs: Number of epochs to run.

    Returns:
        Tuple ``(train_loss, test_loss)`` of lists with one value per epoch.
    """
    train_loss = []
    test_loss = []

    for epoch in range(n_epochs):

        tr_loss = train_step(model, loss_func, optimizer, train_dataloader)
        train_loss.append(tr_loss)

        ts_loss = test_step(model, loss_func, test_dataloader)
        test_loss.append(ts_loss)

        accuracy, precision, recall, f1 = eval_step_with_mlflowlogging(model, test_dataloader)

        # One call per epoch: every metric is logged exactly once and shares
        # the same step index.
        mlflow.log_metrics(
            {
                "loss": tr_loss,
                "val_loss": ts_loss,
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1": f1,
            },
            step=epoch,
        )

    return train_loss, test_loss

# Example of starting an MLflow run and integrating Optuna for hyperparameter tuning
# Run the Optuna study inside one MLflow run, so every trial logs to that run.
with mlflow.start_run():
    def objective(trial):
        """Train one network with the fixed settings below and return its final test loss.

        NOTE(review): ``trial`` is never used to suggest values, so every trial
        repeats the same configuration with a freshly initialised model, and
        ``study.best_params`` stays empty. Switch to ``trial.suggest_*`` calls
        to actually search.
        """
        # Candidate batch sizes: 10, 50, len(training_data) (full batch).
        batch_size = len(training_data)

        # Candidate hidden sizes: 25, 50, 100.
        hidden_dim =100

        # Planned variants: full-batch GD, SGD with batch size 1, mini-batch SGD with 10 and 50.
        # Candidate learning rates: 0.5, 1, 10.
        lr = 0.5

        # Log params
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("hidden_dim", hidden_dim)
        mlflow.log_param("lr", lr)

        # Build loaders for this configuration; both use the same batch size.
        train_dataloader = DataLoader(training_data, batch_size=batch_size)
        test_dataloader = DataLoader(test_data, batch_size=batch_size)

        model = NeuralNetwork(input_size = X_train.shape[1], hidden_dim= hidden_dim, output_size=10).to(device)

        loss_func = nn.CrossEntropyLoss()

        # Referenced through ``torch`` because no bare ``optim`` name is
        # imported anywhere in the notebook (NameError on a fresh kernel).
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)

        train_loss, test_loss = train_with_mlflowlogging(
            model,
            loss_func,
            optimizer,
            train_dataloader,
            test_dataloader,
            n_epochs= 20)

        # The study minimises the test loss of the last epoch.
        return  test_loss[-1]

    study = optuna.create_study(direction="minimize", study_name = "testing_mlflow_optuna_2", load_if_exists = True)
    study.optimize(objective, n_trials=10)

    # Log the best parameters and other relevant information
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_loss", study.best_value)

2024/06/04 11:54:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
[I 2024-06-04 11:54:00,290] A new study created in memory with name: testing_mlflow_optuna_2
[I 2024-06-04 11:54:06,695] Trial 0 finished with value: 0.8673474788665771 and parameters: {}. Best is trial 0 with value: 0.8673474788665771.
[I 2024-06-04 11:54:14,062] Trial 1 finished with value: 0.9595535397529602 and parameters: {}. Best is trial 0 with value: 0.8673474788665771.
[I 2024-06-04 11:54:21,963] Trial 2 finished with value: 0.875180721282959 and parameters: {}. Best is trial 0 with value: 0.8673474788665771.
[I 2024-06-04 11:54:29,926] Trial 3 finished with value: 0.7136643528938293 and parameters: {}. Best is trial 3 with value: 0.7136643528938293.
[I 2024-06-04 11:54:37,655] Trial 4 finished with value: 0.8426854610443115 and parameters: {}. Best is trial 3 with value: 0.7136643528938293.
[I 2024-06-04 11:54:45,584] Trial 5 finished with value: 0.7893187999725342 and parameters: {