# Pytorch
---------

### Author Information
**Author:** PJ Gibson  
**Email:** Peter.Gibson@doh.wa.gov  
**Github:**   https://github.com/DOH-PJG1303

### Project Information
**Created Date:** 2023-08-09  
**Last Updated:** 2023-08-09  
**Version:** 1  

### Description

In this notebook, we train a ML model for record linkage.
We'll use PyTorch.



### Notes

*\*If you are unfamiliar with the origins of this synthetic data, please see the [Synthetic-Gold](https://github.com/DOH-PJG1303/Synthetic-Gold) GitHub project. We ran the simulation for the state of Nebraska, so all data is relevant to that state.
To limit the size of the data stored publicly on GitHub, we only captured the relevant data in each table for the population living in the years 2019-2022.*


*\*\*Annotation improved with the help of chat-GPT*

## 1. Import Libraries

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import mlflow
import mlflow.pytorch
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random

In [None]:
# Set random seeds for code reproducibility.
# `random` drives the hyperparameter sampling, but PyTorch (weight initialisation,
# DataLoader shuffling) and NumPy keep their own generators, so all three are seeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## 2. Prep Data

### 2.1 Read Data

In [None]:
# Load the training table; the path is relative to this notebook's directory
training_data_path = '../../Data/Training/04. Training Data.parquet'
df = pd.read_parquet(training_data_path)

### 2.2 Test Train Split

In [None]:
# Separate the target column from the feature columns, then hold out 20% of the
# rows for testing. The fixed random_state keeps the split identical across runs.
features = df.drop('label', axis=1)
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

## 3. PyTorch

### 3.1 Conversion

PyTorch cannot train directly on pandas DataFrames.
The data must first be converted to tensors.

In [None]:
# Convert the pandas train/test splits into float32 PyTorch tensors
def _to_float32_tensor(pandas_obj):
    """Copy the values of a pandas DataFrame/Series into a float32 torch tensor."""
    return torch.tensor(pandas_obj.values.astype('float32'))


X_train_tensor = _to_float32_tensor(X_train)
y_train_tensor = _to_float32_tensor(y_train)
X_test_tensor = _to_float32_tensor(X_test)
y_test_tensor = _to_float32_tensor(y_test)

# Pair features with labels so each dataset item is a (features, label) tuple
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Wrap the datasets in batch iterators. Only the training data is shuffled;
# batch_size and shuffle can be adjusted as needed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

### 3.2 Define Param Grid Search Space


In [None]:
# Candidate values for each hyperparameter. The order of the three lists matters:
# consumers unpack param_grid as (hidden dimension, learning rate, number of epochs).
hidden_dim_options = [64, 128, 256]
learning_rate_options = [0.01, 0.001, 0.0001]
epoch_options = [10, 20, 30]

param_grid = [hidden_dim_options, learning_rate_options, epoch_options]

### 3.3 Model Wrapping/Defining

In [None]:
class PyTorchModelWrapper(mlflow.pyfunc.PythonModel):
    """Adapter exposing a PyTorch module through the ``mlflow.pyfunc.PythonModel`` interface.

    The wrapped module is called on a float32 tensor built from the input's
    ``.values`` and the first output column is returned as a NumPy array.
    """

    def __init__(self, model):
        # The PyTorch module that performs the forward pass
        self.model = model

    def predict(self, context, model_input):
        """Score ``model_input`` with the wrapped module.

        Args:
            context: Supplied by the pyfunc interface; not used here.
            model_input: Object exposing ``.values`` (e.g. a pandas DataFrame)
                holding numeric features.

        Returns:
            1-D NumPy array taken from column 0 of the module's output.
        """
        features = torch.tensor(model_input.values, dtype=torch.float32)
        # Inference only, so skip gradient tracking
        with torch.no_grad():
            scores = self.model(features)
        return scores.numpy()[:, 0]

# Multi-layer perceptron with a single hidden layer
class MLP(nn.Module):
    """Binary classifier: Linear -> ReLU -> Linear -> Sigmoid.

    The final Sigmoid means ``forward`` returns probabilities in (0, 1) with
    shape ``(batch, 1)``, not raw logits, so the output must not be passed
    through another sigmoid.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        ]
        # Attribute name is kept as `layers` so state_dict keys stay the same
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the predicted probability for each row of ``x``."""
        probabilities = self.layers(x)
        return probabilities

## 4. MLflow Config

### 4.1 Create Experiment

In [None]:
# Define experiment name
experiment_name = 'Record Linkage ML Model Training'

# Create the experiment only if it does not exist yet. An explicit lookup replaces
# the previous bare `except: pass`, which hid every failure (an unreachable
# tracking store, a KeyboardInterrupt), not only "experiment already exists".
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)

# Make this the active experiment for all subsequent runs
mlflow.set_experiment(experiment_name)

### 4.2 Define Run

In [None]:
# Run name shared by every MLflow run started in this notebook.
# "Iter1" = iteration 1; "12 linking fields" records how many raw data fields
# are required for linking:
# [fname, mname, lname, dob, sex, add, zip, city, county, state, phone, email]
run_name = 'Iter1, 12 linking fields'

### 4.3 Log Metrics Function

In [None]:
def log_metrics(y_test, pred_proba_test, threshold=0.5):
    """Compute test-set classification metrics and log them to the active MLflow run.

    AUC is computed from the raw probabilities. F1, accuracy, precision and
    recall are computed from hard predictions obtained by thresholding.

    Args:
        y_test: True binary labels (array-like).
        pred_proba_test: Predicted positive-class probabilities, aligned with
            ``y_test``. Any array-like is accepted.
        threshold: A probability strictly greater than this value is classified
            as positive. Defaults to 0.5.
    """
    # Accept plain lists as well as arrays; the comparison below needs an ndarray
    pred_proba_test = np.asarray(pred_proba_test)
    predict_binary_test = (pred_proba_test > threshold).astype(int)

    metrics = {
        'auc': roc_auc_score(y_test, pred_proba_test),
        'f1': f1_score(y_test, predict_binary_test),
        'accuracy': accuracy_score(y_test, predict_binary_test),
        'precision': precision_score(y_test, predict_binary_test),
        'recall': recall_score(y_test, predict_binary_test),
    }
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)

## 5. Experiment

In [None]:
# Function to perform runs

def perform_run(run_name, param_grid, n_runs, train_loader, test_loader):
    """Train and evaluate ``n_runs`` randomly configured MLPs, logging each to MLflow.

    Each run samples one value per hyperparameter from ``param_grid``, trains
    an ``MLP`` on ``train_loader``, scores ``test_loader``, logs the metrics via
    ``log_metrics`` and stores the trained model as an MLflow artifact.

    Args:
        run_name: Name given to every MLflow run started here.
        param_grid: Three sequences of candidate values, in the order
            (hidden dimension, learning rate, number of epochs).
        n_runs: Number of runs. Values are sampled independently with
            ``random.choice``, so the same combination can be drawn twice.
        train_loader: DataLoader over a TensorDataset of (features, labels)
            used for training.
        test_loader: DataLoader over a TensorDataset of (features, labels)
            used for evaluation. It must not shuffle, because predictions are
            compared against the dataset's labels in their stored order.
    """
    input_size = train_loader.dataset.tensors[0].size(1)
    mlflow.pytorch.autolog()

    for _run_index in range(n_runs):

        hidden_size, lr, epochs = [random.choice(param) for param in param_grid]

        with mlflow.start_run(run_name=run_name):

            # Log the parameter
            mlflow.log_param('Model Type', 'Pytorch-MLPClassifier')

            # Autologging does not capture these for this hand-written training
            # loop, so log them explicitly.
            mlflow.log_param('learning_rate', lr)
            mlflow.log_param('hidden_layer_sizes', hidden_size)
            mlflow.log_param('num_epochs', epochs)

            model = MLP(input_size, hidden_size)
            # MLP already ends in a Sigmoid, so its outputs are probabilities.
            # BCELoss consumes probabilities; BCEWithLogitsLoss would apply a
            # second sigmoid on top of them.
            criterion = nn.BCELoss()
            optimizer = optim.Adam(model.parameters(), lr=lr)

            # Training
            model.train()
            for _epoch in range(epochs):
                for batch_X, batch_y in train_loader:
                    optimizer.zero_grad()
                    outputs = model(batch_X)
                    # squeeze(1) drops only the output column, so a final batch
                    # of size 1 keeps shape (1,) and still matches batch_y.
                    loss = criterion(outputs.squeeze(1), batch_y)
                    loss.backward()
                    optimizer.step()

            # Testing and logging metrics
            model.eval()
            batch_probabilities = []
            with torch.no_grad():
                for batch_X, _ in test_loader:
                    # Outputs are already probabilities. Applying sigmoid again
                    # would squash them into (0.5, 0.73) and make every
                    # prediction positive at a 0.5 threshold.
                    batch_probabilities.append(model(batch_X).squeeze(1).numpy())
            pred_proba_test = np.concatenate(batch_probabilities)

            log_metrics(test_loader.dataset.tensors[1].numpy(), pred_proba_test)

            # Logging model to MLflow
            mlflow.pytorch.log_model(model, "model")

In [None]:
# Define how many runs we want.
# param_grid offers 3 x 3 x 3 = 27 distinct combinations. perform_run draws each
# hyperparameter independently with random.choice (sampling with replacement), so
# the same combination can repeat even when n_runs is below 27. A larger n_runs
# only makes repeats more likely.
n_runs = 20

# Perform the runs!
perform_run(run_name, param_grid, n_runs, train_loader, test_loader)

## 6. Inspect Output

Open a CMD terminal.  Navigate to this directory. If you're starting in the root repo folder, this will look like:

```cmd
cd "Scripts\Train Model"
```

Then launch the MLflow UI by running:

```cmd
mlflow ui
```

There you can interact with models, compare different models, and select one to register.

In [None]:
# Single stand-alone run, kept at notebook scope so `model` stays available for
# inspection in later cells. The trained model is intentionally not logged here.
input_size = train_loader.dataset.tensors[0].size(1)
mlflow.pytorch.autolog()

# Draw one value per hyperparameter: (hidden dimension, learning rate, epochs)
hidden_size, lr, epochs = [random.choice(param) for param in param_grid]

with mlflow.start_run(run_name=run_name):

    # Log the parameter
    mlflow.log_param('Model Type', 'Pytorch-MLPClassifier')

    model = MLP(input_size, hidden_size)
    # MLP already ends in a Sigmoid, so its outputs are probabilities.
    # BCELoss consumes probabilities; BCEWithLogitsLoss would apply a second
    # sigmoid on top of them.
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training
    model.train()
    for epoch in range(epochs):
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            # squeeze(1) drops only the output column, so a final batch of
            # size 1 keeps shape (1,) and still matches batch_y.
            loss = criterion(outputs.squeeze(1), batch_y)
            loss.backward()
            optimizer.step()

    # Testing and logging metrics
    model.eval()
    batch_probabilities = []
    with torch.no_grad():
        for batch_X, _ in test_loader:
            # Outputs are already probabilities. Applying sigmoid again would
            # squash them into (0.5, 0.73) and make every prediction positive
            # at a 0.5 threshold.
            batch_probabilities.append(model(batch_X).squeeze(1).numpy())
    pred_proba_test = np.concatenate(batch_probabilities)

    log_metrics(test_loader.dataset.tensors[1].numpy(), pred_proba_test)

    # # Logging model to MLflow
    # mlflow.pytorch.log_model(model, "model")

In [None]:
# Displays the bound method `model.parameters` without calling it. Its repr embeds
# the module's own repr, so the notebook shows the layer structure. To get the
# weight tensors themselves, call `model.parameters()` (e.g. `list(model.parameters())`).
model.parameters