# Generate Test Data for Evaluating our ML Models
---
__Note__
* We are currently looking at MLP models

In [1]:
# Select the torch device: CUDA when a GPU is visible to torch, otherwise the CPU.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU is not available, using CPU")

GPU is available


# Create sample dataset to test multiclass classifier

__Create a simple pattern that the ML can find in the data__

[000,40,__20,20__,49,97,16,86,__20__]

[998,09,15,__30,30__,33,98,43,__30__]

[003,93,92,62,__40,40__,24,10,__40__]

[004,04,28,54,12,__50,50__,47,__50__]


In [2]:
# Create a simple pattern that the ML can find in the data

import pandas as pd
import random

# Show at most 100 rows when a DataFrame is displayed
# (set "display.max_rows" to None to see all rows in the output)
pd.set_option("display.max_rows", 100)

# Fixed seed so the generated dataset is identical on every Restart & Run All.
# A private Random instance is used so the global `random` state is left untouched.
SEED = 42
N_SAMPLES = 1000
STAGE_COLUMNS = ['EE', 'LE', 'L1', 'L2', 'L3', 'L4', 'YA']
rng = random.Random(SEED)

# Add a pattern into the data that should be discoverable by the ML program.
# The pattern picks a random position, puts the same number into two adjacent
# columns and uses that number as the target.
# Example: position 2 -> L1 == L2 == 30 and category_1 == 30
records = []
for index in range(N_SAMPLES):
    # Seven distinct random values in 1..100, one per life-stage column
    values = rng.sample(range(1, 101), len(STAGE_COLUMNS))

    # Start of the adjacent pair: 0..5, so idx + 1 is always a valid column
    idx = rng.randrange(len(STAGE_COLUMNS) - 1)
    label = (idx + 1) * 10
    values[idx] = label
    values[idx + 1] = label

    # Using classical_life_stage format for testing the ML
    record = {'wormbase_id': index}
    record.update(zip(STAGE_COLUMNS, values))
    record['category_1'] = label
    records.append(record)

# One row per record; the default RangeIndex equals wormbase_id
test_data = pd.DataFrame(records)
test_data

Unnamed: 0,wormbase_id,EE,LE,L1,L2,L3,L4,YA,category_1
0,0,7,48,30,30,9,88,54,30
1,1,10,10,71,17,38,62,68,10
2,2,10,10,42,19,54,14,61,10
3,3,79,88,80,40,40,38,69,40
4,4,15,7,37,5,50,50,94,50
...,...,...,...,...,...,...,...,...,...
995,995,10,10,29,83,11,8,68,10
996,996,72,46,75,54,50,50,47,50
997,997,93,20,20,29,47,44,6,20
998,998,77,37,63,2,50,50,7,50


In [None]:
# Break the dataset into Train Test and save
import os
from sklearn.model_selection import train_test_split

out_dir ='./output_data/output_classical_life_stage'
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable
os.makedirs(out_dir, exist_ok=True)

target = test_data['category_1']
features = test_data.drop('category_1', axis=1)

x_train, x_test, y_train, y_test = train_test_split(features,
                                                    target,
                                                    shuffle=True,
                                                    test_size=0.20,
                                                    random_state=42)
# Add the Target back before saving so each CSV is self-contained
x_train = x_train.assign(category_1=y_train.values)
x_test = x_test.assign(category_1=y_test.values)


# Show the results of the split
print(f"Training set has {x_train.shape[0]:,d} samples.")
print(f"Testing set has {x_test.shape[0]:,d} samples.")
print(f"Total set has {x_test.shape[0] + x_train.shape[0]:,d} samples.")
# NOTE(review): this count subtracts only the target column, so it still includes wormbase_id
print(f"Total Features {x_test.shape[1]-1:,d}.")

# index=False: the row index is not written to the CSV files
x_train.to_csv(out_dir + '/test_classical_life-train.csv', index=False, header=True)
x_test.to_csv(out_dir + '/test_classical_life-test.csv', index=False, header=True)

In [None]:
# Check the distribution of the test data
# (the generated frame is `test_data`; no `test_classical_life` variable is defined in this notebook)
test_data['category_1'].value_counts()

# Execute against test dataset

In [None]:
# Load the Training methods from a file
from multiclass_classifier import *



In [None]:
# Define the model
class MultiClassClassifier(nn.Module):
    """MLP with one hidden ReLU layer that returns raw class scores (logits).

    Args:
        n_features: number of input columns (default 7).
        n_hidden: width of the hidden layer (default 50).
        n_classes: number of output classes (default 6). SET_DATA_SET: change
            this together with n_features when switching to another data set.
    """

    def __init__(self, n_features=7, n_hidden=50, n_classes=6):
        super().__init__()
        self.hidden_1 = nn.Linear(n_features, n_hidden)  # Input layer -> Hidden layer
        self.output = nn.Linear(n_hidden, n_classes)  # Hidden layer -> Output layer

    def forward(self, x):
        """Return one row of n_classes unnormalised scores per input row."""
        x = torch.relu(self.hidden_1(x))
        # No softmax here: the notebook trains with nn.CrossEntropyLoss, which
        # expects unnormalised logits.
        return self.output(x)

# Define a method to get the data to train with
def get_life_stages_data(base_dir=BASE_DIR, data_type='train', one_hot_encoder=None):
    """Load one split of the synthetic life-stage CSV as tensors.

    Reads ./output_data/{base_dir}/test_classical_life-{data_type}.csv, drops the
    'wormbase_id' and 'category_1' columns to form the features, and one-hot
    encodes 'category_1' as the target.

    Args:
        base_dir: sub-directory of ./output_data that holds the CSV files.
        data_type: split name used in the file name ('train' or 'test').
        one_hot_encoder: optional already-fitted OneHotEncoder. When given it is
            reused, so another split is encoded with the same category order;
            when None (default) a new encoder is fitted on this split's labels.

    Returns:
        (x, y, one_hot_encoder): float32 feature tensor, one-hot target tensor
        (dtype as produced by the encoder), and the encoder that was used.
    """
    file_name_prefix='test_classical_life'
    data_set = pd.read_csv(f"./output_data/{base_dir}/{file_name_prefix}-{data_type}.csv")
    print(f"{file_name_prefix}-{data_type}.shape {data_set.shape}")

    target_label = 'category_1'
    # The id column carries no signal and must not reach the model
    features = data_set.drop(columns=[target_label, 'wormbase_id'])
    # The encoder needs a 2-D input, hence a one-column frame
    labels = data_set[[target_label]]

    if one_hot_encoder is None:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(labels)
    encoded = one_hot_encoder.transform(labels)

    # Convert data to PyTorch tensors
    x = torch.from_numpy(features.values).float()
    y = torch.from_numpy(encoded)

    return x, y, one_hot_encoder


In [None]:
# One pass Train and Validate
x_train, y_train, one_hot_encoder = get_life_stages_data(BASE_DIR)
x_test, y_test, _ = get_life_stages_data(BASE_DIR, data_type='test')

model = MultiClassClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_loader, validation_loader = get_dataloaders(
                    x_train, y_train, x_test, y_test, 
                    train_shuffle = True, validation_shuffle = True)


# Sanity check: print the first sample and the target width of every batch.
# Batch-local names are used so the loops do not rebind x_train / y_train
# (the full tensors) to the last mini-batch; the printed text is unchanged.
for x_batch, y_batch in validation_loader:
    print(f'x_validation[0]={x_batch[0]!r} len(y_validation[0])={len(y_batch[0])!r}')

for x_batch, y_batch in train_loader:
    print(f'x_train[0]={x_batch[0]!r} len(y_train[0])={len(y_batch[0])!r}')

ret_val = train_validate(model, criterion, optimizer, train_loader, validation_loader, n_epochs=300)
train_losses, train_accuracies, validation_losses, validation_accuracies  = ret_val

In [None]:
%matplotlib inline
# Plot the training curve: training vs. validation losses returned by train_validate
plot_learning_curve2(train_losses, validation_losses)

In [None]:
%matplotlib inline
# Run cross-validation once with all the data

x_train, y_train, one_hot_encoder = get_life_stages_data(BASE_DIR)
# NOTE(review): x_test / y_test are loaded here but not passed to cross_validation_training below
x_test, y_test, _ = get_life_stages_data(BASE_DIR, data_type='test')

model = MultiClassClassifier()
criterion = nn.CrossEntropyLoss()
# Higher learning rate (0.005) and fewer epochs (100) than the one-pass cell (0.001 / 300)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)


ret_val = cross_validation_training(x_train, y_train, model, criterion, optimizer, k=5,shuffle=False, n_epochs=100)
train_losses, train_accuracies, validation_losses, validation_accuracies = ret_val


plot_learning_curve2(train_losses, validation_losses)

In [None]:
%matplotlib inline
# Run cross-valivation once with with different amounts of data

x_train, y_train, one_hot_encoder = get_life_stages_data(BASE_DIR)
x_test, y_test, one_hot_encoder = get_life_stages_data(BASE_DIR, data_type='test')

model = MultiClassClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_losses_lst =[]
train_accuracies_lst =[]
validation_losses_lst =[]
validation_accuracies_lst =[]
for percentage in [20,40,60,80,100]:
    print(f"Training with {percentage}% of data")
    x_data, y_data = get_percent_of_data(x_train, y_train, percentage)
    ret_val = cross_validation_training(x_data, y_data, model, criterion, optimizer, k=5,shuffle=False, n_epochs=300)
    train_losses, train_accuracies, validation_losses, validation_accuracies = ret_val
    train_losses_lst.append(train_losses)
    train_accuracies_lst.append(train_accuracies)
    validation_losses_lst.append(validation_losses)
    validation_accuracies_lst.append(validation_accuracies)



In [None]:
# Plot the learning curve: mean score (with a +/- one standard deviation band) per training-set size

def plot_learning_curve(train_sizes, train_scores_lst, test_scores_lst, y_title="Score",y_as_percentage=True):
    """Plot mean +/- one standard deviation of train and cross-validation scores.

    Args:
        train_sizes: x positions, one per row of the score lists.
        train_scores_lst: 2-D array-like; row i holds the training scores that
            belong to train_sizes[i] (mean and std are taken along axis 1).
        test_scores_lst: same layout for the cross-validation scores.
        y_title: label of the y axis.
        y_as_percentage: when True the scores are multiplied by 100 and the
            upper y limit is set to 110.

    Returns:
        (plt, train_scores_mean, test_scores_mean): the pyplot module used for
        drawing and the per-size mean of each score array.
    """
    plt.figure()
    plt.title("Learning Curve")
    plt.xlabel("% of training examples")
    plt.ylabel(y_title)

    train_scores = np.array(train_scores_lst)
    test_scores = np.array(test_scores_lst)

    if y_as_percentage:
        # Head-room above 100 so the std band around a perfect score stays visible
        plt.ylim(top=110)
        train_scores = train_scores * 100
        test_scores = test_scores * 100

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()

    # Shaded bands: one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, '^--', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    return plt, train_scores_mean, test_scores_mean
        
# Dump the raw per-size validation accuracies, then plot accuracy against the
# percentage of training data used (same percentages as the training loop).
print(validation_accuracies_lst)
ret_val = plot_learning_curve([20,40,60,80,100], train_accuracies_lst, validation_accuracies_lst, y_title="Score (Accuracy)")
# The first returned item is the pyplot module itself, so `plt` is rebound to it here
plt, train_scores_mean, test_scores_mean = ret_val
print(f"{train_scores_mean=}")
print(f"{test_scores_mean=}")
plt.show()

In [None]:
# Build the results table for the training split.
# NOTE(review): create_results_df is not defined in this notebook (presumably it comes from
# the multiclass_classifier star import); `results` is reassigned by the next cell.
results = create_results_df(model, x_train,y_train, one_hot_encoder)

In [None]:
# Same call for the split loaded with data_type='test' (x_test / y_test).
results = create_results_df(model, x_test,y_test, one_hot_encoder)

In [None]:
# Inspect the raw model outputs for the test split as a NumPy array.
# Computed directly from the model so this cell does not need a `y_pred`
# variable that only exists after a later cell has run.
model(x_test).detach().numpy()

In [None]:
from sklearn.metrics import f1_score
# The forward pass is done once, further down in this cell after get_one_hot is
# defined; the extra model(x_test) call that used to sit here was overwritten unused.

def get_one_hot(arr):
    """Return an array shaped like ``arr`` holding 1 at its arg-max position and 0 elsewhere.

    The result keeps the dtype of ``arr``; on ties the first maximum wins
    (the position comes from ``np.argmax``).
    """
    winner = np.argmax(arr)
    encoded = np.zeros_like(arr)
    encoded[winner] = 1
    return encoded

# Turn every row of model outputs into a one-hot prediction so it has the same
# layout as the one-hot encoded y_test.
y_pred = np.apply_along_axis(get_one_hot, axis=1, arr=model(x_test).detach().numpy())

# Weighted average: per-class F1 weighted by the number of true samples of each class
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 score: {f1:.2f}")


# Iris data test code

In [None]:
# Get the data for this project
!wget -O ./output_data/iris-train.csv https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
# Original data has no header
!echo "pl,pw,sl,sw,flower" | cat - ./output_data/iris-train.csv > temp && mv temp ./output_data/iris-train.csv

In [None]:
from time import time
import platform
import math
import os 

import numpy as np
import pandas as pd

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

from sklearn.model_selection import learning_curve
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import torch
import torch.nn as nn
import torch.utils.data as utils_data
from sklearn.model_selection import KFold

from sklearn.exceptions import ConvergenceWarning
from warnings import simplefilter
# Silence scikit-learn ConvergenceWarning in this kernel
simplefilter("ignore", category=ConvergenceWarning)
# NOTE(review): PYTHONWARNINGS is read at interpreter start-up, so this presumably
# targets child processes rather than the running kernel - confirm it is still needed
os.environ["PYTHONWARNINGS"] = "ignore"

# Directory that holds the iris CSV; default argument of get_iris_data
base_dir='./output_data'

In [None]:
from multiclass_classifier import *

In [None]:
class MultiClassClassifier(nn.Module):
    """Iris MLP with one hidden ReLU layer that returns raw class scores (logits).

    Args:
        n_features: number of input columns (default 4).
        n_hidden: width of the hidden layer (default 8).
        n_classes: number of output classes (default 3).
    """

    def __init__(self, n_features=4, n_hidden=8, n_classes=3):
        super().__init__()
        self.hidden_1 = nn.Linear(n_features, n_hidden)  # Input layer -> Hidden layer
        self.output = nn.Linear(n_hidden, n_classes)  # Hidden layer -> Output layer

    def forward(self, x):
        """Return one row of n_classes unnormalised scores per input row."""
        x = torch.relu(self.hidden_1(x))
        # Return logits, not softmax probabilities: the model is trained with
        # nn.CrossEntropyLoss, which expects unnormalised logits, so a softmax
        # here would be applied twice. argmax over the outputs is unaffected.
        return self.output(x)


def get_iris_data(base_dir=base_dir, data_type='train'):
    """Load {base_dir}/iris-{data_type}.csv and return (features, one-hot targets) as tensors.

    Every column except 'flower' becomes a float32 feature; 'flower' is one-hot
    encoded with an encoder fitted on the loaded file.
    """
    frame = pd.read_csv(f"{base_dir}/iris-{data_type}.csv")
    print(f"iris-{data_type}.shape {frame.shape}")

    label_column = 'flower'
    feature_frame = frame.drop(label_column, axis=1)
    # The encoder needs a 2-D input, hence the one-column frame
    label_frame = frame[label_column].to_frame()

    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(label_frame)
    encoded_labels = encoder.transform(label_frame)

    # Convert data to PyTorch tensors
    features = torch.from_numpy(feature_frame.values).float()
    targets = torch.from_numpy(encoded_labels)
    return features, targets

x_train, y_train = get_iris_data()

In [None]:
x_train, y_train = get_iris_data()
# NOTE(review): the "test" tensors come from the same iris-train.csv as the training
# tensors, so the validation metrics below are measured on training data - confirm intent.
x_test, y_test = get_iris_data()
model = MultiClassClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

train_loader, validation_loader = get_dataloaders(
                    x_train, y_train, x_test, y_test, 
                    train_shuffle = False, validation_shuffle = False)

# Sanity check: print the first sample and the target width of every batch.
# Batch-local names are used so the loops do not rebind x_train / y_train
# (the full tensors) to the last mini-batch; the printed text is unchanged.
for x_batch, y_batch in validation_loader:
    print(f'x_validation[0]={x_batch[0]!r} len(y_validation[0])={len(y_batch[0])!r}')

for x_batch, y_batch in train_loader:
    print(f'x_train[0]={x_batch[0]!r} len(y_train[0])={len(y_batch[0])!r}')


# The life-stage section of this notebook unpacks four series from train_validate
# (train losses, train accuracies, validation losses, validation accuracies);
# unpack the same four here instead of three.
ret_val = train_validate(model, criterion, optimizer, train_loader, validation_loader, n_epochs=10)
train_losses, train_accuracies, validation_losses, validation_accuracies = ret_val

In [None]:
%matplotlib inline

x_train, y_train = get_iris_data()
x_test, y_test = get_iris_data()
model = MultiClassClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

ret_val = cross_validation_training(x_train, y_train, model, criterion, optimizer, k=5, shuffle=True, n_epochs=100)
train_losses, validation_losses, validation_accuracies = ret_val


plot_learning_curve2(train_losses, validation_losses)

In [None]:
# Score the model one sample at a time and print predicted vs. true class index.
total_correct=0
# Visit all len(x_test) samples so the numerator covers the same rows as the
# denominator used for the score below (the last row must not be skipped).
for i in range(len(x_test)):
    x_batch = x_test[i:i+1]  # slice keeps the batch dimension the model expects
    predicted = model(x_batch)
    p_a = torch.argmax(predicted, 1)[0]
    t_a = torch.argmax(y_test[i:i+1],1)[0]
    print(f'{i:<3} {p_a} {t_a} {p_a==t_a}')
    if p_a==t_a:
        total_correct +=1

print(f'Score {total_correct/len(x_test)}')