# Training and Inference

In [None]:
import os
import shutil
import wget
import zipfile
import pandas as pd
import datetime
import numpy as np
from glob import glob

import sys
sys.path.append("..")
from algorithms.Networks_pytorch import *
from algorithms.Dataset_manipulation import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

In [None]:
# TODO: still a lot of work to do here

## Config

Make sure to change these configs before running the whole notebook.

In [None]:
# --- Dataset selection ---
# Alternative drive model (kept for reference):
#model = 'ST4000DM000'
# Drive model whose records are imported. This is the one that has been tested.
model = 'ST3000DM001'
# Alternative, shorter range of years (kept for reference):
#years = ['2016', '2017', '2018']
# Years of data handed to import_data.
years = ['2014', '2015', '2016', '2017', '2018']

# --- Pre-processing, balancing, network and feature parameters ---
# Forwarded to dataset_partitioning as `windowing`; when it is 1 and the
# classifier is 'RandomForest', the windowed input is flattened to 2-D before fitting.
windowing = 1
# Forwarded to filter_HDs_out as `min_days`.
min_days_HDD = 115
# Forwarded to generate_failure_predictions as `days`.
# TODO: Can be adjusted by dynamic parameters
days_considered_as_failure = 7
# Forwarded to dataset_partitioning as `test_train_perc`.
test_train_perc = 0.3
# Type of oversampling/undersampling, forwarded to dataset_partitioning.
# NOTE(review): the meaning of each numeric code is defined by dataset_partitioning — confirm there.
oversample_undersample = 2
# Balancing factor (major/minor = balancing_normal_failed), forwarded to
# dataset_partitioning as `resampler_balancing`.
# TODO: We can calculate the imbalance ratio of the dataset and use this ratio to adjust the balancing factor.
balancing_normal_failed = 20
# Window length: forwarded as `window` to generate_failure_predictions and as
# `window_dim` to dataset_partitioning; also part of the cached dataset file name.
history_signal = 32
# Type of classifier: 'RandomForest', 'TCN' or 'LSTM'.
classifier = 'LSTM'
# Whether to run feature_extraction on the train/test sets (e.g. for RF). Not tested.
features_extraction_method = False
# Value written to the CUDA_VISIBLE_DEVICES environment variable (TCN branch).
CUDA_DEV = "0"
# Automatic feature selection: any value other than the string 'None' runs
# feature_selection(df, num_features); 'None' imports the explicit `features` lists instead.
ranking = 'Ok'
# Number of features handed to feature_selection.
num_features = 18
# Forwarded to dataset_partitioning as `overlap`; also part of the cached dataset file name.
overlap = 1

# Training and Inference

In [None]:
def randomForestClassification(X_train, Y_train, X_test, Y_test, metric, **args):
    """
    Fit a random forest on the shuffled training split and report test metrics.

    Parameters:
    - X_train (array-like): Training data features (indexed as a 2-D array).
    - Y_train (array-like): Training data labels.
    - X_test (array-like): Test data features.
    - Y_test (array-like): Test data labels.
    - metric: Forwarded unchanged to report_metrics.
    - **args: Accepted for signature parity with the other classifiers; unused.

    Returns:
    - None
    """
    # Rows and labels are shuffled together. NOTE: shuffle is called without a
    # random_state, so the row order differs between runs even though the
    # forest itself is seeded.
    shuffled_X, shuffled_Y = shuffle(X_train, Y_train)

    forest = RandomForestClassifier(n_estimators=30, min_samples_split=10, random_state=3)
    forest.fit(shuffled_X[:, :], shuffled_Y)

    predicted = forest.predict(X_test)
    report_metrics(Y_test, predicted, metric)

In [None]:
def TCNClassification(X_train, Y_train, X_test, Y_test, metric, **args):
    """
    Train and validate the TCN network by delegating to net_train_validate_tcn.

    Parameters:
    - X_train, Y_train: Training features and labels, forwarded unchanged.
    - X_test, Y_test: Test features and labels, forwarded unchanged.
    - metric: Accepted for signature parity with the other classifiers; unused.
    - **args: Must contain 'net', 'optimizer', 'epochs', 'batch_size' and 'lr'.

    Returns:
    - None
    """
    # Pull the required training options out of **args (KeyError if one is missing).
    network = args['net']
    optimiser = args['optimizer']
    n_epochs = args['epochs']
    batch = args['batch_size']
    learning_rate = args['lr']

    net_train_validate_tcn(
        network,
        optimiser,
        X_train,
        Y_train,
        X_test,
        Y_test,
        n_epochs,
        batch,
        learning_rate,
    )

In [None]:
def LSTMClassification(X_train, Y_train, X_test, Y_test, metric, **args):
    """
    Train and validate the LSTM network on the given train/test split.

    Parameters:
    - X_train (array-like): Training data features; must expose `.shape`.
    - Y_train (array-like): Training data labels, handed to FPLSTMDataset as-is.
    - X_test (array-like): Test data features; must expose `.shape`.
    - Y_test: Test data labels; must expose `.values` (e.g. a pandas Series).
    - metric: Accepted for signature parity with the other classifiers; unused.
    - **args: Must contain 'net', 'optimizer', 'epochs', 'batch_size' and 'lr'.

    Returns:
    - None

    Raises:
    - KeyError: if a required entry is missing from **args.
    """
    batch_size = args['batch_size']

    train_dataset = FPLSTMDataset(X_train, Y_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=FPLSTM_collate
    )

    test_dataset = FPLSTMDataset(X_test, Y_test.values)
    # NOTE(review): the test loader is shuffled too; evaluation does not
    # normally need shuffling — confirm this is intended.
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True, collate_fn=FPLSTM_collate
    )

    # Sample counts come from the function's own arguments, not from the
    # notebook-level `Xtrain` global, so the function works for any split.
    net_train_validate_LSTM(
        args['net'],
        args['optimizer'],
        train_loader,
        test_loader,
        args['epochs'],
        X_test.shape[0],
        X_train.shape[0],
        args['lr'],
    )

In [None]:
def classification(X_train, Y_train, X_test, Y_test, classifier, metric, **args):
    """
    Perform classification using the specified classifier.

    Parameters:
    - X_train (array-like): Training data features.
    - Y_train (array-like): Training data labels.
    - X_test (array-like): Test data features.
    - Y_test (array-like): Test data labels.
    - classifier (str): The classifier to use. Options: 'RandomForest', 'TCN', 'LSTM'.
    - metric (str): The metric to evaluate the classification performance.
    - **args: Additional arguments specific to each classifier.

    Returns:
    - None

    Raises:
    - ValueError: if `classifier` is not one of the supported options.
    """
    # Fail loudly on a misspelled/unsupported name instead of silently doing nothing.
    if classifier not in ('RandomForest', 'TCN', 'LSTM'):
        raise ValueError(
            "Unknown classifier {!r}; expected 'RandomForest', 'TCN' or 'LSTM'".format(classifier)
        )

    print('Classification using {} is starting'.format(classifier))
    if classifier == 'RandomForest':
        randomForestClassification(X_train, Y_train, X_test, Y_test, metric, **args)
    elif classifier == 'TCN':
        TCNClassification(X_train, Y_train, X_test, Y_test, metric, **args)
    else:  # 'LSTM'
        LSTMClassification(X_train, Y_train, X_test, Y_test, metric, **args)



In [None]:
# Columns that identify a record; both feature sets start with them.
_METADATA_COLUMNS = ['date', 'serial_number', 'model', 'failure']

# Named feature sets (column lists). The set is chosen by name when the data
# is imported with an explicit `features` mapping.
features = {
    'total_features': _METADATA_COLUMNS + [
        'smart_1_normalized',
        'smart_5_normalized', 'smart_5_raw',
        'smart_7_normalized',
        'smart_9_raw',
        'smart_12_raw',
        'smart_183_raw',
        'smart_184_normalized', 'smart_184_raw',
        'smart_187_normalized', 'smart_187_raw',
        'smart_189_normalized',
        'smart_193_normalized', 'smart_193_raw',
        'smart_197_normalized', 'smart_197_raw',
        'smart_198_normalized', 'smart_198_raw',
        'smart_199_raw',
    ],
    'iSTEP': _METADATA_COLUMNS + [
        'smart_5_raw',
        'smart_3_raw',
        'smart_10_raw',
        'smart_12_raw',
        'smart_4_raw',
        'smart_194_raw',
        'smart_1_raw',
        'smart_9_raw',
        'smart_192_raw',
        'smart_193_raw',
        'smart_197_raw',
        'smart_198_raw',
        'smart_199_raw',
    ],
}
# The hyperparameters used below (history_signal, ranking, num_features,
# overlap, min_days_HDD, days_considered_as_failure, ...) are the ones set in
# the Config cell. Do not re-assign them in this cell: a second copy silently
# overrides any edit made in the Config cell.

# Load the cached dataset for this configuration if it can be read; otherwise
# rebuild it from the raw data.
cached_dataset_path = os.path.join(
    '..',
    'temp',
    f'{model}_Dataset_windowed_{history_signal}_rank_{ranking}_{num_features}_overlap_{overlap}.pkl',
)
try:
    df = pd.read_pickle(cached_dataset_path)
except Exception as err:
    # Best-effort cache: any read failure (typically a missing file) falls back
    # to a rebuild, but Ctrl-C / SystemExit are no longer swallowed.
    print(f'Could not load cached dataset {cached_dataset_path} ({err!r}); rebuilding it')
    if ranking == 'None':
        # No automatic ranking: import with the explicit feature lists.
        df = import_data(years=years, model=model, name='iSTEP', features=features)
    else:
        df = import_data(years=years, model=model, name='iSTEP')
    print(df.head())
    # Percentage of non-null values in each column.
    for column in df.columns:
        filled_pct = round(df[column].notna().sum() / df.shape[0] * 100, 2)
        print('{:.<27}{}%'.format(column, filled_pct))
    # drop bad HDs
    bad_missing_hds, bad_power_hds, df = filter_HDs_out(df, min_days=min_days_HDD, time_window='30D', tolerance=30)
    # predict_val represents the prediction value of the failure
    # validate_val represents the validation value of the failure
    # define RUL (remaining useful life) piecewise
    df['predict_val'], df['validate_val'] = generate_failure_predictions(
        df, days=days_considered_as_failure, window=history_signal
    )
    if ranking != 'None':
        df = feature_selection(df, num_features)
    print('Used features')
    for column in df.columns:
        print('{:.<27}'.format(column))

## -------- ##
# Partitioning techniques (the `technique` argument of dataset_partitioning):
# random: stratified without keeping time
# hdd --> separate different hdd (need FIXes)
# temporal --> separate by time (need FIXes)

# Keyword options for the train/test split, all taken from the configuration
# values except the partitioning technique, which is fixed to 'random'.
partition_options = {
    'overlap': overlap,
    'rank': ranking,
    'num_features': num_features,
    'technique': 'random',
    'test_train_perc': test_train_perc,
    'windowing': windowing,
    'window_dim': history_signal,
    'resampler_balancing': balancing_normal_failed,
    'oversample_undersample': oversample_undersample,
}

# Split the dataset into train/test features and labels.
Xtrain, Xtest, ytrain, ytest = dataset_partitioning(df, model, **partition_options)

####### CLASSIFIER PARAMETERS #######
# Extra keyword arguments forwarded to classification(). It stays empty for
# classifiers that need no network/optimizer (RandomForest), so there is no
# need to rely on a NameError to detect that case.
classifier_args = {}
if classifier == 'TCN':
    os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_DEV
    batch_size = 256
    lr = 0.001
    num_inputs = Xtrain.shape[1]
    net, optimizer = init_net(lr, history_signal, num_inputs)
    epochs = 200
elif classifier == 'LSTM':
    # NOTE(review): CUDA_DEV is only applied in the TCN branch; this branch
    # uses whatever device net.cuda() selects — confirm that is intended.
    lr = 0.001
    batch_size = 256
    epochs = 300
    dropout = 0.1
    # hidden state sizes (from [14])
    # The dimensionality of the output space of the LSTM layer
    lstm_hidden_s = 64
    # The dimensionality of the output space of the first fully connected layer
    fc1_hidden_s = 16
    num_inputs = Xtrain.shape[1]
    net = FPLSTM(lstm_hidden_s, fc1_hidden_s, num_inputs, 2, dropout)
    net.cuda()
    # We use the Adam optimizer, a method for Stochastic Optimization
    optimizer = optim.Adam(net.parameters(), lr=lr)

if classifier in ('TCN', 'LSTM'):
    classifier_args = {
        'net': net,
        'optimizer': optimizer,
        'epochs': epochs,
        'batch_size': batch_size,
        'lr': lr,
    }
## ---------------------------- ##

if features_extraction_method:
    # Extract features for the train and test set
    Xtrain = feature_extraction(Xtrain)
    Xtest = feature_extraction(Xtest)

if classifier == 'RandomForest' and windowing == 1:
    # The random forest takes 2-D input: flatten the two trailing axes of each sample.
    Xtrain = Xtrain.reshape(Xtrain.shape[0], Xtrain.shape[1] * Xtrain.shape[2])
    Xtest = Xtest.reshape(Xtest.shape[0], Xtest.shape[1] * Xtest.shape[2])

# Single call: a failure during training now surfaces as the original
# exception instead of being swallowed and retried without the network arguments.
classification(
    X_train=Xtrain,
    Y_train=ytrain,
    X_test=Xtest,
    Y_test=ytest,
    classifier=classifier,
    metric=['RMSE', 'MAE', 'FDR', 'FAR', 'F1', 'recall', 'precision'],
    **classifier_args
)