In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import sklearn
import pytorch_lightning as pl
import warnings
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.loggers.base import LightningLoggerBase
from pytorch_lightning.callbacks import Callback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch import nn
from torch.utils.data import DataLoader
from typing import Optional, List, Dict, Any, Callable

warnings.filterwarnings("ignore")
pl.utilities.seed.seed_everything(42)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
def normalize(data):
    scaler = MinMaxScaler()
    return scaler.fit_transform(data)

def standarize(data):
    scaler = StandardScaler()
    return scaler.fit_transform(data)

# Load Data

In [None]:
class DataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str, test_size: float = 0.3, val_size: float = 0.1, train_batch_size: int = 64, val_batch_size: int = 64, transforms: List[Callable] = [], no_batch: bool = False):
        super().__init__()
        self.data_dir = data_dir # Target as last DataFrame column for classification
        self.test_size = test_size
        self.val_size = val_size
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.no_batch = no_batch # full dataset in one batch
        self.transforms = transforms
        
    def prepare_data(self):
        self.full_data = pd.read_csv(self.data_dir)
        X = self.full_data.values[:, :-1]
        y = self.full_data.values[:, -1]
        for transform in self.transforms:
            X = transform(X)
        self.transformed_data = X
            
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, stratify=y)
        X_train = torch.tensor(X_train).float()
        X_test = torch.tensor(X_test).float()
        y_train = torch.tensor(y_train).long()
        y_test = torch.tensor(y_test).long()
        
        self.train_data = (X_train, y_train)
        self.test_data = (X_test, y_test)
    
    def setup(self, stage: Optional[str] = None):
        if stage == 'fit' or stage is None:
            X_train, y_train = self.train_data
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=self.val_size, stratify=y_train)
            self.train_data = []
            self.val_data = []
            for x, y in zip(X_train, y_train):
                self.train_data.append((x, y))
                
            for x, y in zip(X_val, y_val):
                self.val_data.append((x, y))
            
            if self.no_batch:
                self.train_batch_size = len(self.train_data)
                self.val_batch_size = len(self.val_data)
        
        if stage == 'test' or stage is None:
            X_test, y_test = self.test_data
            self.test_data = []
            for x, y in zip(X_test, y_test):
                self.test_data.append((x, y))

    def train_dataloader(self):
        return DataLoader(self.train_data, self.train_batch_size, shuffle=True, num_workers=1, pin_memory=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_data, self.val_batch_size, shuffle=False, num_workers=1, pin_memory=True)
    
    def test_dataloader(self):
        return DataLoader(self.test_data, shuffle=False, num_workers=1, pin_memory=True)

In [None]:
#Initialize data
datamodule = DataModule('', no_batch=True, transforms=[normalize])
datamodule.prepare_data()

In [None]:
datamodule.full_data.head(5)

In [None]:
datamodule.full_data.info()

In [None]:
datamodule.full_data.describe()

# Visualisation

In [None]:
def plot_hist(data, x, y, figsize=(13, 13), bins=15):
    fig, ax = plt.subplots(x, y, figsize=figsize)
    for i in range(data.transformed_data.shape[1]):
        ax[int(i / y), i % y].hist(data.transformed_data[:, i], bins=bins)
        ax[int(i / y), i % y].set_title(data.full_data.columns[i])
    plt.show()

In [None]:
plot_hist(datamodule, 2, 4)

# Models

In [None]:
class Classifier(pl.LightningModule):
    def __init__(self, model, lr=1e-3):
        super().__init__()
        self.model = model
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr
    
    def forward(self, x):
        return self.model(x)
        
    def shared_step(self, x, y):
        pred = self(x)
        loss = self.criterion(pred, y)
        return pred, loss
        
    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        pred, loss = self.shared_step(x, y)
        _, predicted = torch.max(pred.data, 1)
        return {'loss': loss, 'train_score': (predicted, y)}
    
    def training_epoch_end(self, outputs):
        loss = [output['loss'].detach().cpu().numpy() for output in outputs]
        mean_loss = np.mean(loss)
        
        results = [x['train_score'] for x in outputs]
        preds = []
        y = []
        for predicted, y_data in results:
            predicted = predicted.detach().cpu().numpy()
            y_data = y_data.detach().cpu().numpy()
            preds.extend(predicted)
            y.extend(y_data)
        f1 = f1_score(y, preds)
        self.log('loss', mean_loss, logger=True)
        self.log('train_f1', f1, prog_bar=True, logger=True)
    
    def validation_step(self, val_batch, val_batch_idx):
        x, y = val_batch
        pred, loss = self.shared_step(x, y)
        _, predicted = torch.max(pred.data, 1)
        return {'val_loss': loss, 'val_score': (predicted, y)}
    
    def validation_epoch_end(self, outputs):
        results = [x['val_score'] for x in outputs]
        preds = []
        y = []
        for predicted, y_data in results:
            predicted = predicted.detach().cpu().numpy()
            y_data = y_data.detach().cpu().numpy()
            preds.extend(predicted)
            y.extend(y_data)
        f1 = f1_score(y, preds)
        self.log('val_f1', f1, prog_bar=True, logger=True)
        
    def test_step(self, test_batch, test_batch_idx):
        x, y = test_batch
        pred, loss = self.shared_step(x, y)
        _, predicted = torch.max(pred.data, 1)
        return {'test_loss': loss, 'test_score': (predicted, y)}
    
    def test_epoch_end(self, outputs):
        results = [x['test_score'] for x in outputs]
        preds = []
        y = []
        for predicted, y_data in results:
            predicted = predicted.detach().cpu().numpy()
            y_data = y_data.detach().cpu().numpy()
            preds.extend(predicted)
            y.extend(y_data)
        acc = accuracy_score(y, preds)
        f1 = f1_score(y, preds)
        self.log('accuracy', acc, prog_bar=True, logger=True)
        self.log('f1', f1, prog_bar=True, logger=True)
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
        return [optimizer], [scheduler]

In [None]:
class Model(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.nn = None
        self.name = '' # File name for model's checkpoint
    
    def forward(self, x):
        return self.nn(x)

# Evaluation

In [None]:
def generate(elem: Any, count: int, step: Any):
    """
    Generates a list of elements with a given step
    """
    current = elem
    output = []
    for _ in range(count):
        output.append(current)
        current += step
    return output

In [1]:
class Evaluator:
    def __init__(self, obj_models: List[pl.LightningModule], version: str, folder_path: str = './', trainer_params: Dict[str, Any] = {}, logger: LightningLoggerBase = MLFlowLogger, logger_params: Dict[str, Any] = {}, 
                 callbacks: List[Callback] = [], callbacks_params: List[Dict[str, Any]] = [{}], datamodule_params: Dict[str, Any] = {}):
        """
        Model saving path will be constructed via scheme:
        path_to_folder+model_name/version+model.name
        """
        self.models = obj_models
        self.classifiers = [Classifier(model) for model in self.models]
        self.version = version
        split_names = []
        for model in self.models:
            model_name, _ = model.name.split('.')
            split_names.append(model_name)
        self.paths = [folder_path + model_name + '/' + version + model.name for model, name in zip(self.models, split_names)]
        self.loggers = [logger(**logger_params) for _ in self.models] # Each model must get it's own logger for lightning trainer
        self.trainer_params = trainer_params
        self.callbacks = []
        for _ in self.models: # Each model must get it's own set of callbacks, because these objects don't work well when reused
            self.callbacks.append([callback(**params) for callback, params in zip(callbacks, callbacks_params)])
        self.datamodules = [datamodule(**datamodule_params) for _ in self.models]
        # Each model must have it's own datamodule that will be used for both fit AND test methods in lightning trainer
    
    def train(self, min_lr: float, datamodule: pl.LightningDataModule, transforms: List[Callable] = [], find_lr: bool = True, verbose: bool = False, no_batch: bool = False):
        for path, classifier, logger, callbacks, datamodule in zip(self.paths, self.classifiers, self.loggers, self.callbacks, self.datamodules):
            self.trainer = pl.Trainer(logger=logger, callbacks=callbacks, **self.trainer_params)
            if find_lr:
                lr_finder = self.trainer.tuner.lr_find(classifier, min_lr=min_lr, datamodule=datamodule(no_batch=no_batch, transforms=transforms), early_stop_threshold=None)
                classifier.lr = lr_finder.suggestion()
            else:
                classifier.lr = min_lr

            if verbose and find_lr:
                print(f'Best lr: {classifier.lr}')

            self.trainer.fit(classifier, datamodule=datamodule)
            self.trainer.save_checkpoint(path)
    
    def test(self, datamodule: pl.LightningDataModule, transforms: List[Callable]):
        for path, model, logger, datamodule in zip(self.paths, self.models, self.loggers, self.datamodules):
            self.trainer = pl.Trainer(logger=logger, **self.trainer_params)
            classifier = Classifier.load_from_checkpoint(path, model=model)
            self.trainer.test(classifier, datamodule=datamodule)

NameError: name 'MLFlowLogger' is not defined

In [None]:
class LinearSearch:
    def __init__(self, linear_params, linear_params_count):
        self.linear_params = linear_params
        self.linear_params_count = linear_params_count
    
    def search(self, models: List[pl.LightningModule], in_size: int, out_size: int, datamodule: pl.LightningDataModule, versions: List[str], min_lr: float = 1e-03, transforms: List[Callable] = [], 
               find_lr=False, verbose=False, callbacks: List[Callback] = [], callbacks_params: List[Dict[str, Any]] = [{}]):
        for i in range(self.linear_params_count):
            obj_models = [model(in_size, out_size) for model in models]
            params = {}
            for key in self.linear_params.keys():
                params[key] = self.linear_params[key][i]
            if verbose:
                print(params)
            evaluator = Evaluator(obj_models, versions[i], trainer_params=params, callbacks=callbacks, callbacks_params=callbacks_params)
            evaluator.train(min_lr, datamodule, transforms, find_lr=find_lr, verbose=verbose)
            evaluator.test(datamodule, transforms)

In [None]:
models = [] # Pass via class name
min_lr = 1e-03
in_size = 0
out_size = 0
datamodule = DataModule
transforms = [normalize]
callbacks = [] # List of callback class names
callbacks_params = [{}] # List of corresponding parameters
max_epochs = [20, 30, 40, 50, 75, 100]
linear_params_count = len(max_epochs)
linear_params = {
    'gpus': generate(1, linear_params_count, 0),
    'max_epochs': max_epochs,
    'gradient_clip_val': generate(0.5, linear_params_count, 0.),
    'stochastic_weight_avg': generate(True, linear_params_count, False),
    'amp_level': generate('O3', linear_params_count, ''),
    'precision': generate(16, linear_params_count, 0)
}
versions = [str(max_epoch) + 'epoch_' for max_epoch in params['max_epochs']]

In [None]:
linear_search = LinearSearch(linear_params, linear_params_count)
linear_search.search(models, in_size, out_size, datamodule, versions, min_lr, transforms, callbacks=callbacks, callbacks_params=callback_params)

In [3]:
class CrossValidation:
    def __init__(self, n_splits, X, y, models, in_size, out_size, datamodule, trainer_params, logger, logger_params, callbacks):
        self.skf = StratifiedKFold(n_split=n_splits)
        self.models = models
        self.obj_models = []
        self.models_in_size = in_size
        self.models_out_size = out_size
        self.trainer_params = trainer_params
        self.logger = logger
        self.logger_params = logger_params
        self.datamodule = datamodule
        self.data = X
        self.target = y
        self.acc_scores = []
        self.f1_scores = []
    
    def _prepare_data(self, train_idx, test_idx):
        X_train, X_test = self.data[train_idx], self.data[test_idx]
        y_train, y_test = self.target[train_idx], self.target[test_idx]
        return X_train, X_test, y_train, y_test

    def _prepare_objects(self):
        for model in slef.models:
            self.obj_models.append(model(self.models_in_size, self.models_out_size))
    
    def call(self, version: str, folder_path: str = './'):
        """
        Performs cross validation over given dataset. Uses LinearSearch for validating multiple models. 
        Params_count stands for length of list of one trainer parameter.
        Saves models after each k-th fold via scheme:
        k+version+
        """
        for i, idx in enumerate(self.skf.split(X, y)):
            train_idx, test_idx = idx
            X_train, X_test, y_train, y_test = self.prepare_data(train_idx, test_idx)
            obj_models = [model(self.models_in_size, self.models_out_size) for model in self.models]
            datamodule = self.datamodule
            loop_version = str(i) + '_' + version
            evaluator_params = {
                'obj_models': obj_models,
                'version': loop_version,
                'folder_path': folder_path,
                'trainer_params': trainer_params,
                'logger': logger,
                'logger_params': logger_params,
                'callbacks': = 
            }
            evaluator = Evaluator(obj_models=obj_models, version: str, folder_path: str = './', trainer_params: Dict[str, Any] = {}, 
                                  logger: LightningLoggerBase = MLFlowLogger, logger_params: Dict[str, Any] = {}, 
                 callbacks: List[Callback] = [], callbacks_params: List[Dict[str, Any]] = [{}], datamodule_params: Dict[str, Any] = {})
            for j, metrics in enumerate(zip(linear_search.acc_scores, linear_search.f1_scores)):
                acc_scores[j].append(metrics[0]) # 9 list po 10 słowników po 2 modele
                f1_scores[j].append(metrics[1])

SyntaxError: positional argument follows keyword argument (<ipython-input-3-48a8c77671c6>, line 39)

In [None]:
skf = StratifiedKFold(n_splits=10)
datamodule = PimaIndiansDataModule(no_batch=True, transforms=[normalize])
datamodule.prepare_data()
X = datamodule.transformed_data
y = datamodule.full_data.values[:, -1]
acc_scores = []
f1_scores = []
for _ in range(params_count):
    acc_scores.append(list())
    f1_scores.append(list())
    
for i, idx in enumerate(skf.split(X, y)):
    train_idx, test_idx = idx
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    train_dataset = pd.DataFrame(X_train)
    train_dataset['Output'] = y_train
    test_dataset = pd.DataFrame(X_test)
    test_dataset['Output'] = y_test
    datamodule = DataModule
    linear_search = LinearSearch(params, params_count)
    loop_versions = [str(i) + '_' + version for version in versions]
    linear_search.search(models, in_size, out_size, datamodule, loop_versions, min_lr, transforms, False, False, callbacks, early_stopping_params, train_dataset=train_dataset, test_dataset=test_dataset)
    for j, metrics in enumerate(zip(linear_search.acc_scores, linear_search.f1_scores)):
        acc_scores[j].append(metrics[0]) # 9 list po 10 słowników po 2 modele
        f1_scores[j].append(metrics[1])

In [None]:
models = acc_scores[0][0].keys()
final_models_acc_scores = []
final_models_f1_scores = []
for acc, f1 in zip(acc_scores, f1_scores):
    for model in models:
        model_acc_scores = [models_dict[model] for models_dict in acc]
        model_f1_scores = [models_dict[model] for models_dict in f1]
        final_models_acc_scores.append(np.mean(model_acc_scores))
        final_models_f1_scores.append(np.mean(model_f1_scores))

In [None]:
from os import listdir
from os.path import isfile, join
mypath = './model/'
datamodule = PimaIndiansDataModule
networks = [f for f in listdir(mypath) if isfile(join(mypath, f))]
params = {
    'gpus': 1,
    'amp_level': 'O3',
    'precision': 16
}
for network in networks:
    try:
        file_name = network
        network = mypath + network
        if 'double' in network:
            print('Double Big: ' + file_name)
            model = Classifier.load_from_checkpoint(network, model=SpreadNNNoPoolDoubleBig(in_size, out_size))
            trainer = pl.Trainer(logger=MLFlowLogger(), **params)
            trainer.test(model, datamodule=datamodule(no_batch=True, transforms=transforms))
        else:
            print('Big: ' + file_name)
            model = Classifier.load_from_checkpoint(network, model=SpreadNNNoPoolBig(in_size, out_size))
            trainer = pl.Trainer(logger=MLFlowLogger(), **params)
            trainer.test(model, datamodule=datamodule(no_batch=True, transforms=transforms))
    except:
        continue