In [3]:
import sys
import os
import numpy as np
BIN = '../'
sys.path.append(BIN)
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import pickle
import my_matplotlib_style as ms
from scipy import stats
import utils

import torch
import torch.nn as nn
import torch.utils.data

from torch.utils.data import TensorDataset
from fastai.callbacks.tracker import SaveModelCallback

from fastai import basic_train, basic_data
from fastai.callbacks import ActivationStats
from fastai import train as tr

from nn_utils import get_data, RMSELoss
from utils import plot_activations

from nn_utils import AE_basic, AE_bn_LeakyReLU

mpl.rc_file(BIN + 'my_matplotlib_rcparams')

from guppy import hpy; hp=hpy()

# ---- Load data ----
train = pd.read_pickle(BIN + 'processed_data/aod/all_jets_train_27D_5_percent.pkl')
test = pd.read_pickle(BIN + 'processed_data/aod/all_jets_test_27D_5_percent.pkl')

# Column removal is currently disabled: 'JetGhostArea' and 'BchCorrCell'
# are left in both frames.

# ---- Remove extreme/bad jets ----
train = utils.filter_jets(train)
test = utils.filter_jets(test)

# ---- Normalize ----
# Column-wise mean and standard deviation of the filtered training frame,
# taken *before* the normalization call below replaces `train`.
train_mean, train_std = train.mean(), train.std()

train, test = utils.custom_normalization(train, test)

# y = x since we are building an AE: inputs and targets are the same frames.
train_x = train_y = train
test_x = test_y = test

# Wrap (input, target) pairs as tensor datasets, then build the data loaders
# (batch size 1024) and the fastai DataBunch from them.
train_ds = TensorDataset(*(torch.tensor(frame.values) for frame in (train_x, train_y)))
valid_ds = TensorDataset(*(torch.tensor(frame.values) for frame in (test_x, test_y)))
train_dl, valid_dl = get_data(train_ds, valid_ds, bs=1024)
db = basic_data.DataBunch(train_dl, valid_dl)

# ---- Model / grid-search configuration ----
module = AE_bn_LeakyReLU
module_name = 'AE_bn_LeakyReLU'
grid_search_folder = "AE_bn_LeakyReLU_AOD_grid_search_custom_normalization_1500epochs/"

loss_func = nn.MSELoss()

# The folder to analyse
model_folder_name = "AE_27_200_200_200_18_200_200_200_27"

plt.close('all')

# Heap snapshot after loading the data and building the DataBunch.
print(hp.heap())

Partition of a set of 702783 objects. Total size = 1204437943 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0      2   0 558366640  46 558366640  46 pandas.core.frame.DataFrame
     1    179   0 538667532  45 1097034172  91 numpy.ndarray
     2 199392  28 27267757   2 1124301929  93 str
     3      2   0 19941712   2 1144243641  95 pandas.core.indexes.numeric.Int64Index
     4 182581  26 14614784   1 1158858425  96 tuple
     5  83153  12  6555106   1 1165413531  97 bytes
     6  42012   6  6078680   1 1171492211  97 types.CodeType
     7  38761   6  5271496   0 1176763707  98 function
     8   5151   1  5192336   0 1181956043  98 type
     9   9836   1  3648008   0 1185604051  98 dict (no owner)
<1778 more rows. Type e.g. '_.more' to view.>


In [2]:
# Locate the grid-search run named `model_folder_name`, load its "best"
# checkpoint into a fastai Learner and reconstruct a slice of the test set.
# Just drop the name filter below if you want to iterate through every model.
for model_folder in [x for x in os.scandir(grid_search_folder) if x.name == model_folder_name]:
    if model_folder.is_dir():
        for train_folder in os.scandir(grid_search_folder + model_folder.name):
            if train_folder.is_dir() and train_folder.name == 'models':
                plt.close('all')

                # Find the best model. The name is reset first so that a
                # checkpoint found for a previous folder (or left over from an
                # earlier run of this cell) can never be loaded by mistake.
                saved_model_fname = None
                for f in os.scandir(grid_search_folder + model_folder.name + '/' + train_folder.name + '/'):
                    if f.name.startswith("best"):
                        # Drop the file extension; the bare name is what is
                        # handed to learn.load() below. If several files match,
                        # the last one scanned wins.
                        saved_model_fname = os.path.splitext(f.name)[0]
                        print(model_folder.name + " " + saved_model_fname)
                if saved_model_fname is None:
                    # Fail loudly instead of hitting a NameError or silently
                    # reusing a stale checkpoint name.
                    raise FileNotFoundError(
                        "No checkpoint starting with 'best' found in "
                        + grid_search_folder + model_folder.name + '/' + train_folder.name + '/'
                    )

                # Load model. The layer sizes are encoded in the folder name:
                # 'AE_27_200_..._27' -> [27, 200, ..., 27].
                nodes = model_folder.name.split('AE_')[1].split('_')
                nodes = [int(x) for x in nodes]
                model = module(nodes)
                learn = basic_train.Learner(data=db, model=model, loss_func=loss_func, true_wd=True)
                learn.model_dir = grid_search_folder + model_folder.name + '/' + 'models/'
                learn.load(saved_model_fname)
                #model.load_state_dict(torch.load(path_to_saved_model))
                learn.model.eval()

                # Histograms
                idxs = (0, 1000)  # Choose events to compare
                data = torch.tensor(test_x[idxs[0]:idxs[1]].values)
                # Note, float conversion, this takes time.
                # Inference only: no_grad() avoids building an autograd graph,
                # so the output can be converted to numpy directly.
                with torch.no_grad():
                    pred = model(data.float()).numpy()

                # Rescale predictions and inputs with the training mean/std.
                # NOTE(review): the frames were normalised with
                # utils.custom_normalization, which is not visible here --
                # confirm that std/mean rescaling is really its inverse.
                pred = np.multiply(pred, train_std.values)
                pred = np.add(pred, train_mean.values)
                data = np.multiply(data, train_std.values)
                data = np.add(data, train_mean.values)

                # Heap snapshot after loading the model and predicting.
                print(hp.heap())


AE_27_200_200_200_18_200_200_200_27 best_AE_bn_LeakyReLU_bs4096_lr1e-02_wd1e-02
Partition of a set of 703211 objects. Total size = 1204473585 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0      2   0 558366640  46 558366640  46 pandas.core.frame.DataFrame
     1    179   0 538667532  45 1097034172  91 numpy.ndarray
     2 199333  28 27258847   2 1124293019  93 str
     3      2   0 19941712   2 1144234731  95 pandas.core.indexes.numeric.Int64Index
     4 182731  26 14627136   1 1158861867  96 tuple
     5  83160  12  6555630   1 1165417497  97 bytes
     6  42016   6  6079256   1 1171496753  97 types.CodeType
     7  38903   6  5290808   0 1176787561  98 function
     8   5151   1  5192336   0 1181979897  98 type
     9   9833   1  3649000   0 1185628897  98 dict (no owner)
<1795 more rows. Type e.g. '_.more' to view.>
