In [1]:
import sys
import os
import numpy as np
BIN = '../'
sys.path.append(BIN)
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import pickle
import my_matplotlib_style as ms
from scipy import stats
import utils

import torch
import torch.nn as nn
import torch.utils.data

from torch.utils.data import TensorDataset
from fastai.callbacks.tracker import SaveModelCallback

from fastai import basic_train, basic_data
from fastai.callbacks import ActivationStats
from fastai import train as tr

from nn_utils import get_data, RMSELoss
from utils import plot_activations

from nn_utils import AE_basic, AE_bn_LeakyReLU

mpl.rc_file(BIN + 'my_matplotlib_rcparams')

from guppy import hpy; hp=hpy()

# Load data
train = pd.read_pickle(BIN + 'processed_data/aod/all_jets_train_27D_5_percent.pkl')
test = pd.read_pickle(BIN + 'processed_data/aod/all_jets_test_27D_5_percent.pkl')

#Remove irrelevant columns
#train.pop('JetGhostArea')
#test.pop('JetGhostArea')
#train.pop('BchCorrCell')
#test.pop('BchCorrCell')

# Remove extreme/bad jets
train = utils.filter_jets(train)
test = utils.filter_jets(test)

# Normalize
# The per-column mean/std of the *filtered, un-normalized* training set are
# captured before normalization because the evaluation cell uses them to map
# predictions back to the original scale.
train_mean = train.mean()
train_std = train.std()

train, test = utils.custom_normalization(train, test)

train_x = train
test_x = test
train_y = train_x  # y = x since we are building an AE
test_y = test_x

# Inputs and targets are the same frames, so build each tensor once and use it
# for both roles. `torch.tensor` copies its input, so calling it twice on the
# same frame would hold every dataset in memory twice.
train_tensor = torch.tensor(train_x.values)
test_tensor = torch.tensor(test_x.values)
train_ds = TensorDataset(train_tensor, train_tensor)
valid_ds = TensorDataset(test_tensor, test_tensor)
train_dl, valid_dl = get_data(train_ds, valid_ds, bs=1024)
db = basic_data.DataBunch(train_dl, valid_dl)

# Architecture whose grid-search results are analysed below
module_name = 'AE_bn_LeakyReLU'
module = AE_bn_LeakyReLU
grid_search_folder = "AE_bn_LeakyReLU_AOD_grid_search_custom_normalization_1500epochs/"

loss_func = nn.MSELoss()

#The folder to analyse
model_folder_name = "AE_27_200_200_200_18_200_200_200_27"

plt.close('all')

# Baseline heap snapshot, taken before any model is loaded
print(hp.heap())



Partition of a set of 815529 objects. Total size = 1211896021 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0      2   0 558366640  46 558366640  46 pandas.core.frame.DataFrame
     1    175   0 538451132  44 1096817772  91 numpy.ndarray
     2 324277  40 36400464   3 1133218236  94 str
     3      2   0 19941712   2 1153159948  95 pandas.core.indexes.numeric.Int64Index
     4 177758  22 14261192   1 1167421140  96 tuple
     5  79679  10  6345332   1 1173766472  97 bytes
     6  40383   5  5844800   0 1179611272  97 types.CodeType
     7  37651   5  5120536   0 1184731808  98 function
     8   4791   1  4812320   0 1189544128  98 type
     9   8177   1  3292440   0 1192836568  98 dict (no owner)
<1667 more rows. Type e.g. '_.more' to view.>


In [2]:
# Evaluate the best checkpoint of every architecture found in the grid search
# folder and print a heap snapshot after each one.
# To analyse a single architecture only, add the filter
# `and entry.name == model_folder_name` to the comprehension below.
idxs = (0, 1000)  # Choose events to compare (loop-invariant)

# Materialise the directory listing inside a `with` block so the scandir
# iterator is closed deterministically.
with os.scandir(grid_search_folder) as grid_entries:
    model_folders = [entry for entry in grid_entries if entry.is_dir()]

for model_folder in model_folders:
    # Only architectures that contain a 'models' sub-directory are analysed
    with os.scandir(grid_search_folder + model_folder.name) as train_folders:
        has_models_dir = any(
            entry.is_dir() and entry.name == 'models' for entry in train_folders
        )
    if not has_models_dir:
        continue

    plt.close('all')
    model_dir = grid_search_folder + model_folder.name + '/models/'

    #Find the best model
    # Reset per folder: otherwise a folder without a "best*" file would either
    # raise NameError (first folder) or silently reuse the checkpoint name
    # left over from the previous architecture.
    saved_model_fname = None
    with os.scandir(model_dir) as checkpoints:
        for f in checkpoints:
            if f.name.startswith("best"):
                # Drop the last four characters (presumably the '.pth' suffix,
                # since the name is handed to `learn.load` without extension)
                saved_model_fname = f.name[:-4]
                print(model_folder.name + " " + saved_model_fname)
    if saved_model_fname is None:
        print(model_folder.name + " has no 'best*' checkpoint, skipping")
        continue

    #Load model
    # The layer sizes are encoded in the folder name, e.g. 'AE_27_200_18_200_27'
    nodes = [int(n) for n in model_folder.name.split('AE_')[1].split('_')]
    model = module(nodes)
    learn = basic_train.Learner(data=db, model=model, loss_func=loss_func, true_wd=True)
    learn.model_dir = model_dir
    learn.load(saved_model_fname)
    #model.load_state_dict(torch.load(path_to_saved_model))
    learn.model.eval()

    # Histograms
    data = torch.tensor(test_x[idxs[0]:idxs[1]].values)
    #Note, float conversion, this takes time
    # Inference only: disable autograd so no graph is built for the forward pass
    with torch.no_grad():
        pred = model(data.float()).numpy()

    # Map predictions and inputs back using the training-set mean/std.
    # NOTE(review): this assumes utils.custom_normalization is a plain
    # (x - mean) / std scaling - confirm against its implementation.
    pred = np.multiply(pred, train_std.values)
    pred = np.add(pred, train_mean.values)
    # NOTE(review): `data` is still a torch tensor here; the numpy ufuncs rely
    # on torch's numpy interop. Left unchanged in case later code depends on
    # the resulting type.
    data = np.multiply(data, train_std.values)
    data = np.add(data, train_mean.values)

    # Heap snapshot after each model, to track memory growth across iterations
    print(hp.heap())


AE_27_200_200_200_18_200_200_200_27 best_AE_bn_LeakyReLU_bs4096_lr1e-02_wd1e-02
Partition of a set of 816614 objects. Total size = 2289138252 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0     56   0 1076857922  47 1076857922  47 pandas.core.series.Series
     1      2   0 558366640  24 1635224562  71 pandas.core.frame.DataFrame
     2    230   0 538672428  24 2173896990  95 numpy.ndarray
     3 324409  40 36414614   2 2210311604  97 str
     4      2   0 19941712   1 2230253316  97 pandas.core.indexes.numeric.Int64Index
     5 177709  22 14256664   1 2244509980  98 tuple
     6  79687  10  6346513   0 2250856493  98 bytes
     7  40384   5  5844944   0 2256701437  99 types.CodeType
     8  37541   5  5105576   0 2261807013  99 function
     9   4791   1  4814984   0 2266621997  99 type
<1725 more rows. Type e.g. '_.more' to view.>
AE_27_200_200_200_16_200_200_200_27 best_AE_bn_LeakyReLU_bs4096_lr3e-02_wd1e-04
Partition of a set of 816602 object