In [1]:
import sys
import time
# Make the local AutoNN checkout importable ahead of any installed copy.
# NOTE(review): absolute, user-specific path — this cell only works on a
# machine with the same layout; consider installing the package instead.
sys.path.insert(0, '/home/anish/AutoNN')

In [2]:
from AutoNN.preprocessing import data_cleaning
from AutoNN.preprocessing import encoding_v2 as enc
import dask.dataframe as dd
from AutoNN.networkbuilding import final
import numpy as np
import pandas as pd



░█▀▀█ █░░█ ▀▀█▀▀ █▀▀█ ▒█▄░▒█ ▒█▄░▒█ 
▒█▄▄█ █░░█ ░░█░░ █░░█ ▒█▒█▒█ ▒█▒█▒█ 
▒█░▒█ ░▀▀▀ ░░▀░░ ▀▀▀▀ ▒█░░▀█ ▒█░░▀█

An AutoML framework by
Anish Konar, Arjun Ghosh, Rajarshi Banerjee, Sagnik Nayak.



2022-09-29 22:47:17.889065: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [3]:
import time

In [4]:
class AutoNN:
    """Driver that preprocesses a CSV dataset and runs the AutoNN model search.

    Typical use is ``AutoNN(path, label).preprocessing()`` followed by
    ``neuralnetworkgeneration()``. ``preprocessing`` fills the private
    ``_train_*`` / ``_test_*`` arrays and the output-layer settings that
    ``neuralnetworkgeneration`` hands to ``final.Final``.
    """

    def __init__(self, train_csv_path, label_name, loss = None):
        """Store the dataset location and target column.

        Args:
            train_csv_path: Path of the CSV file read by ``preprocessing``.
            label_name: Name of the target column inside that CSV.
            loss: Optional loss identifier. When left as ``None`` a loss is
                chosen in ``preprocessing`` from the detected task type.
        """
        self._train_csv_path = train_csv_path
        self._label_name = label_name
        self._output_shape = None
        self._output_activation = None
        self._loss = loss

        self._train_X = None
        self._train_Y = None
        self._test_X = None
        self._test_Y = None

        self._input_shape = None

        self._EDA_data_container = None

    def preprocessing(self):
        """Load, clean, encode and scale the data; populate the model inputs.

        Side effects: sets ``_EDA_data_container``, ``_output_shape``,
        ``_output_activation``, ``_loss`` (only if it was ``None``),
        ``_train_X``, ``_train_Y``, ``_test_X``, ``_test_Y`` and
        ``_input_shape``. When the dataset provides no test split,
        ``_test_X`` and ``_test_Y`` are left as ``None``.

        Raises:
            ValueError: If no loss was supplied and none could be inferred
                from the result of ``d_clean.is_regression()``.
        """
        df = dd.read_csv(self._train_csv_path, assume_missing=True, sample_rows=2000)
        d_clean = data_cleaning.DataCleaning(label = [self._label_name], train_dataframe=df)
        d_clean.dataset.train_test_split()
        d_clean.parse_dates()
        d_clean.generate_column_info()
        encoder = enc.Encoding()

        train, validation, test = d_clean.dataset.get(['train', 'validation', 'test'])

        # Fit the label encoder on every column reported with dtype 'object'.
        for column in d_clean.column_info.keys():
            if d_clean.column_info[column]['dtype'] == 'object':
                encoder.fit_column(column = train[column], column_name=column, label_name = self._label_name)
        train, validation, test = d_clean.dataset.get(['train', 'validation', 'test'])
        d_clean.dataset.set(encoder.label_encode(train), type = 'train')

        if validation is not None:
            d_clean.dataset.set(encoder.label_encode(validation), type = 'validation')
        if test is not None:
            d_clean.dataset.set(encoder.label_encode(test), type = 'test')

        d_clean.clean_data()

        # Snapshot taken after cleaning but before feature elimination.
        self._EDA_data_container = d_clean.dataset()

        d_clean.feature_elimination_fit(type = "train", method = "correlation")
        d_clean.eliminate_features(type = "train")
        if validation is not None:
            d_clean.eliminate_features(type = "validation")
        if test is not None:
            d_clean.eliminate_features(type = "test")

        # NOTE(review): train/validation/test below are still the objects
        # fetched before label encoding, cleaning and feature elimination;
        # they are not re-read from d_clean.dataset in between. Confirm
        # whether a fresh d_clean.dataset.get(...) is intended here.
        d_clean.dataset.set(encoder.inverse_label_encode(train), type = 'train')
        if validation is not None:
            d_clean.dataset.set(encoder.inverse_label_encode(validation), type = 'validation')
        if test is not None:
            d_clean.dataset.set(encoder.inverse_label_encode(test), type = 'test')

        train, validation, test = d_clean.dataset.get(['train', 'validation', 'test'])
        d_clean.generate_column_info()

        # Choose the output layer and a default loss from the task type.
        reg_clas_flag, label_cardinality = d_clean.is_regression()
        categorical_flag = False
        loss = None
        if reg_clas_flag == 1:
            self._output_shape = 1
            self._output_activation = None
            loss = "mean_squared_error"
        elif reg_clas_flag == 0 and label_cardinality == 2:
            self._output_shape = 1
            self._output_activation = "sigmoid"
            loss = "binary_crossentropy"
        elif reg_clas_flag == 0 and label_cardinality > 2:
            self._output_shape = label_cardinality
            self._output_activation = "softmax"
            loss = "categorical_crossentropy"
            categorical_flag = True
            print("CATEGORICAL")

        if self._loss is None:
            if loss is None:
                raise ValueError(
                    "Could not infer a loss for label %r (is_regression() returned %r, %r); "
                    "pass `loss` explicitly." % (self._label_name, reg_clas_flag, label_cardinality)
                )
            self._loss = loss

        train, validation, test = d_clean.dataset.get(['train', 'validation', 'test'])
        has_test = test is not None

        # Split features from the label column.
        train_Y = train[self._label_name].to_frame()
        train_X = train.drop(self._label_name, axis = 1).copy()
        test_Y = test[self._label_name].to_frame() if has_test else None
        test_X = test.drop(self._label_name, axis = 1).copy() if has_test else None

        # One-hot encode the features; the encoder is fitted on train only.
        onehot_encoder_X = enc.Encoding()
        onehot_encoder_X.onehot_fit(train_X)

        train_X = onehot_encoder_X.onehot_encode(train_X)
        if validation is not None:
            # NOTE(review): `validation` is passed whole, whereas the encoder
            # was fitted on the features with the label dropped — confirm.
            d_clean.dataset.set(onehot_encoder_X.onehot_encode(validation), type = 'validation')
        if has_test:
            test_X = onehot_encoder_X.onehot_encode(test_X)

        # Multi-class targets are one-hot encoded to match the softmax output.
        if categorical_flag:
            train_Y = train_Y.categorize()
            if has_test:
                test_Y = test_Y.categorize()
            onehot_encoder_Y = enc.Encoding()
            onehot_encoder_Y.onehot_fit(train_Y)
            train_Y = onehot_encoder_Y.onehot_encode(train_Y)
            if has_test:
                test_Y = onehot_encoder_Y.onehot_encode(test_Y)

        d_clean.scaling_fit(train_X)
        train_X = d_clean.scaling_transform(train_X)
        if has_test:
            test_X = d_clean.scaling_transform(test_X)

        # Targets are scaled for every non-multi-class task (this re-fits
        # d_clean's scaler on the target after the features were transformed).
        if not categorical_flag:
            d_clean.scaling_fit(train_Y)
            train_Y = d_clean.scaling_transform(train_Y)
            if has_test:
                test_Y = d_clean.scaling_transform(test_Y)

        self._train_X = np.asarray(train_X)
        self._train_Y = np.asarray(train_Y)
        self._test_X = np.asarray(test_X) if has_test else None
        self._test_Y = np.asarray(test_Y) if has_test else None
        if has_test:
            print(self._test_Y.shape)

        self._input_shape = train_X.shape[-1]

    def neuralnetworkgeneration(self):
        """Run the architecture search on the arrays built by ``preprocessing``.

        Raises:
            RuntimeError: If ``preprocessing`` has not populated the
                training arrays yet.
        """
        if self._train_X is None or self._train_Y is None:
            raise RuntimeError("preprocessing() must be called before neuralnetworkgeneration()")
        # NOTE(review): 75 and 64 are positional arguments of final.Final whose
        # meaning is not visible here (presumably epochs / batch size) — confirm.
        f = final.Final(self._train_X, self._train_Y, self._test_X, self._test_Y, self._loss, 75, 64, input_shape = self._input_shape, 
                                   max_no_layers = 3, model_per_batch = 10, 
                        output_shape = self._output_shape, output_activation = self._output_activation)
        f.get_all_best_models()

In [5]:
# Select the dataset and target column for this run; the commented-out lines
# are alternative CSV / label pairs.
# NOTE(review): absolute, user-specific paths — consider a configurable DATA_DIR.
# autonn = AutoNN("/home/anish/Downloads/winequality-red.csv", "quality")
# autonn = AutoNN("/home/anish/Downloads/diabetes1.csv", "Outcome")
autonn = AutoNN("/home/anish/Downloads/house-prices-advanced-regression-techniques/train.csv", "SalePrice")

In [6]:
# Wall-clock timing of the preprocessing stage; `start` and `end` are read by
# the following cell to report the elapsed seconds.
start = time.time()
autonn.preprocessing()
end = time.time()

[(0.0008496176720475786, 'Electrical'), (0.00594732370433305, 'MasVnrArea'), (0.00594732370433305, 'MasVnrType'), (0.028037383177570093, 'BsmtCond'), (0.028037383177570093, 'BsmtFinType1'), (0.028037383177570093, 'BsmtQual'), (0.02888700084961767, 'BsmtExposure'), (0.02888700084961767, 'BsmtFinType2'), (0.05777400169923534, 'GarageCond'), (0.05777400169923534, 'GarageFinish'), (0.05777400169923534, 'GarageQual'), (0.05777400169923534, 'GarageType'), (0.05777400169923534, 'GarageYrBlt'), (0.16992353440951571, 'LotFrontage')]
(283, 1)


In [7]:
# Elapsed seconds between the `start` and `end` timestamps set in the previous cell.
print(f"Time Taken {end-start}s")

Time Taken 71.45998334884644s


In [8]:
# Wall-clock timing of the model search; overwrites the `start` / `end`
# values left by the preprocessing timing cell.
start = time.time()
autonn.neuralnetworkgeneration()
end = time.time()

2022-09-29 22:48:30.134797: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-09-29 22:48:30.138508: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-09-29 22:48:30.226718: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-29 22:48:30.227334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2060 computeCapability: 7.5
coreClock: 1.695GHz coreCount: 30 deviceMemorySize: 5.79GiB deviceMemoryBandwidth: 312.97GiB/s
2022-09-29 22:48:30.227452: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-09-29 22:48:30.251599: I tensorflow/stream_executor/platform/de


 
 

loss  :  0.003590124426409602 , TEST :  0.038536187261343
output_layer_dense_16_16_loss  :  0.0006923242472112179 , TEST :  0.0032229165080934763
output_layer_dense_16_64_loss  :  0.0005468130693770945 , TEST :  0.0026762248016893864
output_layer_dense_16_128_loss  :  0.0005446601426228881 , TEST :  0.003683902323246002
output_layer_dense_16_512_loss  :  0.0004186035948805511 , TEST :  0.0041736215353012085
output_layer_dense_16_1024_loss  :  0.000321403203997761 , TEST :  0.004276079125702381
output_layer_dense_64_16_loss  :  0.0004052314325235784 , TEST :  0.003972293809056282
output_layer_dense_64_64_loss  :  0.0003141411580145359 , TEST :  0.005224472843110561
output_layer_dense_64_128_loss  :  0.000159550501848571 , TEST :  0.004118770360946655
output_layer_dense_64_512_loss  :  0.00011927708692383021 , TEST :  0.003743923269212246
output_layer_dense_64_1024_loss  :  6.811981438659132e-05 , TEST :  0.003443980822339654

 
 

loss  :  0.0014651044039055705 , TEST :  0.0338632


 
 

loss  :  0.0011492320336401463 , TEST :  0.03307264298200607
output_layer_dense_128_1024_64_loss  :  5.8701807574834675e-05 , TEST :  0.0034096429590135813
output_layer_dense_128_1024_128_loss  :  6.417174154194072e-05 , TEST :  0.0035458917263895273
output_layer_dense_128_1024_512_loss  :  7.17020157026127e-05 , TEST :  0.0034428159706294537
output_layer_dense_128_1024_1024_loss  :  5.199196675675921e-05 , TEST :  0.0030307890847325325
output_layer_dense_512_16_16_loss  :  0.00039839386590756476 , TEST :  0.0026172625366598368
output_layer_dense_512_64_16_loss  :  0.00010197723895544186 , TEST :  0.0032232245430350304
output_layer_dense_512_64_64_loss  :  0.0001360455498797819 , TEST :  0.003446687711402774
output_layer_dense_512_128_16_loss  :  9.445151954423636e-05 , TEST :  0.0035458984784781933
output_layer_dense_512_128_64_loss  :  8.261534094344825e-05 , TEST :  0.0034822418820112944
output_layer_dense_512_128_128_loss  :  8.918108505895361e-05 , TEST :  0.0033281883224844

2022-09-29 23:18:15.612340: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /home/anish/AutoNN_test_weights/candidate_models/dense_16_128_128_dr/assets
INFO:tensorflow:Assets written to: /home/anish/AutoNN_test_weights/candidate_models/dense_512_16_16_dr/assets
INFO:tensorflow:Assets written to: /home/anish/AutoNN_test_weights/candidate_models/dense_16_64_dr/assets
-------------------------------------------------------------------------------------------------------------------
dense_16_128_128_dr_st_dense_512_16_16_dr
Model: "dense_16_128_128_dr_st_dense_512_16_16_dr"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer_dense_16_128_128 [(None, 72)]              0         
_________________________________________________________________
layer1_dense_16_128_128_dr ( (None, 16)                1168      
_________________________________________________________________
layer2_dense_16_128_128_dr ( (None, 128)               2176      
__

BEST HYPERPARAMETERS : BEST_LOSS : 0.0004926779656670988, BEST_ACTIVATION : relu, BEST_INITIALIZER : HeUniform, BEST_LEARINING_RATE : 0.00316, BEST_BATCHSIZE : 32, BEST_DROPOUT_RATE : None, BEST_DROPOUT_LOSS : None
HeUniform
DROPOUT RATES : [0.2, 0.5]
TRAIN_LOSS = 0.0007065783138386905, TEST_LOSS = 0.003233877243474126
INFO:tensorflow:Assets written to: /home/anish/AutoNN_test_weights/stacked_models/dense_512_16_16_dr_st_dense_16_64_dr/assets
-------------------------------------------------------------------------------------------------------------------
dense_16_64_dr_st_dense_16_128_128_dr
Model: "dense_16_64_dr_st_dense_16_128_128_dr"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer_dense_16_64_dr ( [(None, 72)]              0         
_________________________________________________________________
layer1_dense_16_64_dr (Dense (None, 16)                1168      
______________________

In [9]:
# Elapsed seconds between the `start` and `end` timestamps set in the previous cell.
print(f"Time Taken {end-start}s")

Time Taken 4262.250415325165s
