# Experimental Test Code:
- Code example for testing Anomaly Detection algorithms on Smart-Manufacturing datasets.
- Please refer to [ADBench](https://github.com/Minqi824/ADBench) package to use of additional algorithms or not manufacturing datasets.

In [1]:
# !pip install "numpy<2"
# pip install numpy==1.21.0
# pip install pyod
# pip install combo
# pip install tqdm
# pip install xgboost
# pip install lightgbm
# pip install catboost

In [2]:
# import basic package
import os
import pandas as pd
import numpy as np
import time

import warnings
warnings.filterwarnings("ignore")

# import the necessary package
from data_generator import DataGenerator
from myutils import Utils

# instiantiate datagenerator and util objects
datagenerator = DataGenerator() 
utils = Utils()

2025-02-10 10:11:41.832494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-02-10 10:11:42.831195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:
2025-02-10 10:11:42.831327: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:


- We include all the datasets for Smart-Manufacturing in the "datasets" folder, as the "number_data.npz" filename. Please see the table in the markdown for details. You can specify the dataset name by removing the filename ".npz" suffix in the data generator, e.g., "88_GenesisPickPlace.npz" as "88_GenesisPickPlace". 
    
    
- All the algorithms included are explained in detail in the [ADBench](https://github.com/Minqi824/ADBench) resource.
    - You need to specify the model name when initialization, as some algorithms (e.g., supervised algorithms) are integrated in one class, please see [ADBench](https://github.com/Minqi824/ADBench) for details.
    - You can also test your own AD algorithms on the proposed datasets, as long as the algorithm can output anomaly score for evaluation.

In [3]:
os.listdir('datasets/Classical')

['38_HSEFilters1.npz',
 '27_DElevators.npz',
 '18_Robotfail.npz',
 '73_Inverter.npz',
 '31_Milling.npz',
 '49_Wafer2.npz',
 '2_pdm_anreal.npz',
 '67_Elevatorpm.npz',
 '12_smap.npz',
 '47_FordB_anreal.npz',
 '60_Concrete.npz',
 '17_kdd.npz',
 '86_HighStorageSystem.npz',
 '57_DutchPowerDemand.npz',
 '5_cfrp.npz',
 '95_CWRUBearing.npz',
 '58_MiningProcess.npz',
 '66_Cuttingblade.npz',
 '9_skab.npz',
 '59_UCISecom.npz',
 '20_Glass2.npz',
 '79_UCINavalPropulsionPlants.npz',
 '61_Biopharmaceutical.npz',
 '80_UCIRobotExectFail.npz',
 '76_UCIAccelerometer.npz',
 '39_HSEFilters2.npz',
 '33_Flea2.npz',
 '46_Sm4Tankbatch.npz',
 '22_Castmetal.npz',
 '85_PlasticExtrusionDefects.npz',
 '56_ShuttleMarottaValve.npz',
 '69_WaterPumpSensor.npz',
 '37_Shutlevalve.npz',
 '55_Boschline.npz',
 '40_Yahoo1.npz',
 '41_Yahoo2.npz',
 '15_Multistage.npz',
 '81_UCIMechanicalAnalysis.npz',
 '11_psm.npz',
 '89_ElecMotorTemperature.npz',
 '64_PlasmaSpray.npz',
 '68_FurnaceVibration.npz',
 '87_3DPrinter.npz',
 '19_Ann

In [4]:
# import AD algorimths or class of algorithms to be used
from baseline.PyOD import PYOD
from baseline.DevNet.run import DevNet
from baseline.Supervised import supervised
from baseline.LSTMOD.LSTMOD import LSTMOutlierDetector
from baseline.GANomaly.run import GANomaly
from pyod.models.vae import VAE

# dataset and model list / dict
# dataset_list = ['133_HighStorageSystem_anreal', '135_GenesisPickPlace_anreal'] # Add the datasets you want to test.
# dataset_list = ['54_CNCMachining', '72_CNCturning'] # Add the datasets you want to test.
# dataset_list = ['31_Milling'] # Add the datasets you want to test.
dataset_list = ['54_CNCMachining'] # Add the datasets you want to test.
dataset_list = ['95_CWRUBearing','35_IMS','65_MachineryFault', '36_PHM']

model_dict = {'CBLOF':PYOD,'OCSVM':PYOD,'HBOS':PYOD,'KNN':PYOD,'LOF':PYOD,'PCA':PYOD,'IForest': PYOD, # Classical
              'DeepSVDD': PYOD,'AutoEncoder': PYOD,'VAE':PYOD,  # Deep
              'LSTMOutlierDetector': LSTMOutlierDetector,'DevNet': DevNet,'GANomaly': GANomaly, # Deep
              'XGBOD':PYOD,'RF': supervised, 'CatB': supervised} # Supervised

# dataframes to save the results
df_AUCROC = pd.DataFrame(data=None, index=dataset_list, columns = model_dict.keys())
df_AUCPR = pd.DataFrame(data=None, index=dataset_list, columns = model_dict.keys())
df_TIMETRAIN = pd.DataFrame(data=None, index=dataset_list, columns = model_dict.keys())
df_TIMEINFER = pd.DataFrame(data=None, index=dataset_list, columns = model_dict.keys())

In [5]:
# seed for reproducible results
seed = 42

for dataset in dataset_list:
    '''
    la: ratio of labeled anomalies, from 0.0 to 1.0
    realistic_synthetic_mode: types of synthetic anomalies, can be local, global, dependency or cluster
    noise_type: inject data noises for testing model robustness, can be duplicated_anomalies, irrelevant_features or label_contamination
    '''
    
    # import the dataset
    datagenerator.dataset = dataset # specify the dataset name
    data = datagenerator.generator(la=0.1, realistic_synthetic_mode=None, noise_type=None) 
    for name, clf in model_dict.items():
        # model initialization.
        # You can make special cases of AD algorithms (in this case, VAE) to tune hyperparameters:
        if name == 'VAE':
            # model initialization
            # clf = VAE(encoder_neurons =[64, 32, 1],decoder_neurons =[1, 32, 64])
            clf = VAE(encoder_neuron_list =[64, 32, 1],decoder_neuron_list =[1, 32, 64])

            # training, for unsupervised models the y label will be discarded
            try:
                start_train = time.time()
                clf = clf.fit(data['X_train'], data['y_train'])
                duracion_train = (time.time() - start_train)
            except:
                pass
            
            # output predicted anomaly score on testing set
            try:
                start_infer = time.time()
                score = clf.decision_function(pd.DataFrame(data['X_test']))
                duracion_infer = (time.time() - start_infer)
            except:
                pass
        
        else:
            # model initialization
            clf = clf(seed=seed, model_name=name)
            
            # training, for unsupervised models the y label will be discarded
            try:
                start_train = time.time()
                clf = clf.fit(data['X_train'], data['y_train'])
                duracion_train = (time.time() - start_train)
            except:
                pass
            
            # output predicted anomaly score on testing set
            try:
                start_infer = time.time()
                score = clf.predict_score(data['X_test'])
                duracion_infer = (time.time() - start_infer)
            except:
                pass
        
        # evaluation
        try:
            result = utils.metric(y_true=data['y_test'], y_score=score)
        except:
            result = {'aucroc':np.float('nan'),'aucpr':np.float('nan')}
            pass
        
        # save results
        df_AUCROC.loc[dataset, name] = result['aucroc']
        df_AUCPR.loc[dataset, name] = result['aucpr']
        df_TIMETRAIN.loc[dataset, name] = duracion_train
        try:
            df_TIMEINFER.loc[dataset, name] = duracion_infer
        except:
            pass

subsampling for dataset 95_CWRUBearing...
current noise type: None
{'Samples': 10000, 'Features': 2, 'Anomalies': 1958, 'Anomalies Ratio(%)': 19.58}
best param: None
best param: None
best param: None
best param: None
best param: None
best param: None
best param: None
best param: None
best param: None


Training:   0%|          | 0/10 [00:00<?, ?it/s]
Training:   0%|          | 0/30 [00:00<?, ?it/s]


XXXX: (7000, 2)
Training size: 7000, No. outliers: 137


2025-02-10 10:11:55.007080: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:11:55.011283: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:11:55.011596: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:11:55.012213: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild

Epoch 1/50


_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21527 MB memory:  -> device: 0, name: GRID P40-24Q, pci bus id: 0000:02:00.0, compute capability: 6.1


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
best param: None
Learning rate set to 0.023648
0:	learn: 0.6462519	total: 48.1ms	remaining: 48s
1:	learn: 0.6032363	total: 50ms	remaining: 25s
2:	learn: 0.5635283	total: 51.8ms	remaining: 17.2s
3:	learn: 0.5259374	total: 53.4ms	remaining: 13.3s
4:	learn: 0.4905739	total: 55.1ms	remaining: 11s
5:	learn: 0.4598221	total: 56.7ms	remaining: 9.4s
6:	learn: 0.4295559	total: 58.4ms	remaining: 8.28s
7:	learn: 0.4046067	total

Training:   0%|          | 0/10 [00:00<?, ?it/s]
Training:   0%|          | 0/30 [00:00<?, ?it/s]

XXXX: (7000, 8)
Training size: 7000, No. outliers: 151



2025-02-10 10:12:56.932080: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:12:56.932567: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:12:56.932819: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:12:56.933133: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:12:56.933382: I tensorflow/compiler/xla/stream_execut

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
best param: None
Learning rate set to 0.023648
0:	learn: 0.6501498	total: 2.88ms	remaining: 2.87s
1:	learn: 0.6102203	total: 4.81ms	remaining: 2.4s
2:	learn: 0.5728914	total: 6.77ms	remaining: 2.25s
3:	learn: 0.5390923	total: 8.79ms	remaining: 2.19s
4:	learn: 0.5076633	total: 10.5ms	remaining: 2.08s
5:	learn: 0.4782439	total: 12.1ms	remaining: 2.01s
6:	learn: 0.4511258	total: 13.8ms	remaining: 1.96s
7:	lea

Training:   0%|          | 0/10 [00:00<?, ?it/s]
Training:   0%|          | 0/30 [00:00<?, ?it/s]

XXXX: (7000, 8)
Training size: 7000, No. outliers: 390



2025-02-10 10:14:20.260642: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:14:20.261072: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:14:20.261320: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:14:20.261637: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:14:20.261881: I tensorflow/compiler/xla/stream_execut

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
best param: None
Learning rate set to 0.023648
0:	learn: 0.6623411	total: 2.89ms	remaining: 2.88s
1:	learn: 0.6338213	total: 4.7ms	remaining: 2.35s
2:	learn: 0.6075211	total: 6.54ms	remaining: 2.17s
3:	learn: 0.5826164	total: 8.23ms	remaining: 2.05s
4:	learn: 0.5586988	total: 9.91ms	remaining: 1.97s
5:	learn: 0.5362979	total: 11.6ms	remaining: 1.93s
6:	learn: 0.5157995	total: 13.4ms	remaining: 1.89s
7:	lea

Training:   0%|          | 0/10 [00:00<?, ?it/s]
Training:   0%|          | 0/30 [00:00<?, ?it/s]

XXXX: (7000, 2)
Training size: 7000, No. outliers: 1



2025-02-10 10:15:29.879335: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:15:29.879789: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:15:29.880035: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:15:29.880357: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-02-10 10:15:29.880604: I tensorflow/compiler/xla/stream_execut

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
best param: None
Learning rate set to 0.023648
0:	learn: 0.6103835	total: 1.96ms	remaining: 1.96s
1:	learn: 0.5371560	total: 3.85ms	remaining: 1.92s
2:	learn: 0.4744265	total: 5.58ms	remaining: 1.85s
3:	learn: 0.4170846	total: 7.34ms	remaining: 1.83s
4:	learn: 0.3643455	total: 9.03ms	remaining: 1.8s
5:	learn: 0.3207279	total: 10.8ms	remaining: 1.79s
6:	learn: 0.2839190	total: 12.6ms	remaining: 1.78s
7:	lea

In [6]:
df_AUCROC

Unnamed: 0,CBLOF,OCSVM,HBOS,KNN,LOF,PCA,IForest,DeepSVDD,AutoEncoder,VAE,LSTMOutlierDetector,DevNet,GANomaly,XGBOD,RF,CatB
95_CWRUBearing,0.841428,0.883249,0.828272,0.873204,0.782454,0.842405,0.88645,0.88645,0.88645,0.857549,0.857549,0.793117,0.614367,0.872094,0.733428,0.839533
35_IMS,0.500259,0.604209,0.623814,0.639052,0.649106,0.649289,0.64428,0.64428,0.64428,0.640126,0.640126,0.512008,0.533348,0.63551,0.598083,0.547424
65_MachineryFault,0.810077,0.70335,0.790367,0.852842,0.6895,0.780322,0.786756,0.786756,0.786756,0.757755,0.757755,0.597497,0.532046,0.936038,0.914328,0.933843
36_PHM,0.995989,0.997535,0.996031,0.998036,0.990266,0.997493,0.99764,0.99764,0.99764,0.997953,0.997953,0.998538,1.0,0.995948,0.687354,0.999164


In [7]:
df_AUCPR

Unnamed: 0,CBLOF,OCSVM,HBOS,KNN,LOF,PCA,IForest,DeepSVDD,AutoEncoder,VAE,LSTMOutlierDetector,DevNet,GANomaly,XGBOD,RF,CatB
95_CWRUBearing,0.759238,0.810969,0.67933,0.805785,0.614144,0.765933,0.814599,0.814599,0.814599,0.770691,0.770691,0.669963,0.486534,0.761974,0.477794,0.685084
35_IMS,0.214559,0.302422,0.328678,0.337128,0.346116,0.357358,0.346265,0.346265,0.346265,0.354619,0.354619,0.236737,0.27731,0.313049,0.293056,0.261178
65_MachineryFault,0.828311,0.713063,0.845024,0.878228,0.735274,0.828745,0.812524,0.812524,0.812524,0.820247,0.820247,0.760813,0.609642,0.953673,0.924929,0.953455
36_PHM,0.292824,0.38244,0.283333,0.419734,0.230508,0.408029,0.392551,0.392551,0.392551,0.436639,0.436639,0.706146,1.0,0.490588,0.345417,0.700893


In [8]:
df_TIMETRAIN

Unnamed: 0,CBLOF,OCSVM,HBOS,KNN,LOF,PCA,IForest,DeepSVDD,AutoEncoder,VAE,LSTMOutlierDetector,DevNet,GANomaly,XGBOD,RF,CatB
95_CWRUBearing,1.889577,1.966887,2.42629,0.020434,0.038861,0.004818,0.239799,0.239799,0.239799,0.239799,0.239799,11.388237,15.514332,21.378101,0.774385,2.230779
35_IMS,2.230779,1.971935,0.00656,0.244212,0.362466,0.006008,0.245335,0.245335,0.245335,0.245335,0.245335,21.073323,16.218984,28.144801,0.902972,2.173458
65_MachineryFault,0.043286,2.001806,0.006723,0.213315,0.29644,0.005925,0.241327,0.241327,0.241327,0.241327,0.241327,10.452291,15.593469,26.143199,1.915497,2.174693
36_PHM,0.018506,1.925266,0.003397,0.01953,0.036039,0.004453,0.237158,0.237158,0.237158,0.237158,0.237158,11.161645,15.967183,21.096435,0.210833,2.043087


In [9]:
df_TIMEINFER

Unnamed: 0,CBLOF,OCSVM,HBOS,KNN,LOF,PCA,IForest,DeepSVDD,AutoEncoder,VAE,LSTMOutlierDetector,DevNet,GANomaly,XGBOD,RF,CatB
95_CWRUBearing,0.001866,0.34031,0.000554,0.262823,0.013656,0.000626,0.021346,0.021346,0.021346,1.451811,1.451811,0.27315,0.001287,5.954992,0.025803,0.004745
35_IMS,0.004745,0.368385,0.001069,0.437134,0.148945,0.000985,0.022083,0.022083,0.022083,0.112199,0.112199,0.243772,0.000611,9.521194,0.028553,0.007079
65_MachineryFault,0.004091,0.367335,0.001087,0.372647,0.125052,0.000967,0.021141,0.021141,0.021141,0.119053,0.119053,0.242101,0.000608,8.615967,0.031463,0.00672
36_PHM,0.001195,0.339346,0.000456,0.247279,0.013624,0.000623,0.021327,0.021327,0.021327,0.116691,0.116691,0.247985,0.000726,5.93295,0.007659,0.00276


In [10]:
data['X_train'].shape, data['X_test'].shape,

((7000, 2), (3000, 2))

In [11]:
data['y_train'].shape[0], data['y_train'].sum()

(7000, 1.0)

In [12]:
data['y_train'].sum() / data['y_train'].shape[0]

0.00014285714285714287