In [None]:
# downlad GAIN
!git clone git@github.com:jsyoon0823/GAIN.git
# upgrade GAIN to tensorflow v2, tf.placeholder is no longer working in v2 etc.
!tf_upgrade_v2 --infile GAIN/gain.py --outfile GAIN/gain_tf2.py
!tf_upgrade_v2 --infile GAIN/utils.py --outfile GAIN/utils_tf2.py

In [None]:
# rename utils to .utils_tf2 in GAIN/gain_tf2.py
#from utils import normalization, renormalization, rounding
#from utils import xavier_init
#from utils import binary_sampler, uniform_sampler, sample_batch_index

In [5]:
import pandas as pd
import numpy as np
from GAIN.gain_tf2 import gain
from GAIN.utils_tf2 import binary_sampler
from GAIN.utils_tf2 import rmse_loss

# Create missing data in TEP dataset

In [3]:
dataset = pd.read_csv('Tennessee_Event-Driven/datasets/dataset.csv',index_col=False)

In [4]:
dataset_X = dataset.drop(columns='fault_id').values
dataset_Y = dataset['fault_id'].values

In [5]:
no, dim = dataset_X.shape
p = 0.1
# Introduce missing data
mask = binary_sampler(1-p, no, dim)
dataset_X_missing = dataset_X.copy()
dataset_X_missing[mask == 0] = np.nan

In [4]:
'''
p = 0.1
mask = np.random.choice(a=[True, False], size=(dataset_X.shape[0], dataset_X.shape[1]), p=[p, 1-p])
# just a check if the mask really does what I want..
#len(dataset_X[np.where(mask == True)])/(dataset_X.shape[1] * dataset_X.shape[0])
dataset_X_missing = np.copy(dataset_X)
dataset_X_missing[mask] = np.nan
'''

# Impute data

In [21]:
gain_parameters = {'batch_size': 128,
                 'hint_rate': 1.5,
                 'alpha': 100,
                 'iterations': 1000}

In [22]:
dataset_X_imputed = gain(dataset_X_missing, gain_parameters)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 168.12it/s]


In [23]:
dataset_X_rmse

0.07301131265367386

# GAIN imputation experiment

In [24]:
dataset = pd.read_csv('Tennessee_Event-Driven/datasets/dataset.csv',index_col=False)

dataset_X = dataset.drop(columns='fault_id').values
dataset_Y = dataset['fault_id'].values

no, dim = dataset_X.shape

In [29]:
batch_size_list = [16,32]#,64,128,256]
hint_rate_list = [1,2]#,5,10,100]
alpha_list = [1,10]#,100,1000,10000]
iterations_list = [10,100]#,1000,10000,100000]
p_list = [0.05,0.1]#,0.15,0.2,0.25]


dataset_X_rmse_list = []

for p in p_list:
    mask = binary_sampler(1-p, no, dim)
    dataset_X_missing = dataset_X.copy()
    dataset_X_missing[mask == 0] = np.nan
    dataset_X_rmse_list_temp = []
    index_names = []

    for batch_size in batch_size_list:
        for hint_rate in hint_rate_list:
            for alpha in alpha_list:
                for iterations in iterations_list:
                    gain_parameters = {'batch_size': batch_size,
                                     'hint_rate': hint_rate,
                                     'alpha': alpha,
                                     'iterations': iterations}
                    dataset_X_imputed = gain(dataset_X_missing, gain_parameters)
                    dataset_X_rmse_list_temp.append(rmse_loss(dataset_X, dataset_X_imputed, mask))
                    index_names.append('batch'+str(batch_size)+'_hint_rate'+str(hint_rate)+'_alpha'+str(alpha)+'_iter'+str(iterations))
    dataset_X_rmse_list.append(dataset_X_rmse_list_temp)

result_pd = pd.DataFrame(index = index_names, columns=[str(p) for p in p_list], data = np.array(dataset_X_rmse_list).T)
result_pd.to_csv('TEP_GAINImputation_RMSE.csv',index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 27.03it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 225.50it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 24.31it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 218.10it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 22.32it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 209.71it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 21.12it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 219.62it/s]
100%|███████████████████████████████████

In [38]:
pd.DataFrame(index = index_names, columns=[str(p) for p in p_list], data = np.array(dataset_X_rmse_list).T)

Unnamed: 0,0.05,0.1
batch16_hint_rate1_alpha1_iter10,0.176357,0.170159
batch16_hint_rate1_alpha1_iter1000,0.081187,0.079137
batch16_hint_rate1_alpha10_iter10,0.190125,0.170045
batch16_hint_rate1_alpha10_iter1000,0.080075,0.081688
batch16_hint_rate2_alpha1_iter10,0.176881,0.202493
batch16_hint_rate2_alpha1_iter1000,0.080355,0.086534
batch16_hint_rate2_alpha10_iter10,0.185521,0.188545
batch16_hint_rate2_alpha10_iter1000,0.081752,0.082289
batch32_hint_rate1_alpha1_iter10,0.176655,0.179522
batch32_hint_rate1_alpha1_iter1000,0.078439,0.09073


# RandomForest classification parameter tuning - GridSearch

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = [{'n_estimators': [100, 200, 500],
               'max_features': ['auto', 'log2'],
               'max_depth' : [5,10,50,100,None],
               'criterion' :['gini', 'entropy']}]

RF_clf_gs = GridSearchCV(estimator = RandomForestClassifier(), param_grid=param_grid, scoring='f1',n_jobs=4, cv=10)
# afterwards change dataset_X to dataset_X_imputed
scaler = StandardScaler()
scaler.fit(dataset_X)
scaled_dataset_X = scaler.transform(dataset_X) 

RF_clf_gs.fit(scaled_dataset_X, dataset_Y)
means = RF_clf_gs.cv_results_['mean_test_score']
stds = RF_clf_gs.cv_results_['std_test_score']
print('RF 10CV f1 score mean with 95% confidence interval : ')
for mean, std, params in zip(means, stds, RF_clf_gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Pipeline

In [30]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


class Pipeline:
    def __init__(self):
        pass
    
    def _impute(self):
        raise NotImplementedError()
    
    def _scale(self):
        raise NotImplementedError()
    
    def _classify(self):
        raise NotImplementedError()
        
    def _split_data(self):
        raise NotImplementedError()
        
    def process(self):
        raise NotImplementedError()

class FIREMAN_Pipeline(Pipeline):
    def __init__(self, dataset_x, dataset_y, imputer='Simple', scaler='RandomScaler', classifier='RandomForest', scorer='report'):
        self.dataset_x = dataset_x
        self.dataset_y = dataset_y
        self.imputer = imputer
        self.scaler = scaler
        self.classifier = classifier
        self.scorer = scorer

    def _impute(self):
        if self.imputer == 'GAIN':
            pass
        
        elif self.imputer == 'Simple':
            imputer = SimpleImputer()
            imputed_x = imputer.fit_transform(self.dataset_x)
        
        elif self.imputer == '':
            imputed_x = self.dataset_x

        else:
            raise NotImplementedError()
        
        return imputed_x
    
    def _scale(self, x):
        if self.scaler == 'RandomScaler':
            scaler = StandardScaler()
            scaler.fit(x)
            return scaler.transform(x) 

        elif self.scaler == '':
            return x

        else:
            raise NotImplementedError()

    def _split_data(self, x):
        if self.scorer == 'cv_score':
            return x, self.dataset_y

        elif self.scorer == 'report':
            x_train, x_test, y_train, y_test = train_test_split(x, self.dataset_y, test_size=0.1)
            return x_train, x_test, y_train, y_test
    
    def _classify(self):
        if self.classifier == 'RandomForest':
            self.classifier = RandomForestClassifier()
        else:
            raise NotImplementedError()     

    def process(self):
        x_missing = self._impute()
        x_scaled = self._scale(x_missing)
        if self.scorer=='report':
            x_train, x_test, y_train, y_test = self._split_data(x_scaled)
            self._classify()
            self.classifier.fit(x_train, y_train)
            y_predicted = self.classifier.predict(x_test)
            return print(classification_report(y_test, y_predicted))
        
        elif self.scorer=='cv_score':
            x, y = self._split_data(dataset_x_scaled)
            _classify()
            return print(cross_val_score(self.classifier, x, y, cv=10, scoring='f1_weighted'))
        
        else:
            raise NotImplementedError()           

In [31]:
dataset = pd.read_csv('Tennessee_Event-Driven/datasets/dataset.csv',index_col=False)

dataset_X = dataset.drop(columns='fault_id').values
dataset_Y = dataset['fault_id'].values

no, dim = dataset_X.shape
p = 0.1
# Introduce missing data
mask = binary_sampler(1-p, no, dim)
dataset_X_missing = dataset_X.copy()
dataset_X_missing[mask == 0] = np.nan

In [32]:
tep_pipeline = FIREMAN_Pipeline(dataset_X_missing, dataset_Y)

In [33]:
tep_pipeline.process()

              precision    recall  f1-score   support

           0       0.32      0.60      0.42       161
           1       1.00      0.89      0.94       151
           2       1.00      0.89      0.94       124
           3       0.33      0.46      0.38       142
           4       0.84      0.75      0.79       141
           5       0.72      0.76      0.74       147
           6       1.00      0.88      0.93       130
           7       1.00      0.86      0.92       149
           8       0.99      0.93      0.96       145
           9       0.36      0.45      0.40       154
          10       0.77      0.69      0.73       147
          11       0.80      0.61      0.69       150
          12       0.91      0.82      0.86       141
          13       1.00      0.83      0.91       156
          14       0.97      0.79      0.87       166
          15       0.30      0.44      0.35       142
          16       0.76      0.65      0.70       160
          17       0.82    