## 9 Final Functions

In [10]:
#importing Required Libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import confusion_matrix
import time

import warnings
warnings.filterwarnings('ignore')

- RandomForestClassifier trained on RandomForestRegressor_imputer-imputed data (section 5.5.4) gave the lowest cost value, 9510, among all the trials.


- StandardScaler model (part of preprocessing): saved in section 4.2
- Imputation models and utility objects for RandomForestRegressor_imputation: saved in section 4.34
- RandomForestClassifier model (trained on RandomForestRegressor_imputer-imputed data): saved in section 5.5.4

In [11]:
# Preprocessing artifact: the fitted StandardScaler.
# NOTE: joblib.load unpickles its input, so only load files you trust.
std_scaler = joblib.load('std_scaler.pkl')

# Artifacts for RandomForestRegressor_imputation, loaded in this order:
# the regression imputer, the median imputer, and the median-imputed column list.
reg_mod_imp_RF, reg_model_med_imputer, median_imp_cols = (
    joblib.load(pkl_name)
    for pkl_name in ('reg_mod_imp_RF.pkl', 'reg_model_med_imputer.pkl', 'median_imp_cols.pkl')
)

# Final RandomForestClassifier, plus the probability cut-off that produced
# the lowest cost during model selection.
classifier = joblib.load('ML models & Results/RFR_RF.pkl')
threshold = 0.0229

In [12]:
# Read the test set; the literal string 'na' in the CSV is parsed as missing.
test_data = pd.read_csv('aps_failure_test_set.csv', na_values='na')

# Separate the target ('class' column) from the feature columns.
y_test = test_data['class']
x_test = test_data.drop(columns='class')

### 9.1 Final Function - 1

In [13]:
def final_fun_1(X):
    '''Predict a binary class label for every row of raw feature data.

    Pipeline (all fitted objects are module-level globals loaded earlier):
      1. scale X with ``std_scaler``;
      2. impute NaNs: ``reg_model_med_imputer`` for ``median_imp_cols``,
         ``reg_mod_imp_RF`` for the remaining columns;
      3. threshold the classifier's class-1 probability at ``threshold``.

    Parameters
    ----------
    X : DataFrame or 2-D array-like, shape (n_samples, n_features)
        Raw feature rows; may contain NaN.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        1 where the class-1 probability exceeds ``threshold``, else 0.
    '''

    def regression_model_imputer_transformer(X, median_imputer, median_imp_cols, reg_model):
        '''Return an imputed copy of X (the input frame is not modified).

        Columns at positions ``median_imp_cols`` are filled by
        ``median_imputer.transform``. Every other column that still has NaNs
        is filled with ``reg_model.predict`` evaluated on the (already
        imputed) median columns of the rows where that column is missing.
        '''
        X_imputed = X.copy()

        # Step 1: fill the median-imputed columns in one shot.
        X_imputed.iloc[:, median_imp_cols] = median_imputer.transform(X_imputed.iloc[:, median_imp_cols])

        # Step 2: every remaining column position is a candidate for model imputation.
        median_col_set = set(np.asarray(median_imp_cols).tolist())
        model_imp_cols = [col_idx for col_idx in range(X_imputed.shape[1])
                          if col_idx not in median_col_set]

        # The predictor columns are never written inside the loop, so extract them once.
        predictors = X_imputed.iloc[:, median_imp_cols]

        for col_idx in model_imp_cols:
            # Boolean row mask (plain ndarray so it can be used with .iloc).
            missing_rows_in_col = X_imputed.iloc[:, col_idx].isna().to_numpy()
            if not missing_rows_in_col.any():
                continue
            # Single-step .iloc write: the original chained form
            # `X.iloc[:, c][mask] = ...` assigns into a temporary Series and is
            # silently dropped when pandas Copy-on-Write is active.
            X_imputed.iloc[missing_rows_in_col, col_idx] = reg_model.predict(predictors[missing_rows_in_col])
        return X_imputed

    # Scaling returns a bare array; wrap it so positional (.iloc) imputation works.
    X = pd.DataFrame(std_scaler.transform(X))
    X = regression_model_imputer_transformer(X, reg_model_med_imputer, median_imp_cols, reg_mod_imp_RF)

    Y_pred_prob = classifier.predict_proba(X)
    # Column 1 holds the probability of class 1; apply the tuned cut-off.
    Y_pred = (Y_pred_prob[:,1] > threshold).astype(int)

    return Y_pred

In [14]:
# Time one end-to-end prediction pass over the whole test set.
start_time = time.time()
test_predictions = final_fun_1(x_test)
print('Predictions of Test DataSet: ', test_predictions)
elapsed_seconds = time.time() - start_time
print("Total time taken for prediction: %s seconds" % elapsed_seconds)

Predictions of Test DataSet:  [0 0 0 ... 0 0 0]
Total time taken for prediction: 17.41692852973938 seconds


In [15]:
# Take the row at position 0 and shape it as a single-sample 2-D array.
x = x_test.iloc[0].copy().values.reshape(1, -1)
start_time = time.time()
row_prediction = final_fun_1(x)
print('Prediction of 0th index row of Test Dataset: ', row_prediction)
elapsed_seconds = time.time() - start_time
print("Total time taken for prediction: %s seconds" % elapsed_seconds)

Prediction of 0th index row of Test Dataset:  [0]
Total time taken for prediction: 0.4969966411590576 seconds


In [16]:
# Take the row at position 15994 and shape it as a single-sample 2-D array.
x = x_test.iloc[15994].copy().values.reshape(1, -1)
start_time = time.time()
row_prediction = final_fun_1(x)
print('Prediction of 15994th index row of Test Dataset: ', row_prediction)
elapsed_seconds = time.time() - start_time
print("Total time taken for prediction: %s seconds" % elapsed_seconds)

Prediction of 15994th index row of Test Dataset:  [1]
Total time taken for prediction: 0.349994421005249 seconds


### 9.2 Final Function - 2

In [17]:
def final_fun_2(X,Y):
    '''Compute the misclassification cost of the saved pipeline on labelled data.

    The features go through the same steps as prediction (scale with
    ``std_scaler``, impute, threshold the classifier's class-1 probability at
    ``threshold``); the predictions are then compared with Y.

    Parameters
    ----------
    X : DataFrame or 2-D array-like, shape (n_samples, n_features)
        Raw feature rows; may contain NaN.
    Y : Series or 1-D array-like, length n_samples
        True labels, either 'neg'/'pos' or already encoded as 0/1.
        Y is NOT modified.

    Returns
    -------
    int
        cost = fp*10 + fn*500, where fp / fn are the false-positive and
        false-negative counts.
    '''

    def regression_model_imputer_transformer(X, median_imputer, median_imp_cols, reg_model):
        '''Return an imputed copy of X (the input frame is not modified).

        Columns at positions ``median_imp_cols`` are filled by
        ``median_imputer.transform``. Every other column that still has NaNs
        is filled with ``reg_model.predict`` evaluated on the (already
        imputed) median columns of the rows where that column is missing.
        '''
        X_imputed = X.copy()

        # Step 1: fill the median-imputed columns in one shot.
        X_imputed.iloc[:, median_imp_cols] = median_imputer.transform(X_imputed.iloc[:, median_imp_cols])

        # Step 2: every remaining column position is a candidate for model imputation.
        median_col_set = set(np.asarray(median_imp_cols).tolist())
        model_imp_cols = [col_idx for col_idx in range(X_imputed.shape[1])
                          if col_idx not in median_col_set]

        # The predictor columns are never written inside the loop, so extract them once.
        predictors = X_imputed.iloc[:, median_imp_cols]

        for col_idx in model_imp_cols:
            # Boolean row mask (plain ndarray so it can be used with .iloc).
            missing_rows_in_col = X_imputed.iloc[:, col_idx].isna().to_numpy()
            if not missing_rows_in_col.any():
                continue
            # Single-step .iloc write: the original chained form
            # `X.iloc[:, c][mask] = ...` assigns into a temporary Series and is
            # silently dropped when pandas Copy-on-Write is active.
            X_imputed.iloc[missing_rows_in_col, col_idx] = reg_model.predict(predictors[missing_rows_in_col])
        return X_imputed

    # Encode 'neg'/'pos' as 0/1 on a copy so the caller's labels are left
    # untouched; values that are already 0/1 pass through unchanged.
    label_codes = {'neg': 0, 'pos': 1}
    Y_true = pd.Series(Y).map(lambda label: label_codes.get(label, label)).astype(int).to_numpy()

    # Keep the caller's column names when X is a DataFrame; plain arrays get default names.
    X = pd.DataFrame(std_scaler.transform(X), columns = getattr(X, 'columns', None))
    X = regression_model_imputer_transformer(X, reg_model_med_imputer, median_imp_cols, reg_mod_imp_RF)

    Y_pred_prob = classifier.predict_proba(X)
    # Column 1 holds the probability of class 1; apply the tuned cut-off.
    Y_pred = (Y_pred_prob[:,1] > threshold).astype(int)

    # labels=[0, 1] forces a 2x2 matrix even when only one class is present,
    # so the four-way unpack below cannot fail on small or one-sided batches.
    tn, fp, fn, tp = confusion_matrix(Y_true, Y_pred, labels = [0, 1]).ravel()
    cost = fp*10 + fn*500

    return cost

In [18]:
# Time the full scoring pass (predict + cost) on the test set.
start_time = time.time()
test_cost = final_fun_2(x_test, y_test)
print('Metric or cost value for Test DataSet: ', test_cost)
elapsed_seconds = time.time() - start_time
print("Total time taken for metric computation: %s seconds" % elapsed_seconds)

Metric or cost value for Test DataSet:  9510
Total time taken for metric computation: 17.12857460975647 seconds
