In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
import pprint

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Useful Functions

In [2]:
def pretty_matrix(matrix, row_label, col_label):
    """Pretty print of the given matrix """

    # Restraining labels that are too big
    row_label = [el[:10] + '..' if len(el) > 10 else el
                for el in row_label]
    col_label = [el[:10] + '..' if len(el) > 10 else el
                for el in col_label]

    # Stringfying everything & Joining top label
    s_matrix = [list([" "] + (col_label))] + \
               [[row_label[row_idx]] + \
                [str(e) for e in row] for row_idx, row in enumerate(matrix)]

    # Length of each matrix column
    len_s = [max(map(len, col)) for col in zip(*s_matrix)]

    # Cell formatation
    formatation = '\t'.join('{{:{}}}'.format(x) for x in len_s)

    # Apply cell formation to each matrix element
    pretty_mat = [formatation.format(*row) for row in s_matrix]

    # Print Pretty Matrix
    print('\n'.join(pretty_mat))


def display_confusion_matrix(values):
    '''Display the given array as a confusion matrix'''
    pretty_matrix([values[0:2], values[2:4]],
                  ['Actual NO', 'Actual YES'],
                  ['Predic NO', 'Predic YES'])

In [3]:
def apply_PCA(df, variance_val=0.9, debug=True):
    '''Apply the PCA algorithm to given dataframe,
    using the given variance val to trim the df'''
    # Necessary to normalize all data to use PCA
    scaler=StandardScaler()
    X_scaled=scaler.fit_transform(df)

    # PCA - keep, by default mode, 90% variance
    pca = PCA(variance_val)    
    pca.fit(X_scaled)
    X_pca = pca.transform(X_scaled)

    if debug:
        ex_variance=np.var(X_pca,axis=0)
        ex_variance_ratio = ex_variance/np.sum(ex_variance)
        print(' > Impact in total variance of each generated feature by PCA:')
        print(ex_variance_ratio)

    principal_df = pd.DataFrame(data = X_pca, index = df.reset_index()['loan_id'])
    
    return (principal_df, pca)

In [4]:
def auc_scorer(y_true, y_pred):
    '''Scorer of Area Under Curve value'''
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    return metrics.auc(fpr, tpr)

# Prediction Algorithms

* Logistic Regression
* Decision Tree
* Random Forest
* Gradient Boosting

In [5]:
def create_LR():
    '''Create a Logistic Regression model'''
    return LogisticRegression()

In [6]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

In [7]:
def create_RF():
    '''Create a new Ranfom Forest model'''
    return RandomForestClassifier()

In [8]:
def create_GB():
    '''Create a new Gradient Boosting model'''
    return GradientBoostingClassifier()

# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [9]:
# Useful Macros
K_FOLD_NUM_SPLITS = 5
SEED = 42
USE_PCA = False
UNDERSAMPLE = False
UNDERSAMPLE_RATIO = 0.3

# Pretty printer
pp = pprint.PrettyPrinter(indent=4)

In [10]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')

if UNDERSAMPLE:
    print(' > Apllying undersampling:')
    entries_df = len(dataset.index)

    # Getting all minor class cases into final dataset
    minor_df = dataset[dataset['status'] == -1]
    num_minor = len(minor_df.index)
    
    print('\t> Classes initial ratio: %f - %f\n\t> Dataset size: %i' %
         (num_minor / entries_df, (entries_df - num_minor) / entries_df, entries_df))

    # Selecting equal number from major class
    major_df = dataset[dataset['status'] == 1].sample(n=int((num_minor / UNDERSAMPLE_RATIO) - num_minor),
                                                      random_state=SEED)
    num_major = len(major_df.index)
    total_under = num_minor + num_major
    
    print('\t> Classes final ratio: %f - %f\n\t> Dataset size: %i' % 
          (num_minor / total_under, num_major / total_under, total_under))

    # Concatenting to main dataframe
    dataset = pd.concat([minor_df, major_df])

dataset = dataset.set_index('loan_id')
display(dataset)

Unnamed: 0_level_0,amount,payments,frequency,balance_mean,balance_max,balance_min,last_balance,credit_mean,credit_count,credit_max,...,average salary,unemploymant rate '95,unemploymant rate '96,no. of commited crimes '95,no. of commited crimes '96,ratio entrepeneurs,loan_to_account_age_days,owner_age_on_loan,income_to_payments_ratio,status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5314,96396,8033,2,12250.000000,20100.0,1100.0,20100.0,5025.000000,4,9900.0,...,9650,3.38,3.67,2985.0,2804,0.100,105,0.685425,0.000000,-1
5316,165960,4610,1,52083.859459,120512.8,700.0,52208.9,13523.158824,17,36574.0,...,8369,1.79,2.31,2854.0,2618,0.117,148,0.241205,-0.075325,1
6863,127080,2118,1,30060.954167,49590.4,800.0,20272.8,5009.733333,15,19065.0,...,8390,2.28,2.89,2080.0,2122,0.132,170,0.922457,-0.312244,1
5325,105804,2939,1,41297.480000,65898.5,1000.0,34307.3,9254.600000,13,26448.0,...,10045,1.42,1.71,6604.0,6295,0.135,185,0.840626,-0.198310,1
7240,274740,4579,2,57188.211111,122893.1,600.0,41112.9,21255.930769,13,63366.0,...,8288,3.79,4.52,1562.0,1460,0.110,204,0.030136,-0.013758,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6818,155616,3242,1,44197.509884,75294.1,200.0,60694.1,6448.582857,70,26388.0,...,8994,3.80,4.79,9672.0,9208,0.110,691,0.256158,-0.066497,1
5625,222180,3703,1,55230.444068,130659.1,800.0,59578.8,13417.557143,28,44352.0,...,8843,2.82,3.60,818.0,888,0.113,382,0.934917,0.000000,-1
6805,45024,938,1,41994.907692,63659.3,800.0,38384.3,8544.930000,20,31636.5,...,10177,6.63,7.75,9878.0,10108,0.081,214,0.551608,-0.419243,1
7233,115812,3217,1,56646.516129,119527.2,1100.0,41878.1,16554.986275,51,49887.0,...,8427,1.12,1.54,1874.0,1913,0.107,585,0.634888,0.000000,1


In [11]:
STATUS_COL = dataset.columns.get_loc("status")

In [12]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]
display(X.head())

if USE_PCA:
    print(' > Applying PCA to X_train: ')
    X, pca = apply_PCA(X, debug=True)
    display(X.head())

Unnamed: 0_level_0,amount,payments,frequency,balance_mean,balance_max,balance_min,last_balance,credit_mean,credit_count,credit_max,...,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of commited crimes '95,no. of commited crimes '96,ratio entrepeneurs,loan_to_account_age_days,owner_age_on_loan,income_to_payments_ratio
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5314,96396,8033,2,12250.0,20100.0,1100.0,20100.0,5025.0,4,9900.0,...,0.818,9650,3.38,3.67,2985.0,2804,0.1,105,0.685425,0.0
5316,165960,4610,1,52083.859459,120512.8,700.0,52208.9,13523.158824,17,36574.0,...,0.735,8369,1.79,2.31,2854.0,2618,0.117,148,0.241205,-0.075325
6863,127080,2118,1,30060.954167,49590.4,800.0,20272.8,5009.733333,15,19065.0,...,0.535,8390,2.28,2.89,2080.0,2122,0.132,170,0.922457,-0.312244
5325,105804,2939,1,41297.48,65898.5,1000.0,34307.3,9254.6,13,26448.0,...,0.748,10045,1.42,1.71,6604.0,6295,0.135,185,0.840626,-0.19831
7240,274740,4579,2,57188.211111,122893.1,600.0,41112.9,21255.930769,13,63366.0,...,0.505,8288,3.79,4.52,1562.0,1460,0.11,204,0.030136,-0.013758


 > Applying PCA to X_train: 
 > Impact in total variance of each generated feature by PCA:
[0.19657769 0.18956141 0.1388034  0.06691183 0.05245948 0.04620739
 0.03593217 0.03490904 0.03081136 0.02985809 0.02782034 0.0258653
 0.02517927 0.02477961 0.02090243 0.01929853 0.01772426 0.01639841]


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5314,-0.255374,-5.307916,-1.470216,-1.360021,3.874831,-4.562467,-3.936152,1.893634,1.400044,10.295629,0.404917,1.841465,3.202108,-0.398909,-1.35375,5.491109,3.577173,-0.362315
5316,-1.088397,-1.314941,1.671724,-0.37162,-0.555723,0.490852,-1.571278,-0.681449,-0.78577,-0.876638,0.083808,-0.242377,0.499341,1.085994,-0.13481,-0.3188,0.424528,-1.139947
6863,0.248623,-3.761679,-1.950554,-1.230319,0.367075,0.650885,0.349494,0.734722,-1.887064,-0.472904,-0.016523,0.261052,-1.095974,0.247346,0.329205,0.000114,1.225144,-0.260416
5325,0.102881,-3.153227,-0.565187,-2.079312,0.48495,0.640893,-0.771826,-0.183861,-0.442694,1.730228,-0.317827,-0.264603,-0.925417,-0.336266,0.539432,-0.089363,0.393403,-0.353172
7240,-2.071361,-1.451922,3.636497,-0.700189,0.701154,-0.648482,0.079307,1.297703,-0.624357,-1.074074,-0.204461,-0.178714,2.350342,-1.061946,-0.267894,-0.538075,-0.215593,-0.941514


# Hyper Parameter Tunning

## Random Search 1st to apporach the best solution, GridSearch to refine it

In [13]:
def getLogisticRegressionBest(X, y, debug=True):
    '''Get the Logistic Regression Hyper Parameters'''

    # Maximum number of levels in tree
    max_depth = [int(x) for x in range(2, 20, 4)]
    max_depth.append(None)

    # Create the random grid
    grid = {'penalty': ['l2', 'none'],
            'C': [0.01, 0.05, 0.1, 0.2, 0.5, 1],
            'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
            'class_weight': ["balanced", None]}
    
    if debug:
        pp.pprint(grid)
    
    # Using the grid search for best hyperparameters
    lr = create_LR()
    lr_grid = GridSearchCV(estimator = lr,
                           param_grid = grid,
                           scoring=metrics.make_scorer(auc_scorer,
                                                       greater_is_better=True),
                           cv=K_FOLD_NUM_SPLITS,
                           verbose=2,
                           n_jobs = -1)

    # Fit the grid search model
    lr_grid = lr_grid.fit(X, y)
    
    if debug:
        print('Best Score: ', lr_grid.best_score_)
        print('Best Params: ', lr_grid.best_params_)
        
    # Return score, method & params tuple
    return (lr_grid.best_score_, 'Logistic Regression', lr_grid.best_params_)

In [14]:
def getDecisionTreeBest(X, y, debug=True):
    '''Get the Decision Tree Hyper Parameters'''

    # Maximum number of levels in tree
    max_depth = [int(x) for x in range(2, 20, 4)]
    max_depth.append(None)

    # Create the random grid
    grid = {'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_features': ['auto', 'sqrt'],
            'max_depth': max_depth,
            'min_samples_split':  [2, 5, 10],
            'min_samples_leaf':  [1, 2, 4],
            'class_weight': ["balanced", None]}
    
    if debug:
        pp.pprint(grid)
    
    # Using the grid search for best hyperparameters
    dt = create_DT()
    dt_grid = GridSearchCV(estimator = dt,
                           param_grid = grid,
                           scoring=metrics.make_scorer(auc_scorer,
                                                       greater_is_better=True),
                           cv=K_FOLD_NUM_SPLITS,
                           verbose=2,
                           n_jobs = -1)

    # Fit the grid search model
    dt_grid = dt_grid.fit(X, y)
    
    if debug:
        print('Best Score: ', dt_grid.best_score_)
        print('Best Params: ', dt_grid.best_params_)
        
    # Return score, method & params tuple
    return (dt_grid.best_score_, 'Decision Tree', dt_grid.best_params_)

In [15]:
def getRandomForestBest(X, y, debug=True):
    '''Get the Random Forest Hyper Parameters'''

    # Maximum number of levels in tree
    max_depth = [int(x) for x in range(2, 16, 4)]
    max_depth.append(None)

    # Create the random grid
    grid = {'n_estimators': [int(x) for x in range(2, 14, 2)],
            'max_features': ['auto', 'sqrt'],
            'max_depth': max_depth,
            'criterion': ['gini', 'entropy'],
            'min_samples_split':  [2, 5, 10],
            'min_samples_leaf':  [1, 2, 4],
            'bootstrap': [True, False],
            'class_weight': ["balanced", "balanced_subsample", None]}
    
    if debug:
        pp.pprint(grid)
    
    # Using the grid search for best hyperparameters
    rf = create_RF()
    rf_grid = GridSearchCV(estimator = rf,
                           param_grid = grid,
                           scoring=metrics.make_scorer(auc_scorer,
                                                       greater_is_better=True),
                           cv=K_FOLD_NUM_SPLITS,
                           verbose=2,
                           n_jobs = -1)

    # Fit the grid search model
    rf_grid = rf_grid.fit(X, y)
    
    if debug:
        print('Best Score: ', rf_grid.best_score_)
        print('Best Params: ', rf_grid.best_params_)
        
    # Return score, method & params tuple
    return (rf_grid.best_score_, 'Random Forest', rf_grid.best_params_)

In [16]:
def getGradientBoostBest(X, y, debug=True):
    '''Get the Gradient Boost Hyper Parameters'''

    # Create the grid parameters
    grid = {'n_estimators': [int(x) for x in range(2, 14, 2)],
            'learning_rate': [0.1, 0.3, 0.5, 0.7],
            'loss': ['deviance', 'exponential'],
            'criterion': ['friedman_mse', 'mse', 'mae'],
            'min_samples_split':  [2, 5, 10],
            'min_samples_leaf':  [1, 2, 4],
            'random_state': [SEED]}
    
    if debug:
        pp.pprint(grid)
    
    # Using the grid search for best hyperparameters
    gb = create_GB()
    gb_grid = GridSearchCV(estimator = gb,
                           param_grid = grid,
                           scoring=metrics.make_scorer(auc_scorer,
                                                       greater_is_better=True),
                           cv=K_FOLD_NUM_SPLITS,
                           verbose=2,
                           n_jobs = -1)

    # Fit the grid search model
    gb_grid = gb_grid.fit(X, y)
    
    if debug:
        print('Best Score: ', gb_grid.best_score_)
        print('Best Params: ', gb_grid.best_params_)
        
    # Return score, method & params tuple
    return (gb_grid.best_score_, 'Gradient Boosting', gb_grid.best_params_)

In [17]:
# Getting the best algorithm
algorithms = []
#algorithms = [getLogisticRegressionBest(X, y),
#              getDecisionTreeBest(X, y),
#              getRandomForestBest(X, y),
#              getGradientBoostBest(X, y)]
algorithms.sort(reverse=True, key=lambda el: el[0])

for index, entry in enumerate(algorithms):
    print('%i. %s - %f\n---------' % (index + 1, entry[1], entry[0]))
    
print('Best algorithm: %s' % algorithms[0][1])

IndexError: list index out of range

## Using method with higher score with our data

In [18]:
# Cross validation settings
auc_scores = []
confusion_matrixes = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
# classifier = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2, min_samples_split=10, splitter='best')
# classifier = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.7, loss='deviance',
#                                         min_samples_leaf=2, min_samples_split=2, n_estimators=8, random_state=SEED)
classifier = RandomForestClassifier(bootstrap=False, class_weight='balanced_subsample', criterion='entropy',
                                    max_depth=2, max_features='auto', min_samples_leaf=1,
                                    min_samples_split=10, n_estimators=10)


# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc_scores.append(metrics.auc(fpr, tpr))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, y_pred).ravel())



In [19]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('AUC scores:', auc_scores)
print('> Average: ', sum(auc_scores)/len(auc_scores))
for cf in confusion_matrixes:
    display_confusion_matrix(cf)

Classification Method used: RandomForestClassifier(bootstrap=False, class_weight='balanced_subsample',
                       criterion='entropy', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=10, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False) 

AUC scores: [0.5233918128654971, 0.6342592592592592, 0.7349137931034482, 0.6839622641509434, 0.6916666666666667]
> Average:  0.6536387592091628
          	Predic NO	Predic YES
Actual NO 	2        	7         
Actual YES	10       	47        
          	Predic NO	Predic YES
Actual NO 	7        	5         
Actual YES	17       	37        
          	Predic NO	Predic YES
Actual NO 	5        	3         
Actual YES	9        	49        
          	Predic NO	Predic YE

### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [20]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
test_dataset = test_dataset.set_index('loan_id')
display(test_dataset.head())

Unnamed: 0_level_0,amount,payments,frequency,balance_mean,balance_max,balance_min,last_balance,credit_mean,credit_count,credit_max,...,average salary,unemploymant rate '95,unemploymant rate '96,no. of commited crimes '95,no. of commited crimes '96,ratio entrepeneurs,loan_to_account_age_days,owner_age_on_loan,income_to_payments_ratio,status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5895,93960,1566,1,54520.202247,88246.7,800.0,49548.5,12769.22,30,36637.5,...,8390,2.28,2.89,2080.0,2122,0.132,452,0.389496,-0.294168,
7122,260640,7240,1,31518.182051,88731.8,-718.6,11565.4,9526.772414,29,41871.0,...,8620,1.1,1.25,1089.0,1117,0.1,490,0.688919,-0.049781,
6173,232560,4845,0,40175.6125,79286.6,200.0,45754.0,15302.486047,43,43605.0,...,10045,1.42,1.71,6604.0,6295,0.135,630,0.907412,-0.046302,
6142,221880,3698,1,44440.912676,74216.8,1000.0,38913.4,11797.642857,21,33280.0,...,8899,3.39,3.97,2987.0,2487,0.149,311,0.809056,-0.007279,
5358,38520,3210,1,20231.313158,31302.0,900.0,18914.3,2736.026923,52,8800.0,...,8388,2.41,2.94,1658.0,1668,0.087,597,0.894527,-0.080867,


In [21]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset.head())

Unnamed: 0_level_0,amount,payments,frequency,balance_mean,balance_max,balance_min,last_balance,credit_mean,credit_count,credit_max,...,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of commited crimes '95,no. of commited crimes '96,ratio entrepeneurs,loan_to_account_age_days,owner_age_on_loan,income_to_payments_ratio
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5895,93960,1566,1,54520.202247,88246.7,800.0,49548.5,12769.22,30,36637.5,...,0.535,8390,2.28,2.89,2080.0,2122,0.132,452,0.389496,-0.294168
7122,260640,7240,1,31518.182051,88731.8,-718.6,11565.4,9526.772414,29,41871.0,...,0.524,8620,1.1,1.25,1089.0,1117,0.1,490,0.688919,-0.049781
6173,232560,4845,0,40175.6125,79286.6,200.0,45754.0,15302.486047,43,43605.0,...,0.748,10045,1.42,1.71,6604.0,6295,0.135,630,0.907412,-0.046302
6142,221880,3698,1,44440.912676,74216.8,1000.0,38913.4,11797.642857,21,33280.0,...,0.553,8899,3.39,3.97,2987.0,2487,0.149,311,0.809056,-0.007279
5358,38520,3210,1,20231.313158,31302.0,900.0,18914.3,2736.026923,52,8800.0,...,0.591,8388,2.41,2.94,1658.0,1668,0.087,597,0.894527,-0.080867


In [23]:
final_df = pd.DataFrame()

if USE_PCA:
    # Using train PCA and classifying
    scaler=StandardScaler()
    X_test_scaled=scaler.fit_transform(test_dataset)
    predictions_df = pd.DataFrame(data = pca.transform(X_test_scaled),
                                  index=test_dataset.reset_index()['loan_id'])
    display(predictions_df)

    predictions_df['Predicted'] = classifier.predict(predictions_df)
    final_df = predictions_df.reset_index()\
                    [['loan_id', 'Predicted']]\
                    .rename(columns={
                        'loan_id': 'Id'
                    })
    
else:
    final_df = test_dataset.copy()
    final_df['Predicted'] = classifier.predict(final_df)
    final_df = final_df.reset_index()\
                        [['loan_id', 'Predicted']]\
                        .rename(columns={
                            'loan_id': 'Id'
                        })\
                        .drop_duplicates()

display(final_df)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5895,-0.581452,0.279392,-0.751406,-1.064600,0.088465,1.427686,-0.257290,1.387341,-0.584439,-0.466226,-0.374783,-0.275496,0.627989,1.281398,-0.473764,0.675034,0.724109,-0.099117
7122,-0.857298,-0.501248,-1.109873,-0.948235,-0.791059,-1.733933,0.482990,1.295374,-0.292512,0.363729,-0.561100,-0.338277,-1.142156,-0.350272,1.164492,-0.738465,-0.797314,-1.341220
6173,-0.704360,3.792950,-1.775806,-2.580514,1.785604,-0.321933,0.385107,-0.688817,0.918173,0.342514,1.305124,-1.436521,-3.054766,0.753602,0.706440,1.156564,-1.781860,-0.636923
6142,-0.204660,-1.118093,-0.265491,-1.363497,1.686646,-0.369144,0.757934,-0.572910,0.443331,-0.292389,-0.877404,1.030245,-0.824229,1.227295,1.472915,0.450348,-0.399156,0.621546
5358,-0.579508,-0.198183,-4.834610,0.466264,-1.011945,-0.264960,0.807550,-0.077285,-1.893430,0.082453,0.401773,0.116037,-0.867249,0.915894,-0.517934,0.309674,0.625952,0.754135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4989,-0.483243,-1.201923,-0.068177,-1.227683,-1.477796,-2.022069,0.754334,0.369835,-0.366142,0.695046,-0.310241,-0.169392,-1.132877,-0.154425,0.457877,-1.128434,0.513320,-0.480039
5221,-2.873454,0.572316,0.337359,-2.355822,-2.385614,1.109646,-1.125092,-0.404571,-1.513068,-0.148395,0.432946,-0.163092,-0.223138,-0.709620,-0.628291,-0.298659,-1.651526,-0.180526
6402,-0.846005,1.712691,1.022582,-2.051936,1.563192,-0.828152,1.485226,0.409008,-1.353870,-0.557706,-0.100744,-0.917546,1.998962,-1.018000,-0.701578,0.555132,0.873210,0.769035
5346,-0.562941,0.544666,-1.965113,3.531954,2.721630,-0.576336,0.093464,-2.467903,-0.856864,-0.227456,1.068265,-0.811477,-1.296557,-1.605611,-1.709726,0.400837,0.619159,1.740849


Unnamed: 0,Id,Predicted
0,5895,1
1,7122,-1
2,6173,1
3,6142,1
4,5358,-1
...,...,...
349,4989,-1
350,5221,1
351,6402,1
352,5346,1


In [None]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(final_df, 'predictions', 'prediction.csv')