In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
import pprint

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Useful Functions

In [2]:
def pretty_matrix(matrix, row_label, col_label):
    """Pretty print of the given matrix """

    # Restraining labels that are too big
    row_label = [el[:10] + '..' if len(el) > 10 else el
                for el in row_label]
    col_label = [el[:10] + '..' if len(el) > 10 else el
                for el in col_label]

    # Stringfying everything & Joining top label
    s_matrix = [list([" "] + (col_label))] + \
               [[row_label[row_idx]] + \
                [str(e) for e in row] for row_idx, row in enumerate(matrix)]

    # Length of each matrix column
    len_s = [max(map(len, col)) for col in zip(*s_matrix)]

    # Cell formatation
    formatation = '\t'.join('{{:{}}}'.format(x) for x in len_s)

    # Apply cell formation to each matrix element
    pretty_mat = [formatation.format(*row) for row in s_matrix]

    # Print Pretty Matrix
    print('\n'.join(pretty_mat))


def display_confusion_matrix(values):
    '''Display the given array as a confusion matrix'''
    pretty_matrix([values[0:2], values[2:4]],
                  ['Actual NO', 'Actual YES'],
                  ['Predic NO', 'Predic YES'])

In [3]:
def apply_PCA(df, variance_val=0.9, debug=True):
    '''Apply the PCA algorithm to given dataframe,
    using the given variance val to trim the df'''
    # Necessary to normalize all data to use PCA
    scaler=StandardScaler()
    X_scaled=scaler.fit_transform(df)

    # PCA - keep, by default mode, 90% variance
    pca = PCA(variance_val)    
    pca.fit(X_scaled)
    X_pca = pca.transform(X_scaled)

    if debug:
        ex_variance=np.var(X_pca,axis=0)
        ex_variance_ratio = ex_variance/np.sum(ex_variance)
        print(' > Impact in total variance of each generated feature by PCA:')
        print(ex_variance_ratio)

    principal_df = pd.DataFrame(data = X_pca)
    
    return (principal_df, pca)

In [4]:
def auc_scorer(y_true, y_pred):
    '''Scorer of Area Under Curve value'''
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    return metrics.auc(fpr, tpr)

# Prediction Algorithms

* Decision Tree
* Random Forest
* Gradient Boosting

In [5]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

In [6]:
def create_RF():
    '''Create a new Ranfom Forest model'''
    return RandomForestClassifier()

In [7]:
def create_GB():
    '''Create a new Gradient Boosting model'''
    return GradientBoostingClassifier()

# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [8]:
# Useful Macros
K_FOLD_NUM_SPLITS = 5
SEED = 42

# Pretty printer
pp = pprint.PrettyPrinter(indent=4)

In [9]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
display(dataset.head())

Unnamed: 0,date,amount,payments,frequency,account_creation_date,balance_mean,balance_max,balance_min,last_ballance,credit_mean,...,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,loan_to_account_age_days,owner_age_on_loan,salary_over_payments,status
0,0.0,96396,8033,2,0.054011,12250.0,20100.0,1100.0,20100.0,5025.0,...,9650,3.38,3.67,100,2985,2804,105,0.685425,1617,-1
1,0.004721,165960,4610,1,0.024623,52083.859459,120512.8,700.0,52208.9,13523.158824,...,8369,1.79,2.31,117,2854,2618,148,0.241205,3759,1
2,0.018096,127080,2118,1,0.020651,30060.954167,49590.4,800.0,20272.8,5009.733333,...,8390,2.28,2.89,132,2080,2122,170,0.922457,6272,1
3,0.022817,105804,2939,1,0.013503,41297.48,65898.5,1000.0,34307.3,9254.6,...,10045,1.42,1.71,135,6604,6295,185,0.840626,7106,1
4,0.049567,274740,4579,2,0.025417,57188.211111,122893.1,600.0,41112.9,21255.930769,...,8288,3.79,4.52,110,1562,1460,204,0.030136,3709,1


In [10]:
STATUS_COL = dataset.columns.get_loc("status")

In [11]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

display(X.head())

print(' > Applying PCA to X_train')
X, pca = apply_PCA(X, debug=True)
display(X)

Unnamed: 0,date,amount,payments,frequency,account_creation_date,balance_mean,balance_max,balance_min,last_ballance,credit_mean,...,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,loan_to_account_age_days,owner_age_on_loan,salary_over_payments
0,0.0,96396,8033,2,0.054011,12250.0,20100.0,1100.0,20100.0,5025.0,...,81.8,9650,3.38,3.67,100,2985,2804,105,0.685425,1617
1,0.004721,165960,4610,1,0.024623,52083.859459,120512.8,700.0,52208.9,13523.158824,...,73.5,8369,1.79,2.31,117,2854,2618,148,0.241205,3759
2,0.018096,127080,2118,1,0.020651,30060.954167,49590.4,800.0,20272.8,5009.733333,...,53.5,8390,2.28,2.89,132,2080,2122,170,0.922457,6272
3,0.022817,105804,2939,1,0.013503,41297.48,65898.5,1000.0,34307.3,9254.6,...,74.8,10045,1.42,1.71,135,6604,6295,185,0.840626,7106
4,0.049567,274740,4579,2,0.025417,57188.211111,122893.1,600.0,41112.9,21255.930769,...,50.5,8288,3.79,4.52,110,1562,1460,204,0.030136,3709


 > Applying PCA to X_train
 > Impact in total variance of each generated feature by PCA:
[0.19342177 0.1830362  0.12733503 0.06387891 0.05037522 0.04733853
 0.04686938 0.04428451 0.03449902 0.02973082 0.02809869 0.02685823
 0.02465386 0.02299667 0.02108592 0.01898376 0.01836272 0.01819075]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.240185,-3.771527,-1.773865,-1.891253,-1.252400,-1.575109,6.077470,-0.627003,0.545991,1.892537,10.086697,5.432849,0.366265,3.409484,1.598483,3.322492,-3.183012,-3.890361
1,-0.751771,-1.510474,1.387688,-0.269040,-1.865807,0.509314,1.015191,1.390014,-1.881097,-0.447422,-0.873445,-0.223236,0.530360,-0.120879,0.889329,0.338537,0.414743,-0.292096
2,1.579300,-3.420111,-2.065320,-1.158395,-1.494502,1.058192,1.592037,0.860966,0.438465,-0.414512,-1.319014,0.710939,1.219522,-1.181741,0.195082,-0.355079,-0.235202,-0.345520
3,1.374005,-2.755383,-0.864082,-1.689615,-1.518413,1.843423,1.500311,1.068009,-0.862000,-0.112375,1.013769,1.157818,-0.067574,-0.740547,-0.401850,-1.005080,-0.466557,0.161495
4,-1.893180,-2.178124,3.302084,-0.678413,-1.315306,0.420629,2.266290,1.085289,0.850802,0.610084,-1.385714,-0.157952,0.345069,2.489865,-0.283043,0.702394,0.827234,0.063421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,-2.501911,3.355366,-2.757040,1.044756,2.739234,-1.531222,-2.340566,0.494179,-0.899127,-0.864989,0.807452,-1.021308,-0.598079,0.433075,0.367628,0.108470,0.155455,-0.414724
324,-1.470996,-0.769699,1.669361,0.350854,-0.514539,0.163920,-1.606765,-1.495511,0.946668,1.049886,-0.035341,2.119778,0.566676,-0.848362,0.180077,0.611680,-0.394699,1.051154
325,0.792744,-2.909575,-0.503504,4.612323,1.855751,0.888526,-0.329061,-1.732396,-1.777919,-0.562937,1.022796,0.515935,-0.859788,-0.227109,-0.085020,-0.256632,0.712555,0.039289
326,-2.666283,2.520416,0.197626,-0.093209,-1.179004,0.872086,-2.601828,-1.004807,0.033777,0.087952,-0.328741,0.990043,0.224458,-0.226298,0.001490,0.139505,-1.270193,1.152664


# Hyper Parameter Tunning

## Random Search 1st to apporach the best solution, GridSearch to refine it

In [12]:
def getDecisionTreeBest(X, y, debug=True):
    '''Get the Decision Tree Hyper Parameters'''

    # Maximum number of levels in tree
    max_depth = [int(x) for x in range(2, 20, 4)]
    max_depth.append(None)

    # Create the random grid
    grid = {'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_features': ['auto', 'sqrt'],
            'max_depth': max_depth,
            'min_samples_split':  [2, 5, 10],
            'min_samples_leaf':  [1, 2, 4]}
    
    if debug:
        pp.pprint(grid)
    
    # Using the grid search for best hyperparameters
    dt = create_DT()
    dt_grid = GridSearchCV(estimator = dt,
                           param_grid = grid,
                           scoring=metrics.make_scorer(auc_scorer,
                                                       greater_is_better=True),
                           cv=K_FOLD_NUM_SPLITS,
                           verbose=2,
                           n_jobs = -1)

    # Fit the grid search model
    dt_grid = dt_grid.fit(X, y)
    
    if debug:
        print('Best Score: ', dt_grid.best_score_)
        print('Best Params: ', dt_grid.best_params_)
        
    # Return score, method & params tuple
    return (dt_grid.best_score_, 'Decision Tree', dt_grid.best_params_)

In [13]:
def getRandomForestBest(X, y, debug=True):
    '''Get the Random Forest Hyper Parameters'''

    # Maximum number of levels in tree
    max_depth = [int(x) for x in range(2, 20, 4)]
    max_depth.append(None)

    # Create the random grid
    grid = {'n_estimators': [int(x) for x in range(2, 20, 2)],
            'max_features': ['auto', 'sqrt'],
            'max_depth': max_depth,
            'criterion': ['gini', 'entropy'],
            'min_samples_split':  [2, 5, 10],
            'min_samples_leaf':  [1, 2, 4],
            'bootstrap': [True, False]}
    
    if debug:
        pp.pprint(grid)
    
    # Using the grid search for best hyperparameters
    rf = create_RF()
    rf_grid = GridSearchCV(estimator = rf,
                           param_grid = grid,
                           scoring=metrics.make_scorer(auc_scorer,
                                                       greater_is_better=True),
                           cv=K_FOLD_NUM_SPLITS,
                           verbose=2,
                           n_jobs = -1)

    # Fit the grid search model
    rf_grid = rf_grid.fit(X, y)
    
    if debug:
        print('Best Score: ', rf_grid.best_score_)
        print('Best Params: ', rf_grid.best_params_)
        
    # Return score, method & params tuple
    return (rf_grid.best_score_, 'Random Forest', rf_grid.best_params_)

In [14]:
def getGradientBoostBest(X, y, debug=True):
    '''Get the Gradient Boost Hyper Parameters'''

    # Create the grid parameters
    grid = {'n_estimators': [int(x) for x in range(2, 20, 2)],
            'learning_rate': [0.1, 0.3, 0.5, 0.7],
            'loss': ['deviance', 'exponential'],
            'criterion': ['friedman_mse', 'mse', 'mae'],
            'min_samples_split':  [2, 5, 10],
            'min_samples_leaf':  [1, 2, 4],
            'random_state': [SEED]}
    
    if debug:
        pp.pprint(grid)
    
    # Using the grid search for best hyperparameters
    gb = create_GB()
    gb_grid = GridSearchCV(estimator = gb,
                           param_grid = grid,
                           scoring=metrics.make_scorer(auc_scorer,
                                                       greater_is_better=True),
                           cv=K_FOLD_NUM_SPLITS,
                           verbose=2,
                           n_jobs = -1)

    # Fit the grid search model
    gb_grid = gb_grid.fit(X, y)
    
    if debug:
        print('Best Score: ', gb_grid.best_score_)
        print('Best Params: ', gb_grid.best_params_)
        
    # Return score, method & params tuple
    return (gb_grid.best_score_, 'Gradient Boosting', gb_grid.best_params_)

In [15]:
# Getting the best algorithm
algorithms = [getDecisionTreeBest(X, y),
              getRandomForestBest(X, y),
              getGradientBoostBest(X, y)]\
             .sort(reverse=True, key=lambda el: el[0])

for entry, index in enumerate(algorithms):
    print('%i. %s - %f' % (index, entry[1], entry[0]))
    
print('Best algorithm: %s\n\tConfig:\n' % algorithms[0][1])
pprint(algorithms[0][2])

{   'criterion': ['gini', 'entropy'],
    'max_depth': [2, 6, 10, 14, 18, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'splitter': ['best', 'random']}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:    6.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Score:  0.6834221924221122
Best Params:  {'criterion': 'entropy', 'max_depth': 14, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
{   'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 6, 10, 14, 18, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [2, 4, 6, 8, 10, 12, 14, 16, 18]}
Fitting 5 folds for each of 3888 candidates, totalling 19440 fits


[Parallel(n_jobs=-1)]: Done 358 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 2052 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 4894 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done 8856 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 13966 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 19440 out of 19440 | elapsed:  3.6min finished
  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Score:  0.7170062478350349
Best Params:  {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 2}
{   'criterion': ['friedman_mse', 'mse', 'mae'],
    'learning_rate': [0.1, 0.3, 0.5, 0.7],
    'loss': ['deviance', 'exponential'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [2, 4, 6, 8, 10, 12, 14, 16, 18],
    'random_state': [42]}
Fitting 5 folds for each of 1944 candidates, totalling 9720 fits


[Parallel(n_jobs=-1)]: Done 358 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 2052 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 4894 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done 7142 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 8237 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 9572 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 9713 out of 9720 | elapsed:  3.3min remaining:    0.1s


Best Score:  0.7119490481793915
Best Params:  {'criterion': 'friedman_mse', 'learning_rate': 0.7, 'loss': 'deviance', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 8, 'random_state': 42}


[Parallel(n_jobs=-1)]: Done 9720 out of 9720 | elapsed:  3.3min finished
  y = column_or_1d(y, warn=True)


TypeError: 'NoneType' object is not iterable

## Using method with higher score with our data

In [16]:
# Cross validation settings
auc_scores = []
confusion_matrixes = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
# classifier = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2, min_samples_split=10, splitter='best')
classifier = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.7, loss='deviance',
                                        min_samples_leaf=2, min_samples_split=2, n_estimators=8, random_state=SEED)
# classifier = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10, n_estimators=10)

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc_scores.append(metrics.auc(fpr, tpr))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, y_pred).ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [17]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('AUC scores:', auc_scores)
print('> Average: ', sum(auc_scores)/len(auc_scores))
for cf in confusion_matrixes:
    display_confusion_matrix(cf)

Classification Method used: GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.7, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=2, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=8,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False) 

AUC scores: [0.7719298245614037, 0.6574074074074074, 0.6530172413793103, 0.6477987421383649, 0.5583333333333333]
> Average:  0.6576973097639639
          	Predic NO	Predic YES
Actual NO 	6        	3         
Actual YES	7        	50        
          	Predic NO	Predic YES
Actual NO 	6        	6    

### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [18]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
ids = utils.read_csv_to_df('dataset/ids.csv')
display(test_dataset.head())

Unnamed: 0,date,amount,payments,frequency,account_creation_date,balance_mean,balance_max,balance_min,last_ballance,credit_mean,...,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,loan_to_account_age_days,owner_age_on_loan,salary_over_payments,status
0,0.0,93960,1566,1,0.176892,54520.202247,88246.7,800.0,49548.5,12769.22,...,8390,2.28,2.89,132,2080,2122,452,0.389496,6824,
1,0.00142,260640,7240,1,0.139059,31518.182051,88731.8,-718.6,11565.4,9526.772414,...,8620,1.1,1.25,100,1089,1117,490,0.688919,1380,
2,0.007102,232560,4845,0,0.0,40175.6125,79286.6,200.0,45754.0,15302.486047,...,10045,1.42,1.71,135,6604,6295,630,0.907412,5200,
3,0.025568,221880,3698,1,0.339468,44440.912676,74216.8,1000.0,38913.4,11797.642857,...,8899,3.39,3.97,149,2987,2487,311,0.809056,5201,
4,0.025568,38520,3210,1,0.047035,20231.313158,31302.0,900.0,18914.3,2736.026923,...,8388,2.41,2.94,87,1658,1668,597,0.894527,5178,


In [21]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset.head())

Unnamed: 0,date,amount,payments,frequency,account_creation_date,balance_mean,balance_max,balance_min,last_ballance,credit_mean,...,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,loan_to_account_age_days,owner_age_on_loan,salary_over_payments
0,0.0,93960,1566,1,0.176892,54520.202247,88246.7,800.0,49548.5,12769.22,...,53.5,8390,2.28,2.89,132,2080,2122,452,0.389496,6824
1,0.00142,260640,7240,1,0.139059,31518.182051,88731.8,-718.6,11565.4,9526.772414,...,52.4,8620,1.1,1.25,100,1089,1117,490,0.688919,1380
2,0.007102,232560,4845,0,0.0,40175.6125,79286.6,200.0,45754.0,15302.486047,...,74.8,10045,1.42,1.71,135,6604,6295,630,0.907412,5200
3,0.025568,221880,3698,1,0.339468,44440.912676,74216.8,1000.0,38913.4,11797.642857,...,55.3,8899,3.39,3.97,149,2987,2487,311,0.809056,5201
4,0.025568,38520,3210,1,0.047035,20231.313158,31302.0,900.0,18914.3,2736.026923,...,59.1,8388,2.41,2.94,87,1658,1668,597,0.894527,5178


In [24]:
# Using train PCA and classifying
scaler=StandardScaler()
X_test_scaled=scaler.fit_transform(test_dataset)
predictions_df = pd.DataFrame(data = pca.transform(X_test_scaled))
display(predictions_df)

final_df = pd.DataFrame()
final_df['Predicted'] = classifier.predict(predictions_df)
#predictions_df = ids.merge(predictions_df, on=['date', 'amount'])
#predictions_df = predictions_df[['loan_id', 'Predicted']]\
#                    .rename(columns={
#                        'loan_id': 'Id'
#                    })\
#                    .drop_duplicates()

final_df['Id'] = ids['loan_id']
display(final_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,-0.400259,0.393198,-0.940661,-0.557390,-1.626275,1.822556,0.893250,1.580088,0.213027,0.653971,-1.013311,-0.138742,0.441519,0.053610,1.587035,0.319011,-0.675775,-0.657338
1,-0.840055,-0.595610,-1.201944,-1.584251,-2.144203,-1.474335,1.963409,0.816772,0.489063,0.516776,-0.451876,0.309542,-0.122066,-0.960099,-0.518567,-2.208499,-0.361596,0.610134
2,-1.734720,4.245279,-2.096533,-2.308841,-0.654328,1.913007,2.418901,0.225811,-0.521954,-0.315157,0.978848,-1.539298,-0.146770,-3.208327,-1.333808,-1.209223,-1.876406,-0.268425
3,0.001419,-0.820286,-0.547704,-2.015963,2.277085,0.278070,1.598689,1.870195,0.002749,-0.357297,-0.442512,0.290299,0.092695,-1.108431,0.521727,-0.963976,0.650617,-1.414178
4,-0.261242,0.170247,-5.136119,0.199009,-1.807096,-0.331306,0.950704,1.437842,0.219147,-1.335177,-1.011924,0.425299,1.403076,-1.178385,0.301381,-0.027350,-0.534866,-0.135246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,-0.597148,-1.792897,0.218480,-1.613599,-0.690449,-2.163015,-1.281888,-2.025021,1.376053,-0.238995,0.237176,0.837105,-1.001174,-0.996116,-0.321432,-0.132039,0.832027,0.827429
350,-2.674131,-0.331850,0.256700,-1.363780,-1.537665,1.284540,-3.370924,-1.273384,-1.585227,-0.188552,-0.705239,0.597541,0.866621,0.322795,-0.664425,-0.877753,-0.958117,1.026859
351,-1.271652,1.266239,1.195623,-1.591747,0.396800,1.057697,0.341772,-2.916454,1.062420,-0.637257,-1.225529,-0.377297,-0.211205,2.388355,0.203007,-0.168760,-1.200582,0.062000
352,-0.247462,0.393983,-2.081077,3.806811,2.425657,0.603385,0.895202,-2.536661,-0.934002,-1.550473,-0.013632,-0.002682,0.062253,-0.509634,-2.271701,1.782820,-0.578869,0.731463


Unnamed: 0,Predicted,Id
0,1,5895
1,-1,7122
2,1,6173
3,1,6142
4,1,5358
...,...,...
349,1,4989
350,1,5221
351,1,6402
352,-1,5346


In [25]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(final_df, 'predictions', 'prediction.csv')