In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import os
from imblearn.over_sampling import SMOTE 
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp
import matplotlib.pyplot as plt
%matplotlib inline 


Using TensorFlow backend.


<div class="alert alert-block alert-info">
<b>Loading the data: </b> We will load the data for analysis here.
</div>

In [2]:
path_of_input_file = 'D:\\kaggle_trials\\predicting-a-pulsar-star\\pulsar_stars.csv'
df                 = pd.read_csv(path_of_input_file)
df.head(4)

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0


In [3]:
print('The number of positive examples are :- ',len(df[df['target_class']==1]))
print('The number of negative examples are :- ',len(df[df['target_class']==0]))

The number of positive examples are :-  1639
The number of negative examples are :-  16259


<div class="alert alert-block alert-info">
<b>Resampling to remove model imbalance:</b> We can clearly see from the above block that the number of negative samples are way higher than the number of positive samples. So we will create synthetic data to balance the dataset
</div>

In [4]:
X            = df[df.columns[:-1]].values
Y            = df['target_class'].values
sm           = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X, Y)

<div class="alert alert-block alert-info">
<b>Data After Resampling :</b> We can clearly see the data after and before resampling. Now we can say that the data is balanced and we will use this data for further analysis
</div>

In [5]:
print('Positive examples before Oversampling is ', sum(Y == 1))
print('Negative examples before Oversampling is ', sum(Y == 0))
print('\n')
print('Positive examples after Oversampling is ', sum(Y_res == 1))
print('Negative examples after Oversampling is ', sum(Y_res == 0))
print('\n')

Positive examples before Oversampling is  1639
Negative examples before Oversampling is  16259


Positive examples after Oversampling is  16259
Negative examples after Oversampling is  16259




<div class="alert alert-block alert-info">
<b>Train Test Split:</b> We will perform train test split of the data from the last step
</div>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)

<div class="alert alert-block alert-info">
<b>Parameter Grid:</b> We will create a grid for the parameters to be used for parameter tuning. There will be some discrete choices and there will be some continuous choices and hyperopt will iterate through each of the choices and find the optimal solution for us
</div>

In [7]:

parameter_grid_gbm = {
    'n_estimators' : hp.choice('n_estimators',range(100,150)),
    'learning_rate': hp.uniform('learning_rate',0.01,0.99),
    'loss'         : hp.choice('loss',['deviance','exponential']),
    'subsample'    : hp.uniform('subsample',0.05,1.0),
    'min_samples_split' : hp.uniform('min_samples_split',0.02,1.0),
    'min_samples_leaf'  : hp.choice('min_samples_leaf',range(1,10)),
    'max_depth'         : hp.choice('max_depth',range(3,10)),
    'max_features'      : hp.choice('max_features',['auto','sqrt','log2']),
    'max_leaf_nodes'    : hp.choice('max_leaf_nodes',range(2,10))
}

def hyperopt_train_test(params):
    clf = GradientBoostingClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best_parameters = fmin(function_to_minimise, parameter_grid_gbm, algo=tpe.suggest, max_evals=200, trials=trials)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████| 200/200 [05:58<00:00,  1.60s/it, best loss: -0.9574975922336885]
The best parameter tuned on training set is given by :-  {'learning_rate': 0.6614693331752434, 'loss': 1, 'max_depth': 6, 'max_features': 2, 'max_leaf_nodes': 6, 'min_samples_leaf': 7, 'min_samples_split': 0.049360178427083734, 'n_estimators': 29, 'subsample': 0.9997382162423892}


<div class="alert alert-block alert-info">
<b>Instantiating a GBM model:</b> We will instantiate a GBM classifier based on the parameters attained above and we will fit it on the training data. 
</div>

In [8]:
gbmclf = GradientBoostingClassifier(learning_rate     = best_parameters['learning_rate'],
                                    loss              = 'deviance',
                                    max_depth         = best_parameters['max_depth'],
                                    max_features      = best_parameters['max_features'],
                                    max_leaf_nodes    = best_parameters['max_leaf_nodes'],
                                    min_samples_leaf  = best_parameters['min_samples_leaf'],
                                    min_samples_split = best_parameters['min_samples_split'],
                                    n_estimators      = best_parameters['n_estimators'],
                                    subsample         = best_parameters['subsample'])
gbmclf.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.6614693331752434, loss='deviance',
                           max_depth=6, max_features=2, max_leaf_nodes=6,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=7,
                           min_samples_split=0.049360178427083734,
                           min_weight_fraction_leaf=0.0, n_estimators=29,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.9997382162423892,
                           tol=0.0001, validation_fraction=0.1, verbose=0,
                           warm_start=False)

<div class="alert alert-block alert-info">
<b>Predicting results:</b> GBM classifier trained above will predict the results and we will then analyse the classification report based on the test data
</div>

In [9]:
y_hat = gbmclf.predict(X_test)
print(classification_report(y_hat,y_test))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      5539
           1       0.93      0.96      0.95      5192

    accuracy                           0.95     10731
   macro avg       0.95      0.95      0.95     10731
weighted avg       0.95      0.95      0.95     10731

