In [1]:
import pandas  as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import confusion_matrix, classification_report 
  
# load the data set 
traindata = pd.read_csv('E:\\M. Fahad Habib\\8th Semester\\DWM\Lab\\kepler-labelled-time-series-data\\exoTrain.csv') 
trainarray = traindata.values
testdata = pd.read_csv('E:\\M. Fahad Habib\\8th Semester\\DWM\Lab\\kepler-labelled-time-series-data\\exoTest.csv') 
testarray = testdata.values
  
# print info about columns in the dataframe 
print("Train Data Information:- ")
print(traindata.info()) 
print(traindata.shape)
traindata['LABEL'].value_counts() 
print("Test Data Information:- ")
print(testdata.info()) 
print(testdata.shape)
testdata['LABEL'].value_counts()

Train Data Information:- 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5087 entries, 0 to 5086
Columns: 3198 entries, LABEL to FLUX.3197
dtypes: float64(3197), int64(1)
memory usage: 124.1 MB
None
(5087, 3198)
Test Data Information:- 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570 entries, 0 to 569
Columns: 3198 entries, LABEL to FLUX.3197
dtypes: float64(3197), int64(1)
memory usage: 13.9 MB
None
(570, 3198)


1    565
2      5
Name: LABEL, dtype: int64

In [2]:
# separate array into input and output components 
train_X = trainarray[:, 1:3198] 
print("Features Of Training Set:- ")
print(train_X)
print(train_X.shape)
print("Labels Of Training Set:- ")
train_Y = trainarray[:,0] 
print(train_Y)
print(train_Y.shape)
print("Features Of Testing Set:- ")
test_X = testarray[:,1:3198] 
print(test_X)
print(test_X.shape)
print("Labels Of Testing Set:- ")
test_Y = testarray[:,0] 
print(test_Y)
print(test_Y.shape)


#Normalizing
from sklearn import preprocessing
train_X = preprocessing.normalize(train_X)
test_X = preprocessing.normalize(test_X)


Features Of Training Set:- 
[[ 93.85  83.81  20.1  ...  61.42   5.08 -39.54]
 [-38.88 -33.83 -58.54 ...   6.46  16.    19.93]
 [532.64 535.92 513.73 ... -28.91 -70.02 -96.67]
 ...
 [273.39 278.   261.73 ...  88.42  79.07  79.43]
 [  3.82   2.09  -3.29 ... -14.55  -6.41  -2.55]
 [323.28 306.36 293.16 ... -16.72 -14.09  27.82]]
(5087, 3197)
Labels Of Training Set:- 
[2. 2. 2. ... 1. 1. 1.]
(5087,)
Features Of Testing Set:- 
[[ 1.19880e+02  1.00210e+02  8.64600e+01 ...  3.57800e+01  2.69430e+02
   5.77200e+01]
 [ 5.73659e+03  5.69998e+03  5.71716e+03 ... -2.36619e+03 -2.29486e+03
  -2.03472e+03]
 [ 8.44480e+02  8.17490e+02  7.70070e+02 ... -1.62680e+02 -3.67900e+01
   3.06300e+01]
 ...
 [-5.40100e+01 -4.41300e+01 -4.12300e+01 ...  5.47000e+00  1.44600e+01
   1.87000e+01]
 [ 9.13600e+01  8.56000e+01  4.88100e+01 ... -8.43000e+00 -6.48000e+00
   1.76000e+01]
 [ 3.07119e+03  2.78253e+03  2.60869e+03 ... -2.77220e+02 -6.96300e+01
   1.21560e+02]]
(570, 3197)
Labels Of Testing Set:- 
[2. 2. 2.

In [3]:
#1 => Applying LR
# logistic regression object 
lr = LogisticRegression() 
  
# train the model on train set 
lr.fit(train_X, train_Y.ravel()) 
  
predictions = lr.predict(test_X) 
  
# print classification report 
print(classification_report(test_Y, predictions)) 





              precision    recall  f1-score   support

         1.0       0.99      1.00      1.00       565
         2.0       0.00      0.00      0.00         5

    accuracy                           0.99       570
   macro avg       0.50      0.50      0.50       570
weighted avg       0.98      0.99      0.99       570



  'precision', 'predicted', average, warn_for)


In [4]:
#Due to Data Imbalance, our results are a bit one-sided so we apply the SMOTE to resolve this imbalance
print("Before OverSampling, counts of label '1': {}".format(sum(train_Y == 1))) 
print("Before OverSampling, counts of label '2': {} \n".format(sum(train_Y == 2))) 
  
# import SMOTE module from imblearn library 
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state = 2) 
X_train_res, y_train_res = sm.fit_sample(train_X, train_Y.ravel()) 
  
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label '2': {}".format(sum(y_train_res == 2))) 


Before OverSampling, counts of label '1': 5050
Before OverSampling, counts of label '2': 37 

After OverSampling, the shape of train_X: (10100, 3197)
After OverSampling, the shape of train_y: (10100,) 

After OverSampling, counts of label '1': 5050
After OverSampling, counts of label '2': 5050


In [5]:
#Applying LR after data balancing
# logistic regression object 
lr = LogisticRegression() 
  
# train the model on train set 
lr.fit(X_train_res, y_train_res.ravel()) 
  
predictions = lr.predict(test_X) 
  
# print classification report 
print(classification_report(test_Y, predictions)) 



              precision    recall  f1-score   support

         1.0       0.99      0.99      0.99       565
         2.0       0.12      0.20      0.15         5

    accuracy                           0.98       570
   macro avg       0.56      0.59      0.57       570
weighted avg       0.99      0.98      0.98       570



In [6]:
#Applying NearMiss Algo 
print("Before Undersampling, counts of label '1': {}".format(sum(train_Y == 1))) 
print("Before Undersampling, counts of label '2': {} \n".format(sum(train_Y == 2))) 
  
# apply near miss 
from imblearn.under_sampling import NearMiss 
nr = NearMiss() 
  
X_train_miss, y_train_miss = nr.fit_sample(train_X, train_Y.ravel()) 
  
print('After Undersampling, the shape of train_X: {}'.format(X_train_miss.shape)) 
print('After Undersampling, the shape of train_y: {} \n'.format(y_train_miss.shape)) 
  
print("After Undersampling, counts of label '1': {}".format(sum(y_train_miss == 1))) 
print("After Undersampling, counts of label '2': {}".format(sum(y_train_miss == 2))) 


Before Undersampling, counts of label '1': 5050
Before Undersampling, counts of label '2': 37 

After Undersampling, the shape of train_X: (74, 3197)
After Undersampling, the shape of train_y: (74,) 

After Undersampling, counts of label '1': 37
After Undersampling, counts of label '2': 37


In [7]:
#Applying LR after data balancing
# logistic regression object 
lr = LogisticRegression()  
clf = lr.fit(X_train_miss, y_train_miss.ravel()) 
  
predictions = clf.predict(test_X) 
  
# print confusion matrix and classification report 
print(confusion_matrix(test_Y, predictions))
print(classification_report(test_Y, predictions)) 



[[ 10 555]
 [  2   3]]
              precision    recall  f1-score   support

         1.0       0.83      0.02      0.03       565
         2.0       0.01      0.60      0.01         5

    accuracy                           0.02       570
   macro avg       0.42      0.31      0.02       570
weighted avg       0.83      0.02      0.03       570



# As the result using NearMiss Algo is better than that of SMOTE, we will use the data balanced using NearMiss for upcoming classifications. 

In [8]:
# sensitivity
cm = confusion_matrix(test_Y, predictions)
sensitivity = cm[0,0]/(cm[0,0]+cm[0,1])
print('Sensitivity : ', sensitivity )


Sensitivity :  0.017699115044247787


In [9]:
# ROC 

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# generate a no skill prediction (majority class)
ns_probs = [0 for _ in range(len(test_Y))]


# calculate scores
ns_auc = roc_auc_score(test_Y, ns_probs)
lr_auc = roc_auc_score(test_Y, predictions)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(test_Y, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(test_Y, predictions)

# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')

# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')

# show the legend
pyplot.legend()
# show the plot
pyplot.show()

No Skill: ROC AUC=0.500
Logistic: ROC AUC=0.309


ValueError: Data is not binary and pos_label is not specified

In [10]:
#2 => Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

clasifiers = GaussianNB()
clf = clasifiers.fit(X_train_miss, y_train_miss.ravel())
predictions = clf.predict(test_X)

# print confusion matrix and classification report 
print(confusion_matrix(test_Y, predictions))
print(classification_report(test_Y, predictions))

# Sensitivity
cm = confusion_matrix(test_Y, predictions)
sensitivity = cm[0,0]/(cm[0,0]+cm[0,1])
print('Sensitivity : ', sensitivity )

# ROC 

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# generate a no skill prediction (majority class)
ns_probs = [0 for _ in range(len(test_Y))]

# calculate scores
ns_auc = roc_auc_score(test_Y, ns_probs)
lr_auc = roc_auc_score(test_Y, predictions)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(test_Y, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(test_Y, predictions)

# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')

# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')

# show the legend
pyplot.legend()
# show the plot
pyplot.show()

[[ 90 475]
 [  1   4]]
              precision    recall  f1-score   support

         1.0       0.99      0.16      0.27       565
         2.0       0.01      0.80      0.02         5

    accuracy                           0.16       570
   macro avg       0.50      0.48      0.15       570
weighted avg       0.98      0.16      0.27       570

Sensitivity :  0.1592920353982301
No Skill: ROC AUC=0.500
Logistic: ROC AUC=0.480


ValueError: Data is not binary and pos_label is not specified

In [11]:
#3 K-Nearest Neighbours
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
clf = classifier.fit(X_train_miss, y_train_miss.ravel())
predictions = clf.predict(test_X)

# print confusion matrix and classification report 
print(confusion_matrix(test_Y, predictions))
print(classification_report(test_Y, predictions))

# Sensitivity
cm = confusion_matrix(test_Y, predictions)
sensitivity = cm[0,0]/(cm[0,0]+cm[0,1])
print('Sensitivity : ', sensitivity )

# ROC 

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# generate a no skill prediction (majority class)
ns_probs = [0 for _ in range(len(test_Y))]


# calculate scores
ns_auc = roc_auc_score(test_Y, ns_probs)
lr_auc = roc_auc_score(test_Y, predictions)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(test_Y, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(test_Y, predictions)

# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')

# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')

# show the legend
pyplot.legend()
# show the plot
pyplot.show()



[[320 245]
 [  3   2]]
              precision    recall  f1-score   support

         1.0       0.99      0.57      0.72       565
         2.0       0.01      0.40      0.02         5

    accuracy                           0.56       570
   macro avg       0.50      0.48      0.37       570
weighted avg       0.98      0.56      0.71       570

Sensitivity :  0.5663716814159292
No Skill: ROC AUC=0.500
Logistic: ROC AUC=0.483


ValueError: Data is not binary and pos_label is not specified

In [12]:
#4 => Decesion Tree
from sklearn import tree
classifier = tree.DecisionTreeClassifier()
clf = classifier.fit(X_train_miss, y_train_miss.ravel())
predictions = clf.predict(test_X)

# print confusion matrix and classification report 
print(confusion_matrix(test_Y, predictions))
print(classification_report(test_Y, predictions))

# Sensitivity
cm = confusion_matrix(test_Y, predictions)
sensitivity = cm[0,0]/(cm[0,0]+cm[0,1])
print('Sensitivity : ', sensitivity )

# ROC 

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# generate a no skill prediction (majority class)
ns_probs = [0 for _ in range(len(test_Y))]


# calculate scores
ns_auc = roc_auc_score(test_Y, ns_probs)
lr_auc = roc_auc_score(test_Y, predictions)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(test_Y, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(test_Y, predictions)

# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')

# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')

# show the legend
pyplot.legend()
# show the plot
pyplot.show()



[[163 402]
 [  3   2]]
              precision    recall  f1-score   support

         1.0       0.98      0.29      0.45       565
         2.0       0.00      0.40      0.01         5

    accuracy                           0.29       570
   macro avg       0.49      0.34      0.23       570
weighted avg       0.97      0.29      0.44       570

Sensitivity :  0.2884955752212389
No Skill: ROC AUC=0.500
Logistic: ROC AUC=0.344


ValueError: Data is not binary and pos_label is not specified

# Errors are related to the ROC Curve. Couldn't figure them out. :/