### Imports

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [67]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Read in CSV files. The Test and Train split is an element of the Kaggle compeition source of these files

In [69]:
df = pd.read_csv('exoTrain.csv')
dftest = pd.read_csv('exoTest.csv')

In [70]:
df['LABEL'] = df['LABEL']-1

In [71]:
df.head()

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,1,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,1,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,1,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,1,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,1,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


In [73]:
dftest['LABEL'] = dftest['LABEL']-1

In [74]:
dftest.head()

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,1,119.88,100.21,86.46,48.68,46.12,39.39,18.57,6.98,6.63,...,14.52,19.29,14.44,-1.62,13.33,45.5,31.93,35.78,269.43,57.72
1,1,5736.59,5699.98,5717.16,5692.73,5663.83,5631.16,5626.39,5569.47,5550.44,...,-581.91,-984.09,-1230.89,-1600.45,-1824.53,-2061.17,-2265.98,-2366.19,-2294.86,-2034.72
2,1,844.48,817.49,770.07,675.01,605.52,499.45,440.77,362.95,207.27,...,17.82,-51.66,-48.29,-59.99,-82.1,-174.54,-95.23,-162.68,-36.79,30.63
3,1,-826.0,-827.31,-846.12,-836.03,-745.5,-784.69,-791.22,-746.5,-709.53,...,122.34,93.03,93.03,68.81,9.81,20.75,20.25,-120.81,-257.56,-215.41
4,1,-39.57,-15.88,-9.16,-6.37,-16.13,-24.05,-0.9,-45.2,-5.04,...,-37.87,-61.85,-27.15,-21.18,-33.76,-85.34,-81.46,-61.98,-69.34,-17.84


#### combine the two datasets into one to regression

In [75]:
dfwhole = pd.concat([df,dftest])

In [76]:
dfwhole.shape

(5657, 3198)

### Split into training and test using train_test_split 80% train

In [81]:
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(dfwhole.loc[:,'FLUX.1':].values, dfwhole['LABEL'].values, test_size =.5, random_state = 101)

In [82]:
clf = LogisticRegression(C=1)

# Fit the model on the trainng data.
clf.fit(Xlr, ylr)
# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(Xtestlr), ytestlr))

0.567691763874


### Print Classification Report and Confusion Matrix

In [83]:
from sklearn.metrics import confusion_matrix,classification_report

#Print Confusion Matrix
print ('\nClassification Report:\n', classification_report(ytestlr,clf.predict(Xtestlr)))
print ('\nConfusion Matrix:\n', pd.crosstab( clf.predict(Xtestlr),ytestlr, rownames=['True'], colnames=['Predicted'], margins=True))


Classification Report:
              precision    recall  f1-score   support

          0       1.00      0.57      0.72      2810
          1       0.01      0.58      0.02        19

avg / total       0.99      0.57      0.72      2829


Confusion Matrix:
 Predicted     0   1   All
True                     
0          1595   8  1603
1          1215  11  1226
All        2810  19  2829


In [80]:
clf.predict(Xtestlr)

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [13]:
#from sklearn.metrics import roc_auc_score
#y_true = np.array([0, 0, 1, 1])
#y_scores = np.array([0.1, 0.4, 0.35, 0.8])
#roc_auc_score(y_true, y_scores)

### Cross Validation

In [84]:
def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
    return result / nfold # average

In [85]:
clf2 = LogisticRegression()
score = cv_score(clf2, Xlr, ylr)
print(score)

0.584866318522


### Sample through different C parameters

In [86]:
#the grid of parameters to search over
Cs = [0.001, 0.1, 1, 10, 100]

#For C parameters

def Parameters_C(Cs):
    """
    Input List of C values
    Output Best C value as highest cv_score is tracked 
    """
    max_value = 0 
    
    for parameters in Cs:
        logReg = LogisticRegression(C = parameters) #perform Log Reg with C
        score = cv_score(logReg, Xlr, ylr) #get average score

        if score > max_value:
            max_value = score
            top_C = parameters
        
    return max_value, top_C
            
max_value, top_C = Parameters_C(Cs)

print('Maxium: {}'.format(max_value))
print('Best value for C : {}'.format(top_C))


Maxium: 0.5905250320522843
Best value for C : 0.001


In [87]:
clf = LogisticRegression(C=top_C)
# Fit the model on the trainng data.
clf.fit(Xlr, ylr)
# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(Xtestlr), ytestlr))

0.564863909509


---
#### Do the Logistic Regression on the Kaggle competion split

In [90]:
Xlr = df.loc[:,'FLUX.1':].values
Xtestlr = dftest.loc[:,'FLUX.1':].values
ylr = df['LABEL'].values
ytestlr =dftest['LABEL'].values

In [91]:
Xlr.shape

(5087, 3197)

In [92]:
clf = LogisticRegression(C=top_C)

# Fit the model on the trainng data.
clf.fit(Xlr, ylr)

# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(Xtestlr), ytestlr))

0.519298245614


### Classification Report and Confusion Matrix on Kaggle Split

In [93]:
#Print Confusion Matrix
print ('\nClassification Report:\n', classification_report(ytestlr,clf.predict(Xtestlr)))
print ('\nConfusion Matrix:\n', pd.crosstab( clf.predict(Xtestlr),ytestlr, rownames=['True'], colnames=['Predicted'], margins=True))


Classification Report:
              precision    recall  f1-score   support

          0       0.99      0.52      0.68       565
          1       0.01      0.60      0.02         5

avg / total       0.98      0.52      0.68       570


Confusion Matrix:
 Predicted    0  1  All
True                  
0          293  2  295
1          272  3  275
All        565  5  570


## Now with Normalized Scaled data

In [94]:
# import and instatiate scaler
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

### fit and transform normalizer 

In [95]:
#create separate array of Labals before transformation, remove labels from transformable dataset
dfwhole_target = dfwhole['LABEL']
dfwhole.drop('LABEL',axis=1,inplace=True)

In [96]:
dfwhole.head()

Unnamed: 0,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,FLUX.10,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,-160.17,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,-73.38,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,484.39,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,323.33,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,-970.88,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


With the transformation the column headings are lost, so put them in a place holder

In [98]:
column_names = dfwhole.columns

In [101]:
dfwhole_scale.columns = column_names

In [99]:
#Whole Dataset 
dfwhole_scale = scale.fit_transform(dfwhole.as_matrix())

### convert back to dataframe

In [100]:
#Whole Dataset 
dfwhole_scale = pd.DataFrame(dfwhole_scale)

In [102]:
dfwhole_scale.reset_index(inplace=True, drop=True)

In [103]:
dfwhole_scale.head()

Unnamed: 0,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,FLUX.10,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,-0.004236,-0.005043,-0.007841,-0.011275,-0.010495,-0.014239,-0.012587,-0.012311,-0.011753,-0.018687,...,-0.014546,-0.017118,-0.01728,-0.0143,-0.012122,-0.010372,0.026682,0.031563,0.024062,0.015458
1,-0.010624,-0.010631,-0.011558,-0.011886,-0.012282,-0.012002,-0.010493,-0.011816,-0.011954,-0.01408,...,-0.011809,-0.01505,-0.015588,-0.015357,-0.013227,-0.012181,0.023712,0.028344,0.024757,0.01969
2,0.01688,0.016432,0.015493,0.013141,0.011802,0.01122,0.013229,0.014076,0.013661,0.015525,...,-0.014312,-0.013704,-0.014488,-0.015463,-0.013559,-0.012096,0.023706,0.026272,0.019286,0.011392
3,0.006961,0.007477,0.005501,0.003877,0.005566,0.004613,0.007109,0.006147,0.007552,0.006976,...,-0.01148,-0.014208,-0.0149,-0.014196,-0.012712,-0.012446,0.023882,0.026952,0.022636,0.019267
4,-0.062035,-0.061872,-0.061682,-0.061053,-0.056257,-0.053449,-0.049746,-0.054268,-0.056517,-0.061718,...,-0.033436,-0.025975,-0.024522,-0.022379,-0.022308,-0.020841,0.001127,0.005435,-0.002453,-0.01806


In [104]:
dfwhole_target = pd.DataFrame(dfwhole_target)
dfwhole_target.reset_index(inplace=True, drop=True)
dfwhole_target.head()

Unnamed: 0,LABEL
0,1
1,1
2,1
3,1
4,1


In [105]:
dfwhole_target.reset_index(inplace=True, drop=True)

In [106]:
dfwhole_scale.shape

(5657, 3197)

In [107]:
df_col_merged =pd.concat([dfwhole_target, dfwhole_scale], axis=1)
df_col_merged.shape

(5657, 3198)

In [108]:
df_col_merged.head()

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,1,-0.004236,-0.005043,-0.007841,-0.011275,-0.010495,-0.014239,-0.012587,-0.012311,-0.011753,...,-0.014546,-0.017118,-0.01728,-0.0143,-0.012122,-0.010372,0.026682,0.031563,0.024062,0.015458
1,1,-0.010624,-0.010631,-0.011558,-0.011886,-0.012282,-0.012002,-0.010493,-0.011816,-0.011954,...,-0.011809,-0.01505,-0.015588,-0.015357,-0.013227,-0.012181,0.023712,0.028344,0.024757,0.01969
2,1,0.01688,0.016432,0.015493,0.013141,0.011802,0.01122,0.013229,0.014076,0.013661,...,-0.014312,-0.013704,-0.014488,-0.015463,-0.013559,-0.012096,0.023706,0.026272,0.019286,0.011392
3,1,0.006961,0.007477,0.005501,0.003877,0.005566,0.004613,0.007109,0.006147,0.007552,...,-0.01148,-0.014208,-0.0149,-0.014196,-0.012712,-0.012446,0.023882,0.026952,0.022636,0.019267
4,1,-0.062035,-0.061872,-0.061682,-0.061053,-0.056257,-0.053449,-0.049746,-0.054268,-0.056517,...,-0.033436,-0.025975,-0.024522,-0.022379,-0.022308,-0.020841,0.001127,0.005435,-0.002453,-0.01806


### Split the data into a training and test set.

### Fit the logistic model : Test size 60% and random state 101

In [109]:
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(df_col_merged.loc[:,'FLUX.1':].values, df_col_merged['LABEL'].values, test_size =.6, random_state = 101)

In [111]:
clf = LogisticRegression()

# Fit the model on the trainng data.
clf.fit(Xlr, ylr)

# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(Xtestlr), ytestlr))

0.983505154639


Classification and Confusion Matrix

In [112]:
#Print Confusion Matrix
print ('\nClassification Report:\n', classification_report(ytestlr,clf.predict(Xtestlr)))
print ('\nConfusion Matrix:\n', pd.crosstab( clf.predict(Xtestlr),ytestlr, rownames=['True'], colnames=['Predicted'], margins=True))


Classification Report:
              precision    recall  f1-score   support

          0       0.99      0.99      0.99      3374
          1       0.00      0.00      0.00        21

avg / total       0.99      0.98      0.99      3395


Confusion Matrix:
 Predicted     0   1   All
True                     
0          3339  21  3360
1            35   0    35
All        3374  21  3395


The recall of 0.00 for 1 means the model is predicting for the Non-exoplanet and failing for actual exoplanets

### Now for the Competition Breakdown:

In [113]:
# Explore Star Light Intensity with & without exoPlanet
# remove the label data

#create separate array of Labels before transformation, remove labels from transformable dataset
df_target = df['LABEL']
df.drop('LABEL',axis=1,inplace=True)

dftest_target = dftest['LABEL']
dftest.drop('LABEL',axis=1,inplace=True)

In [114]:
#Competition Split: Fit Transform
dftrain_scale = scale.fit_transform(df.as_matrix())
dftest_scale = scale.fit_transform(dftest.as_matrix())

In [115]:
#Competition Split: Revert to Dataset
dftrain_scale = pd.DataFrame(dftrain_scale)
dftest_scale = pd.DataFrame(dftest_scale)

In [116]:
clf_c = LogisticRegression(C=1)

# Fit the model on the trainng data.
clf_c.fit(dftrain_scale, df_target)

# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(dftest_scale), dftest_target))

0.973684210526


Classification and Confusion Matrix

In [117]:
#Print Confusion Matrix
print ('\nClassification Report:\n', classification_report(ytestlr,clf_c.predict(Xtestlr)))
print ('\nConfusion Matrix:\n', pd.crosstab( clf_c.predict(Xtestlr),ytestlr, rownames=['True'], colnames=['Predicted'], margins=True))


Classification Report:
              precision    recall  f1-score   support

          0       0.99      1.00      1.00      3374
          1       0.67      0.10      0.17        21

avg / total       0.99      0.99      0.99      3395


Confusion Matrix:
 Predicted     0   1   All
True                     
0          3373  19  3392
1             1   2     3
All        3374  21  3395


The recall of 0.10 for 1 means the model is predicting for the Non-exoplanet and still failing for actual exoplanets, only 2 out of 21 correctly predicted.

### Lets try with Upsampling

In [118]:
from sklearn.utils import resample

In [124]:
df = pd.read_csv('exoTrain.csv')
dftest = pd.read_csv('exoTest.csv')

In [125]:
dfall = pd.concat([df,dftest])

In [131]:
dfall.shape

(5657, 3198)

In [127]:
dfall['LABEL'] = dfall['LABEL']-1

In [130]:
# Separate majority and minority classes
df_noexo = dfall[dfall['LABEL']==0]
df_yesexo = dfall[dfall['LABEL']==1]
 
# Upsample minority class
df_yesexo_upsampled = resample(df_yesexo, 
                                 replace=True,     # sample with replacement
                                 n_samples=2000,    # to match majority class
                                 random_state=42) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_noexo, df_yesexo_upsampled])
 
# Display new class counts
df_upsampled['LABEL'].value_counts()

0    5615
1    2000
Name: LABEL, dtype: int64

In [140]:
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(df_upsampled.loc[:,'FLUX.1':].values, df_upsampled['LABEL'].values, test_size =.6, random_state = 101)

In [142]:
#Print Confusion Matrix
print ('\nClassification Report:\n', classification_report(ytestlr,clf_s.predict(Xtestlr)))
print ('\nConfusion Matrix:\n', pd.crosstab(clf_s.predict(Xtestlr),ytestlr, rownames=['True'], colnames=['Predicted'], margins=True))


Classification Report:
              precision    recall  f1-score   support

          0       0.89      0.50      0.64      3417
          1       0.35      0.82      0.50      1152

avg / total       0.76      0.58      0.60      4569


Confusion Matrix:
 Predicted     0     1   All
True                       
0          1694   206  1900
1          1723   946  2669
All        3417  1152  4569


---
### Working with a group of classifiers

In [46]:
#Import the classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

In [143]:
#Assign the classifiers 
svc = SVC(kernel='sigmoid', class_weight='balanced', gamma=1.0)
knc = KNeighborsClassifier(n_neighbors=5)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=7, random_state=111)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=31, random_state=111)
abc = AdaBoostClassifier(n_estimators=62, random_state=111)
bc = BaggingClassifier(n_estimators=9, random_state=111)
etc = ExtraTreesClassifier(n_estimators=9, random_state=111)

In [49]:
clfs = {'SVC' : svc,'KN' : knc, 'DT': dtc, 'LR': lrc, 'RF': rfc, 'AdaBoost': abc, 'BgC': bc, 'ETC': etc}

In [50]:
def train_classifier(clf, feature_train, labels_train):
    """create the classfier feeder to fit the features and labels"""
    clf.fit(feature_train, labels_train)

In [51]:
def predict_labels(clf, features):
    """predicted values from the features matrix"""
    return (clf.predict(features))

In [144]:
features_train = Xlr
labels_train = ylr
features_test = Xtestlr
labels_test = ytestlr

pred_scores = []
for k,v in clfs.items():
    #print(k)
    train_classifier(v, features_train, labels_train)
    pred = predict_labels(v,features_test)
    pred_scores.append((k, [accuracy_score(labels_test,pred)]))
    print ('\nClassification Report: ', k,'\n', classification_report(labels_test,pred))
    print ('\nConfusion Matrix: ',k,'\n', pd.crosstab( pred,labels_test, rownames=['True'], colnames=['Predicted'], margins=True))
    print


Classification Report:  SVC 
              precision    recall  f1-score   support

          0       0.74      0.71      0.72      3417
          1       0.22      0.25      0.23      1152

avg / total       0.61      0.59      0.60      4569


Confusion Matrix:  SVC 
 Predicted     0     1   All
True                       
0          2415   866  3281
1          1002   286  1288
All        3417  1152  4569

Classification Report:  KN 
              precision    recall  f1-score   support

          0       1.00      0.98      0.99      3417
          1       0.94      1.00      0.97      1152

avg / total       0.99      0.99      0.99      4569


Confusion Matrix:  KN 
 Predicted     0     1   All
True                       
0          3349     0  3349
1            68  1152  1220
All        3417  1152  4569

Classification Report:  DT 
              precision    recall  f1-score   support

          0       1.00      0.99      0.99      3417
          1       0.97      1.00      0.9

#### With the upsampled dataset for Exoplanets, Random Forest and Extra Trees Classifier came back with good results

In [145]:
df_result = pd.DataFrame.from_items(pred_scores,orient='index', columns=['Score'])
df_result

Unnamed: 0,Score
SVC,0.591158
KN,0.985117
DT,0.99234
LR,0.963012
RF,0.999562
AdaBoost,0.992559
BgC,0.997374
ETC,0.999562


Let see how that works with a specific set of data that is 100% exoplanets

In [147]:
df_yesexo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42 entries, 0 to 4
Columns: 3198 entries, LABEL to FLUX.3197
dtypes: float64(3197), int64(1)
memory usage: 1.0 MB


In [151]:
df_testmodel = df_yesexo.loc[:, df.columns != 'LABEL']

In [158]:
pred_100p = predict_labels(v,df_testmodel) # fit for Extra Trees Classifier

In [155]:
pred_100p.sum()

42

What if up sample is taken only from 1/2 the data as a new 100% training set and we see if the unseen data is modelled correctly? Take the remainder (unseen) to use a 100% test set.

In [170]:
#split the exoplanet data 50-50
df_yesexo_1sthalf = df_yesexo[:21]
df_yesexo_2ndhalf = df_yesexo[21:42]

### Upsample the exoplanet data to 2000 records

In [174]:
# Upsample exoplanet class
df_yesexo_upsampled = resample(df_yesexo_1sthalf, 
                                 replace=True,     # sample with replacement
                                 n_samples=2000,    
                                 random_state=42) # reproducible results
 
# Combine non-exoplanet class with upsampled exoplanet class
df_upsampled = pd.concat([df_noexo, df_yesexo_upsampled])
 
# Display new exoplanet class counts
df_upsampled['LABEL'].value_counts()

0    5615
1    2000
Name: LABEL, dtype: int64

In [175]:
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(df_upsampled.loc[:,'FLUX.1':].values, df_upsampled['LABEL'].values, test_size =.6, random_state = 101)

array([[ -1.22100000e+01,  -1.34100000e+01,  -1.65800000e+01, ...,
         -5.43000000e+00,  -2.20000000e+00,  -8.00000000e-02],
       [ -2.67210000e+02,  -2.39110000e+02,  -2.33150000e+02, ...,
         -7.57050000e+02,  -7.63260000e+02,  -7.69390000e+02],
       [ -1.51385000e+03,  -1.48397000e+03,  -1.51553000e+03, ...,
          2.55737000e+03,   2.59959000e+03,   2.72262000e+03],
       ..., 
       [ -2.67210000e+02,  -2.39110000e+02,  -2.33150000e+02, ...,
         -7.57050000e+02,  -7.63260000e+02,  -7.69390000e+02],
       [  9.71800000e+01,   7.08600000e+01,   7.32800000e+01, ...,
          1.18300000e+01,   2.68200000e+01,  -1.54400000e+01],
       [  8.03300000e+01,   6.81900000e+01,   4.63000000e+01, ...,
         -1.95900000e+01,  -1.90900000e+01,   5.95000000e+00]])

In [179]:
features_train = Xlr
labels_train = ylr
features_test = Xtestlr
labels_test = ytestlr

pred_scores = []
for k,v in clfs.items():
    #print(k)
    train_classifier(v, features_train, labels_train)
    pred = predict_labels(v,features_test)
    pred_scores.append((k, [accuracy_score(labels_test,pred)]))
    print ('\nClassification Report: ', k,'\n', classification_report(labels_test,pred))
    print ('\nConfusion Matrix: ',k,'\n', pd.crosstab( pred,labels_test, rownames=['True'], colnames=['Predicted'], margins=True))
    print


Classification Report:  SVC 
              precision    recall  f1-score   support

          0       0.70      0.67      0.68      3417
          1       0.13      0.14      0.13      1152

avg / total       0.55      0.53      0.54      4569


Confusion Matrix:  SVC 
 Predicted     0     1   All
True                       
0          2279   987  3266
1          1138   165  1303
All        3417  1152  4569

Classification Report:  KN 
              precision    recall  f1-score   support

          0       1.00      0.99      0.99      3417
          1       0.97      1.00      0.98      1152

avg / total       0.99      0.99      0.99      4569


Confusion Matrix:  KN 
 Predicted     0     1   All
True                       
0          3378     0  3378
1            39  1152  1191
All        3417  1152  4569

Classification Report:  DT 
              precision    recall  f1-score   support

          0       1.00      0.99      1.00      3417
          1       0.97      1.00      0.9

Lets see the fit for never seen data, the 2nd half of the exoplanet dataset

In [182]:
df_testmodel = df_yesexo_2ndhalf.loc[:, df_yesexo_2ndhalf.columns != 'LABEL']

In [191]:
df_testmodel = df_yesexo_1sthalf.loc[:, df_yesexo_1sthalf.columns != 'LABEL']

In [192]:
pred_100p = predict_labels(v,df_testmodel) # fit for Extra Trees Classifier

In [193]:
pred_100p

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [202]:
train_classifier(rfc, features_train, labels_train)
pred = predict_labels(rfc,features_test)
pred_scores.append(('RF', [accuracy_score(labels_test,pred)]))
print ('\nClassification Report: RFC\n', classification_report(labels_test,pred))
print ('\nConfusion Matrix: RFC\n', pd.crosstab( pred,labels_test, rownames=['True'], colnames=['Predicted'], margins=True))


Classification Report: RFC
              precision    recall  f1-score   support

          0       1.00      1.00      1.00      3417
          1       1.00      1.00      1.00      1152

avg / total       1.00      1.00      1.00      4569


Confusion Matrix: RFC
 Predicted     0     1   All
True                       
0          3417     0  3417
1             0  1152  1152
All        3417  1152  4569


In [199]:
df_testmodel = df_yesexo_2ndhalf.loc[:, df_yesexo_2ndhalf.columns != 'LABEL']

In [200]:
df_testmodel.head()

Unnamed: 0,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,FLUX.10,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
21,2053.62,2126.05,2146.33,2159.84,2237.59,2236.12,2244.47,2279.61,2288.22,2221.22,...,1832.59,1935.53,1965.84,2094.19,2212.52,2292.64,2454.48,2568.16,2625.45,2578.8
22,-48.48,-22.95,11.15,-70.04,-120.34,-150.04,-309.38,-160.73,-201.41,-303.71,...,90.7,-20.01,-62.12,-45.96,-52.4,-4.93,26.74,21.43,145.3,197.2
23,145.84,137.82,96.99,17.09,-73.79,-157.79,-267.71,-365.91,-385.07,-423.68,...,62.76,101.24,98.13,112.51,95.77,127.98,67.51,91.24,40.4,-10.8
24,207.37,195.04,150.45,135.34,104.9,59.79,42.85,52.74,18.38,-8.13,...,-13.21,-43.43,-14.77,-22.27,-0.04,19.46,9.32,23.55,-4.73,11.82
25,304.5,275.94,269.24,248.51,194.88,167.8,139.13,149.36,100.97,59.58,...,4.21,3.53,-5.13,14.56,-1.44,-10.73,3.49,0.18,-2.89,40.34


In [196]:
pred_100p = predict_labels(rfc,df_testmodel) # fit for Random Forest Classifier
pred_100p.sum()

21

In [201]:
df_testmodel = df_yesexo_1sthalf.loc[:, df_yesexo_1sthalf.columns != 'LABEL']
df_testmodel.head()

Unnamed: 0,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,FLUX.10,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,-160.17,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,-73.38,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,484.39,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,323.33,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,-970.88,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


In [198]:
pred_100p = predict_labels(rfc,df_testmodel) # fit for Random Forest Classifier
pred_100p.sum()

21

OK so it passes the 100% predictable test.

In [208]:
pred_2 = predict_labels(rfc,features_test)
pred_2.sum()

1152

In [209]:
dftest.head()

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,119.88,100.21,86.46,48.68,46.12,39.39,18.57,6.98,6.63,...,14.52,19.29,14.44,-1.62,13.33,45.5,31.93,35.78,269.43,57.72
1,2,5736.59,5699.98,5717.16,5692.73,5663.83,5631.16,5626.39,5569.47,5550.44,...,-581.91,-984.09,-1230.89,-1600.45,-1824.53,-2061.17,-2265.98,-2366.19,-2294.86,-2034.72
2,2,844.48,817.49,770.07,675.01,605.52,499.45,440.77,362.95,207.27,...,17.82,-51.66,-48.29,-59.99,-82.1,-174.54,-95.23,-162.68,-36.79,30.63
3,2,-826.0,-827.31,-846.12,-836.03,-745.5,-784.69,-791.22,-746.5,-709.53,...,122.34,93.03,93.03,68.81,9.81,20.75,20.25,-120.81,-257.56,-215.41
4,2,-39.57,-15.88,-9.16,-6.37,-16.13,-24.05,-0.9,-45.2,-5.04,...,-37.87,-61.85,-27.15,-21.18,-33.76,-85.34,-81.46,-61.98,-69.34,-17.84


Against the original Kaggle test set?

In [220]:
#dftest_target

In [212]:
dftest_features = dftest.loc[:,'FLUX.1':].values

In [221]:
pred_3 = predict_labels(rfc,dftest_features)

In [222]:
print ('\nClassification Report: RFC\n', classification_report(dftest_target,pred_3))
print ('\nConfusion Matrix: RFC\n', pd.crosstab( pred_3,dftest_target, rownames=['True'], colnames=['Predicted'], margins=True))


Classification Report: RFC
              precision    recall  f1-score   support

          0       0.99      1.00      1.00       565
          1       0.00      0.00      0.00         5

avg / total       0.98      0.99      0.99       570


Confusion Matrix: RFC
 Predicted    0  1  All
True                  
0          565  5  570
All        565  5  570


  'precision', 'predicted', average, warn_for)
