In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.svm import SVC 
from sklearn import svm
from sklearn.svm import LinearSVC

import seaborn as sns

# Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
target = df['koi_disposition']

In [5]:
data = df.drop('koi_disposition', axis=1)
features = data.columns

In [6]:
X = data
y = target
variable_names = data.columns

In [7]:
print(X.shape)
print(y.shape)

(8744, 40)
(8744,)


In [8]:
X = data
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# Pre-processing

Scale the data using the MinMaxScaler

In [9]:
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)


# Train the Support Vector Machine

In [10]:
model2 = SVC(kernel='linear', max_iter=10000)

In [11]:
model2.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=10000, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [12]:
predictions = model2.predict(X_test_scaled)

In [13]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.8502592253735896
Testing Data Score: 0.838975297346752


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [14]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 20],
              'gamma': [0.0001, 0.001, 0.01, 0.1]
             }
grid = GridSearchCV(model2, param_grid, verbose=3)

In [15]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... C=1, gamma=0.0001, score=0.846892138939671, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8384439359267735, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.6s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8480549199084668, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ........ C=1, gamma=0.001, score=0.846892138939671, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8384439359267735, total=   0.7s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8480549199084668, total=   0.4s
[CV] C=1, gamma=0.01 .................................................
[CV] ......... C=1, gamma=0.01, score=0.846892138939671, total=   0.5s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8384439359267735, total=   0.4s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8480549199084668, total=   0.4s
[CV] C=1, gamma=0.1 ..................................................
[CV] .



[CV] ...... C=5, gamma=0.0001, score=0.8660877513711152, total=   0.4s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ...... C=5, gamma=0.0001, score=0.8535469107551488, total=   0.3s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ...... C=5, gamma=0.0001, score=0.8745995423340961, total=   0.4s
[CV] C=5, gamma=0.001 ................................................




[CV] ....... C=5, gamma=0.001, score=0.8660877513711152, total=   0.4s
[CV] C=5, gamma=0.001 ................................................
[CV] ....... C=5, gamma=0.001, score=0.8535469107551488, total=   0.3s
[CV] C=5, gamma=0.001 ................................................
[CV] ....... C=5, gamma=0.001, score=0.8745995423340961, total=   0.5s
[CV] C=5, gamma=0.01 .................................................




[CV] ........ C=5, gamma=0.01, score=0.8660877513711152, total=   0.4s
[CV] C=5, gamma=0.01 .................................................
[CV] ........ C=5, gamma=0.01, score=0.8535469107551488, total=   0.3s
[CV] C=5, gamma=0.01 .................................................
[CV] ........ C=5, gamma=0.01, score=0.8745995423340961, total=   0.4s
[CV] C=5, gamma=0.1 ..................................................




[CV] ......... C=5, gamma=0.1, score=0.8660877513711152, total=   0.4s
[CV] C=5, gamma=0.1 ..................................................
[CV] ......... C=5, gamma=0.1, score=0.8535469107551488, total=   0.3s
[CV] C=5, gamma=0.1 ..................................................
[CV] ......... C=5, gamma=0.1, score=0.8745995423340961, total=   0.4s
[CV] C=10, gamma=0.0001 ..............................................




[CV] ...... C=10, gamma=0.0001, score=0.870201096892139, total=   0.6s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.8649885583524027, total=   0.4s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.8791762013729977, total=   0.4s
[CV] C=10, gamma=0.001 ...............................................




[CV] ....... C=10, gamma=0.001, score=0.870201096892139, total=   0.4s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.8649885583524027, total=   0.5s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.8791762013729977, total=   0.4s
[CV] C=10, gamma=0.01 ................................................




[CV] ........ C=10, gamma=0.01, score=0.870201096892139, total=   0.4s
[CV] C=10, gamma=0.01 ................................................
[CV] ....... C=10, gamma=0.01, score=0.8649885583524027, total=   0.5s
[CV] C=10, gamma=0.01 ................................................
[CV] ....... C=10, gamma=0.01, score=0.8791762013729977, total=   0.5s
[CV] C=10, gamma=0.1 .................................................




[CV] ......... C=10, gamma=0.1, score=0.870201096892139, total=   0.4s
[CV] C=10, gamma=0.1 .................................................
[CV] ........ C=10, gamma=0.1, score=0.8649885583524027, total=   0.4s
[CV] C=10, gamma=0.1 .................................................
[CV] ........ C=10, gamma=0.1, score=0.8791762013729977, total=   0.4s
[CV] C=20, gamma=0.0001 ..............................................




[CV] ...... C=20, gamma=0.0001, score=0.876599634369287, total=   0.4s
[CV] C=20, gamma=0.0001 ..............................................
[CV] ..... C=20, gamma=0.0001, score=0.8663615560640733, total=   0.6s
[CV] C=20, gamma=0.0001 ..............................................
[CV] ..... C=20, gamma=0.0001, score=0.8874141876430206, total=   0.4s
[CV] C=20, gamma=0.001 ...............................................




[CV] ....... C=20, gamma=0.001, score=0.876599634369287, total=   0.4s
[CV] C=20, gamma=0.001 ...............................................
[CV] ...... C=20, gamma=0.001, score=0.8663615560640733, total=   0.4s
[CV] C=20, gamma=0.001 ...............................................
[CV] ...... C=20, gamma=0.001, score=0.8874141876430206, total=   0.4s
[CV] C=20, gamma=0.01 ................................................




[CV] ........ C=20, gamma=0.01, score=0.876599634369287, total=   0.4s
[CV] C=20, gamma=0.01 ................................................
[CV] ....... C=20, gamma=0.01, score=0.8663615560640733, total=   0.5s
[CV] C=20, gamma=0.01 ................................................
[CV] ....... C=20, gamma=0.01, score=0.8874141876430206, total=   0.8s
[CV] C=20, gamma=0.1 .................................................




[CV] ......... C=20, gamma=0.1, score=0.876599634369287, total=   0.7s
[CV] C=20, gamma=0.1 .................................................
[CV] ........ C=20, gamma=0.1, score=0.8663615560640733, total=   0.6s
[CV] C=20, gamma=0.1 .................................................
[CV] ........ C=20, gamma=0.1, score=0.8874141876430206, total=   0.5s


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   41.2s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=10000, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10, 20], 'gamma': [0.0001, 0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [16]:
print(grid.best_params_)

{'C': 20, 'gamma': 0.0001}


In [17]:
# List the best score
print(grid.best_score_)

0.8767917047880451


In [18]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)

In [19]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE']))

                precision    recall  f1-score   support

     CONFIRMED       0.85      0.62      0.72       523
FALSE POSITIVE       0.73      0.88      0.80       594
     CANDIDATE       0.98      1.00      0.99      1069

     micro avg       0.88      0.88      0.88      2186
     macro avg       0.85      0.83      0.83      2186
  weighted avg       0.88      0.88      0.87      2186

