In [1]:
# Update sklearn to prevent version mismatches, uncomment if update is required
#!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model, uncomment if update is required
# Restart your kernel after installing 
#!pip install joblib

In [3]:
# Import dependencies
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Read in the data
df = pd.read_csv("../data/exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


# Select your features (columns)

Use `koi_disposition` for the y values

In [5]:
# Set the features for x and the target for y

# We are using the feature importance from the previous RandomForest notebook to do our feature selection here by dropping the lowest ranking features
X = df.drop(columns=["koi_disposition", "koi_steff", "koi_srad", "koi_slogg", "koi_slogg_err1", "koi_srad_err2", "koi_tce_plnt_num"], axis=1)
y = df["koi_disposition"].values.reshape(-1, 1)
target_names = ["CANDIDATE", "CONFIRMED", "FALSE POSITIVE"]
# Print the shape of both to make sure it is the same
print(X.shape, y.shape)

(6991, 34) (6991, 1)


# Create a Train Test Split

In [6]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Use the label encoder to change the categorical target values into numeric values using mapping, example: candidate = 0, confirmed = 1, false positive = 2
label_encoder = LabelEncoder()
label_encoder.fit(y)

# Store the mapping used by the label encoder so we can view it
le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
label_encoder.transform(y)

# Create the training and testing data with a random state for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=250)

print(le_name_mapping)

{'CANDIDATE': 0, 'CONFIRMED': 1, 'FALSE POSITIVE': 2}


# Pre-processing and Training

Scale the data using the StandardScaler

In [7]:
# Import dependencies
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data with the scaler models
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))

Test Acc: 0.883


In [8]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

     CANDIDATE       0.82      0.71      0.76       420
     CONFIRMED       0.77      0.82      0.79       453
FALSE POSITIVE       0.97      1.00      0.98       875

      accuracy                           0.88      1748
     macro avg       0.85      0.84      0.85      1748
  weighted avg       0.88      0.88      0.88      1748



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [9]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grids = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grids, verbose=3)

In [10]:
# Fit the model using the grid search estimator
# This will take the SVC model and try each combination of parameters
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ................... C=1, gamma=0.0001, score=0.898, total=   1.1s
[CV] C=1, gamma=0.0001 ...............................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[CV] ................... C=1, gamma=0.0001, score=0.870, total=   0.9s
[CV] C=1, gamma=0.0001 ...............................................
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.9s remaining:    0.0s
[CV] ................... C=1, gamma=0.0001, score=0.891, total=   0.8s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.906, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.888, tot

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [11]:
# Find the best parameters and corresponding score
print(grid.best_params_)
print(grid.best_score_)

{'C': 5, 'gamma': 0.0001}
0.8918581127791645


In [12]:
# Check the training and testing scores
print(f'Training Score: {grid.score(X_train_scaled, y_train)}')
grid.predict(X_test_scaled)
print(f'Testing Score: {grid.score(X_test_scaled, y_test)}')

Training Score: 0.8931909212283045
Testing Score: 0.8844393592677345


# Save the Model

In [13]:
import joblib
filename = 'Autonomousse_SVM.sav'
joblib.dump(model, filename)

['Autonomousse_SVM.sav']