In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the table, remove the listed columns, then strip missing data:
# first any column that is entirely null, then any row that still has a null.
df = (
    pd.read_csv("cumulative.csv")
    .drop(columns=[
        "rowid", "kepid", "kepoi_name", "kepler_name",
        "koi_pdisposition", "koi_score", "koi_tce_delivname",
    ])
    .dropna(axis="columns", how="all")
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
#import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


# Target is the disposition label; features are every remaining column.
y = df["koi_disposition"]
X = df.drop(columns=["koi_disposition"])

# Split with a fixed seed, stratifying on y, so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=84,
)

In [4]:
# Preview the first rows of the training features to sanity-check the split.
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
7417,1,0,1,0,0.811391,2.423e-06,-2.423e-06,131.8759,0.00231,-0.00231,...,-150.0,2.682,0.033,-0.027,12.487,0.836,-3.343,298.05032,43.23317,11.817
1119,0,1,1,1,0.7982,2.75e-07,-2.75e-07,133.962079,0.000392,-0.000392,...,-206.0,4.449,0.05,-0.2,1.031,0.299,-0.107,296.98853,46.596882,14.613
1162,0,0,0,0,2.806243,7.519e-06,-7.519e-06,133.7314,0.00203,-0.00203,...,-91.0,4.684,0.023,-0.025,0.581,0.027,-0.03,289.29065,51.40889,15.988
3433,0,0,0,0,31.784816,0.0001006,-0.0001006,135.87105,0.00269,-0.00269,...,-117.0,4.022,0.195,-0.09,1.694,0.257,-0.418,289.07755,46.005219,13.988
736,0,0,1,0,78.925742,0.0001925,-0.0001925,429.37454,0.00161,-0.00161,...,-202.0,4.538,0.042,-0.179,0.882,0.23,-0.077,291.44522,44.52869,15.913


# Pre-processing

Scale the data using the MinMaxScaler

In [5]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the testing features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


# Train the Support Vector Machine

In [6]:
 # Support vector machine linear classifier
from sklearn.svm import SVC 
# Linear-kernel support vector classifier, trained on the min-max scaled
# training features.
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)
# Predict on the *scaled* test features: the model was fitted on scaled
# inputs, so passing the raw X_test would feed it values on a different scale
# from the one it was trained on.
predictions = model.predict(X_test_scaled)

In [7]:
 # NOTE(review): this cell's source looks like a pasted estimator repr rather
 # than intentional code. It constructs a new SVC that is never assigned, so
 # the fitted `model` is not affected. Confirm whether the cell can be removed.
 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
# Report accuracy on the scaled training and testing splits, one per line.
print(
    f"Training Data Score: {model.score(X_train_scaled, y_train)}",
    f"Testing Data Score: {model.score(X_test_scaled, y_test)}",
    sep="\n",
)

Training Data Score: 0.8455321744434279
Testing Data Score: 0.8476669716376944


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [9]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
# Candidate values for the search: 4 values of C x 4 values of gamma.
# NOTE(review): `model` is built with kernel='linear'; gamma appears to have no
# effect on that kernel (the logged fold scores are identical across gamma
# values), so the gamma axis may only multiply the number of fits. Confirm
# whether gamma should be dropped or a non-linear kernel was intended.
param_grid = {
    'C': [1, 5, 10, 50],
    'gamma': [0.0001, 0.0005, 0.001, 0.005],
}
# 5-fold cross-validated grid search wrapping the existing estimator.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=3)

In [10]:
# Train the model with GridSearch
# Runs the cross-validated search over param_grid on the scaled training data;
# with verbose=3 each fold's score is logged as it completes.
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... C=1, gamma=0.0001, score=0.8528963414634146, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8300304878048781, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.3s remaining:    0.0s


[CV] ................. C=1, gamma=0.0001, score=0.84375, total=   0.8s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8353658536585366, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8419847328244274, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8528963414634146, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8300304878048781, total=   0.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................. C=1, gamma=0.0005, score=0.84375, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8353658536585366, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[CV] ..... C=50, gamma=0.0001, score=0.8963414634146342, total=   1.0s
[CV] C=50, gamma=0.0001 ..............................................
[CV] ..... C=50, gamma=0.0001, score=0.8772865853658537, total=   0.7s
[CV] C=50, gamma=0.0001 ..............................................
[CV] ...... C=50, gamma=0.0001, score=0.881859756097561, total=   0.8s
[CV] C=50, gamma=0.0001 ..............................................
[CV] ..... C=50, gamma=0.0001, score=0.8765243902439024, total=   1.0s
[CV] C=50, gamma=0.0001 ..............................................
[CV] ..... C=50, gamma=0.0001, score=0.8793893129770992, total=   0.7s
[CV] C=50, gamma=0.0005 ..............................................
[CV] ..... C=50, gamma=0.0005, score=0.8963414634146342, total=   0.8s
[CV] C=50, gamma=0.0005 ..............................................
[CV] ..... C=50, gamma=0.0005, score=0.8772865853658537, total=   0.7s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10, 50], 'gamma': [0.0001, 0.0005, 0.001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [11]:
# Show the best parameter combination found and its cross-validation score.
print(grid.best_params_, grid.best_score_, sep="\n")

{'C': 50, 'gamma': 0.0001}
0.8822811832875876
