In [1]:
#dependencies
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_disposition', 'koi_period', 'koi_duration', 'koi_srad', 'koi_prad']]
humanlegible = selected_features.rename(columns={"koi_disposition": "KOI Disposition", "koi_period": "KOI Period (days)", "koi_duration": "KOI Duration (hrs)", "koi_srad": "KOI SRad (solar radii)", "koi_prad": "KOI Prad (earth radii)"})
humanlegible.head()

Unnamed: 0,KOI Disposition,KOI Period (days),KOI Duration (hrs),KOI SRad (solar radii),KOI Prad (earth radii)
0,CONFIRMED,54.418383,4.507,0.927,2.83
1,FALSE POSITIVE,19.89914,1.7822,0.868,14.6
2,FALSE POSITIVE,1.736952,2.40641,0.791,33.46
3,CONFIRMED,2.525592,1.6545,1.046,2.75
4,CONFIRMED,4.134435,3.1402,0.972,2.77


In [4]:
selected_features.head()

Unnamed: 0,koi_disposition,koi_period,koi_duration,koi_srad,koi_prad
0,CONFIRMED,54.418383,4.507,0.927,2.83
1,FALSE POSITIVE,19.89914,1.7822,0.868,14.6
2,FALSE POSITIVE,1.736952,2.40641,0.791,33.46
3,CONFIRMED,2.525592,1.6545,1.046,2.75
4,CONFIRMED,4.134435,3.1402,0.972,2.77


# Create a Train Test Split

Use `koi_disposition` for the y values

In [5]:
from sklearn.model_selection import train_test_split
# use `koi_disposition` for the y values
y = selected_features["koi_disposition"]
#X values are all other values besides `koi_disposition`
X = selected_features.drop("koi_disposition", axis=1)
#set up the train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50, stratify=y)

In [6]:
X_train

Unnamed: 0,koi_period,koi_duration,koi_srad,koi_prad
3904,359.006880,32.1600,0.834,28.57
3357,2.178119,1.4680,0.921,0.80
6773,85.977043,1.9190,0.452,1.46
4612,201.621794,3.1720,0.923,2.02
4975,132.114000,2.5140,0.523,0.75
...,...,...,...,...
2656,45.155356,5.7845,0.972,7.65
5769,1.235135,5.4400,0.997,0.70
2527,9.848246,5.7471,1.248,2.68
3429,27.685336,4.6610,0.639,3.11


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [7]:
# Scale your data
# all variables (X_train, X_test, etc) from the train test split reflect
#     the selected features
X_train_scaled = MinMaxScaler().fit(X_train).transform(X_train)
X_test_scaled = MinMaxScaler().fit(X_train).transform(X_test)

# Train the Model



In [8]:
trained_model = LogisticRegression()
trained_model.fit(X_train, y_train)
print(f"Training Data Score: {trained_model.score(X_train, y_train)}")
print(f"Testing Data Score: {trained_model.score(X_test, y_test)}")

Training Data Score: 0.5290864009155064
Testing Data Score: 0.532608695652174


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [9]:
# Create the GridSearchCV model
hyperparams = {'C': [1, 5, 10],
              'penalty': ["l1", "l2"]}
#run GridSearchCV to search linear and RBF grids
final_model = GridSearchCV(trained_model, hyperparams, verbose=3)

In [10]:
# Train the model with GridSearch
final_model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=1, penalty=l2, score=0.548, total=   0.5s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=1, penalty=l2, score=0.522, total=   0.3s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=1, penalty=l2, score=0.548, total=   0.3s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=1, penalty=l2, score=0.520, total=   0.3s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ..................... C=1, penalty=l2, score=0.521, total=   0.4s
[CV] C=5, penalty=l1 .................................................
[CV] ....................... C=5, penalty=l1, score=nan, total=   0.0s
[CV] C=5, penalty=l1 .................................................
[CV] ....................... C=5, penalty=l1, score=nan, total=   0.0s
[CV] C=5, penalty=l1 .................................................
[CV] ....................... C=5, penalty=l1, score=nan, total=   0.0s
[CV] C=5, penalty=l1 .................................................
[CV] ....................... C=5, penalty=l1, score=nan, total=   0.0s
[CV] C=5, penalty=l1 .................................................
[CV] ....................... C=5, penalty=l1, score=nan, total=   0.0s
[CV] C=5, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=5, penalty=l2, score=0.546, total=   0.4s
[CV] C=5, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=5, penalty=l2, score=0.517, total=   0.4s
[CV] C=5, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=5, penalty=l2, score=0.545, total=   0.3s
[CV] C=5, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=5, penalty=l2, score=0.520, total=   0.4s
[CV] C=5, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ..................... C=5, penalty=l2, score=0.522, total=   0.5s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................... C=10, penalty=l2, score=0.548, total=   0.3s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................... C=10, penalty=l2, score=0.517, total=   0.3s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................... C=10, penalty=l2, score=0.548, total=   0.4s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................... C=10, penalty=l2, score=0.522, total=   0.4s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    5.8s finished


[CV] .................... C=10, penalty=l2, score=0.522, total=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [11]:
print(final_model.best_params_)
print(final_model.best_score_)

{'C': 1, 'penalty': 'l2'}
0.5319429991485893


In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_test, final_model.predict(X_test_scaled)))

                precision    recall  f1-score   support

     CANDIDATE       0.00      0.00      0.00       422
     CONFIRMED       0.26      1.00      0.41       450
FALSE POSITIVE       0.00      0.00      0.00       876

      accuracy                           0.26      1748
     macro avg       0.09      0.33      0.14      1748
  weighted avg       0.07      0.26      0.11      1748



  _warn_prf(average, modifier, msg_start, len(result))


# Save the Model

In [13]:
filename = 'culhane_log_reg_model(lrm).sav'
joblib.dump(final_model, filename)

['culhane_log_reg_model(lrm).sav']

# Testing

In [14]:
#test to make sure the dump/load doesn't corrupt the file
loaded_model = joblib.load(filename)
#compare scores of the pre-save and post-save models
comparison1 = loaded_model.score(X_test, y_test)
comparison2 = trained_model.score(X_test, y_test)
if comparison1 == comparison2:
    print("Test Successful")
else:
    print("Test Failed")

Test Successful


# Compare

In [15]:
loaded_model2 = joblib.load('culhane_sup_vec_model_(svm).sav')
if loaded_model.score(X_test, y_test) > loaded_model2.score(X_test, y_test):
    print('Higher Score: Logistic Regression')
else:
    print('Higher Score: Support Vector')

Higher Score: Logistic Regression
