# Home Work Logistic Regression Model

In [111]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Read the CSV and Perform Basic Data Cleaning

In [112]:
df = pd.read_csv("cumulative.csv")

#  Remove features that create noise and result in improved model
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.tail(3)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
9561,CANDIDATE,0,0,0,0,1.739849,1.8e-05,-1.8e-05,133.00127,0.00769,...,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
9562,FALSE POSITIVE,0,0,1,0,0.681402,2e-06,-2e-06,132.18175,0.00285,...,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385
9563,FALSE POSITIVE,0,0,1,1,4.856035,6.4e-05,-6.4e-05,135.9933,0.0108,...,-225.0,4.385,0.054,-0.216,1.193,0.41,-0.137,297.00977,47.121021,14.826


In [113]:
df.shape

(8744, 41)

## Create a Train and Test Split Cleaning

Use `koi_disposition` for the y values

In [114]:
from sklearn.model_selection import train_test_split

In [115]:
y = df['koi_disposition']

X = df.drop(columns=["koi_disposition"])

X = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag']]

In [116]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [117]:
X_train.tail(3)

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
315,0,0,0,0,90.124051,0.000443,-0.000443,356.23415,0.0032,-0.0032,...,-248.0,4.375,0.056,-0.224,1.215,0.41,-0.137,293.22391,38.033581,14.96
6070,1,0,0,0,357.77096,0.01451,-0.01451,309.1578,0.0253,-0.0253,...,-134.0,4.395,0.121,-0.248,0.981,0.372,-0.134,289.88107,41.069221,14.161
2335,0,0,0,0,1.40702,4e-06,-4e-06,131.60897,0.00247,-0.00247,...,-82.0,4.485,0.083,-0.028,0.849,0.034,-0.067,290.43323,51.626041,14.946


In [118]:
y_train.head(5)

5964    FALSE POSITIVE
9410    FALSE POSITIVE
4204    FALSE POSITIVE
5933         CANDIDATE
6996    FALSE POSITIVE
Name: koi_disposition, dtype: object

## Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [119]:
from sklearn.preprocessing import MinMaxScaler

In [120]:
# Scale your data
X_scale = MinMaxScaler().fit(X_train)

In [121]:
scaled_X_train = X_scale.transform(X_train)
scaled_X_test = X_scale.transform(X_test)

In [122]:
# Sample testing
scaled_X_test

array([[1.        , 0.        , 0.        , ..., 0.82761422, 0.61679246,
        0.72055038],
       [0.        , 0.        , 0.        , ..., 0.56804124, 0.18475295,
        0.67847315],
       [0.        , 0.        , 0.        , ..., 0.59535744, 0.40454911,
        0.72720817],
       ...,
       [0.        , 0.        , 0.        , ..., 0.54028689, 0.38039243,
        0.72472259],
       [0.        , 0.        , 0.        , ..., 0.21802835, 0.16108934,
        0.27243675],
       [0.        , 0.        , 0.        , ..., 0.28172331, 0.53591545,
        0.70093209]])

# Create and Train the Logistic Regression Model for Classification

In [123]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(scaled_X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [124]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.4582189691979262
Testing Data Score: 0.4606587374199451


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [125]:
# Create the GridSearchCV model

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, {'C': [1, 10], 'penalty': ["l1", "l2"]}, verbose=3)

In [126]:
# Train the model with GridSearch

grid.fit(X_train, Y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... C=1, penalty=l1, score=0.882, total=   2.0s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV] ..................... C=1, penalty=l1, score=0.882, total=   4.2s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.1s remaining:    0.0s


[CV] ..................... C=1, penalty=l1, score=0.878, total=   3.3s
[CV] C=1, penalty=l2 .................................................




[CV] ..................... C=1, penalty=l2, score=0.676, total=   1.3s
[CV] C=1, penalty=l2 .................................................




[CV] ..................... C=1, penalty=l2, score=0.675, total=   1.7s
[CV] C=1, penalty=l2 .................................................




[CV] ..................... C=1, penalty=l2, score=0.651, total=   1.6s
[CV] C=10, penalty=l1 ................................................
[CV] .................... C=10, penalty=l1, score=0.878, total=   1.4s
[CV] C=10, penalty=l1 ................................................




[CV] .................... C=10, penalty=l1, score=0.885, total=   4.3s
[CV] C=10, penalty=l1 ................................................




[CV] .................... C=10, penalty=l1, score=0.883, total=   4.9s
[CV] C=10, penalty=l2 ................................................




[CV] .................... C=10, penalty=l2, score=0.677, total=   1.4s
[CV] C=10, penalty=l2 ................................................




[CV] .................... C=10, penalty=l2, score=0.673, total=   1.7s
[CV] C=10, penalty=l2 ................................................


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   29.4s finished


[CV] .................... C=10, penalty=l2, score=0.649, total=   1.6s


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [127]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'penalty': 'l1'}
0.8821286977737115


In [128]:
grid

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

# Save the Model

In [129]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'chi.sav'
joblib.dump(model_chi, filename)

NameError: name 'model_chi' is not defined