# Logistic Regression Model

### Exoplanet Research

#### By Erick Hernandez

Given the information about the exoplanet research, a logistic regression model will be used with selected columns in order to return boolean values for individual results.

In [1]:
# Import of modules

import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Reading of attached CSV

df = pd.read_csv("Resources/exoplanet_data.csv")
df.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
df["koi_score"].fillna(0, inplace=True)

In [4]:
# Assign X (data) and y (target)

X = df[["koi_score", "koi_period", "koi_duration", "ra", "dec"]]
y = df["koi_disposition"]

X = X[:100]
y = y[:100]

In [5]:
# Split our data into training and testing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Create a Logistic Regression Model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [7]:
# Scaling information
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
scaler

StandardScaler()

In [8]:
# Fit (train) or model using the training data

classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [9]:
# Validate the model using the test data

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.92
Testing Data Score: 0.88


In [10]:
# Make predictions

predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   ['FALSE POSITIVE' 'CONFIRMED' 'CONFIRMED' 'CONFIRMED' 'FALSE POSITIVE'
 'FALSE POSITIVE' 'CONFIRMED' 'CONFIRMED' 'CONFIRMED' 'CONFIRMED']
First 10 Actual labels: ['FALSE POSITIVE', 'CANDIDATE', 'CONFIRMED', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED']


In [11]:
# Compare results

pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,FALSE POSITIVE,FALSE POSITIVE
1,CONFIRMED,CANDIDATE
2,CONFIRMED,CONFIRMED
3,CONFIRMED,CONFIRMED
4,FALSE POSITIVE,FALSE POSITIVE
5,FALSE POSITIVE,FALSE POSITIVE
6,CONFIRMED,CONFIRMED
7,CONFIRMED,CONFIRMED
8,CONFIRMED,CONFIRMED
9,CONFIRMED,CONFIRMED
