In [1]:
#Import all necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Import sklearn models to predict outcomes
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from imblearn.over_sampling import SMOTE
import seaborn as sns


# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

In [2]:
# Read the training and test sets from CSV files in the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('exoTrain.csv', 'exoTest.csv'))

FileNotFoundError: [Errno 2] No such file or directory: 'exoTrain.csv'

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# Scatter the flux series of the first 12 training rows labelled 2
# (stars with planets), one small panel per star.
planet_rows = train[train['LABEL'] == 2]
sample_index = np.arange(3197)  # x positions for the flux values

fig = plt.figure(figsize=(15, 40))
for panel in range(1, 13):
    ax = fig.add_subplot(14, 4, panel)
    # Column 0 is LABEL; everything after it is the flux series.
    ax.scatter(sample_index, planet_rows.iloc[panel - 1, 1:], s=1)

In [None]:
# Scatter the flux series of the first 12 training rows labelled 1
# (stars without planets), one small panel per star.
no_planet_rows = train[train['LABEL'] == 1]
sample_index = np.arange(3197)  # x positions for the flux values

fig = plt.figure(figsize=(15, 40))
for panel in range(1, 13):
    ax = fig.add_subplot(14, 4, panel)
    # Column 0 is LABEL; everything after it is the flux series.
    ax.scatter(sample_index, no_planet_rows.iloc[panel - 1, 1:], s=1)

In [None]:
# Number of training rows per class. The flux columns used elsewhere in this
# notebook are named 'FLUX.1', 'FLUX.3188', ... so selecting a plain 'FLUX'
# column presumably fails with a KeyError; count an actual flux column instead.
train.groupby('LABEL')['FLUX.1'].count()

In [None]:
# Class balance of the test set.
test['LABEL'].value_counts()

In [None]:
# Non-null count of every column, split by class.
rows_by_label = train.groupby(['LABEL'])
rows_by_label.count()

In [None]:
# Stacked bar chart of the non-null counts of two flux columns, per class.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is not accepted by recent pandas releases.
flux = train.groupby(['LABEL'])[['FLUX.3188', 'FLUX.1']].count()
p1 = flux.plot(kind = 'bar', stacked = True, 
                   title = 'Fluxes by Stars with and without planets', 
                   color = ['grey','lightgreen'], alpha = .70)
p1.set_xlabel('1: Stars without Planets, 2: Stars with Planets')
p1.set_ylabel('Fluxes')
# The stacked segments are the two flux columns (the classes are on the x-axis),
# so the legend names the columns rather than 'Planets' / 'No Planets'.
p1.legend(['FLUX.3188', 'FLUX.1'])
plt.show()

In [None]:
# Compare the distribution of flux values between the two classes.
# The previous version drew train['LABEL'] twice, so both "Planets" and
# "No Planets" showed the same data (the label column itself).
# NOTE(review): the intended plot is inferred from the title and legend — confirm.
flux_columns = train.columns.drop('LABEL')
flux_with_planets = train.loc[train['LABEL'] == 2, flux_columns].to_numpy().ravel()
flux_without_planets = train.loc[train['LABEL'] == 1, flux_columns].to_numpy().ravel()

fig, ax = plt.subplots()
# density=True puts both classes on a comparable scale even though the
# classes contain different numbers of stars.
ax.hist(flux_with_planets, label="Planets", bins=20, histtype='step', density=True)
ax.hist(flux_without_planets, label="No Planets", bins=15, histtype='step', density=True)
ax.set_xlabel('Flux')
ax.set_ylabel('Density')
ax.set_title('Distribution of Flux of Stars with Planets and Without Planets')
ax.legend()
plt.show()

In [None]:
# One figure per class (label 1 first, then label 2): 40-bin histograms of the
# flux values of the first 12 stars of that class.
for star_label in (1, 2):
    class_rows = train[train['LABEL'] == star_label]
    fig = plt.figure(figsize=(15, 40))
    for panel in range(1, 13):
        ax = fig.add_subplot(14, 4, panel)
        # Column 0 is LABEL; the remaining columns are the flux values.
        class_rows.iloc[panel - 1, 1:].hist(bins=40, ax=ax)

In [None]:
# Separate the feature columns from the LABEL target for both data sets.
X_tr, y_tr = train.drop(columns='LABEL'), train['LABEL']
X_tt, y_tt = test.drop(columns='LABEL'), test['LABEL']

# Using logistic regression to predict the presence of a planet

In [None]:
# Baseline: logistic regression trained on the original training data.
logreg = LogisticRegression().fit(X_tr, y_tr)
logreg_pred = logreg.predict(X_tt)
acc_log = accuracy_score(y_tt, logreg_pred)
# Scale to percent before rounding: round(x, 2) * 100 can print float
# artefacts such as 56.99999999999999.
print(round(acc_log * 100))

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
cf_matrix = confusion_matrix(y_true=y_tt, y_pred=logreg_pred)
print(cf_matrix)

In [None]:
# Annotated heatmap of the logistic-regression confusion matrix: each cell
# shows its name, its raw count and its share of all test predictions.
flat_counts = cf_matrix.flatten()
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = [f"{value:0.0f}" for value in flat_counts]
group_percentages = [f"{share:.2%}" for share in flat_counts / np.sum(cf_matrix)]
labels = np.asarray([
    f"{v1}\n{v2}\n{v3}"
    for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues');

# KNN model

In [None]:
# 5-nearest-neighbours classifier; p=1 selects the Manhattan distance.
knn = KNeighborsClassifier(p=1, n_neighbors=5)

In [None]:
# Fit the Manhattan-distance KNN and measure its accuracy on the test set.
knn_m = knn.fit(X_tr, y_tr)
knn_m_p = knn_m.predict(X_tt)  # knn_m is the fitted `knn` returned by fit()
knn_score = accuracy_score(y_true=y_tt, y_pred=knn_m_p)
knn_score

In [None]:
# Confusion matrix of the Manhattan-distance KNN predictions.
cf_matrix_knn = confusion_matrix(y_true=y_tt, y_pred=knn_m_p)
print(cf_matrix_knn)

In [None]:
# Annotated heatmap of the Manhattan-distance KNN confusion matrix: each cell
# shows its name, its raw count and its share of all test predictions.
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix_knn.flatten()]
# Normalise by this matrix's own total instead of the logistic-regression
# matrix, so the cell does not depend on another model's results.
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix_knn.flatten()/np.sum(cf_matrix_knn)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix_knn, annot=labels, fmt='', cmap='Blues');

In [None]:
# 5-nearest-neighbours classifier; p=2 selects the Euclidean distance.
knn_e = KNeighborsClassifier(p=2, n_neighbors=5)

In [None]:
# Fit the Euclidean-distance KNN and measure its accuracy on the test set.
knn_e.fit(X_tr, y_tr)  # fit() trains in place and returns the same estimator
knn_e_p = knn_e.predict(X_tt)
knn_e_score = accuracy_score(y_true=y_tt, y_pred=knn_e_p)
knn_e_score

In [None]:
# Confusion matrix of the Euclidean-distance KNN predictions.
cf_matrix_knn_e = confusion_matrix(y_true=y_tt, y_pred=knn_e_p)
print(cf_matrix_knn_e)

In [None]:
# Annotated heatmap of the Euclidean-distance KNN confusion matrix: each cell
# shows its name, its raw count and its share of all test predictions.
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix_knn_e.flatten()]
# Normalise by this matrix's own total instead of the logistic-regression
# matrix, so the cell does not depend on another model's results.
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix_knn_e.flatten()/np.sum(cf_matrix_knn_e)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix_knn_e, annot=labels, fmt='', cmap='Blues');

In [None]:
# 2-nearest-neighbours classifier using the Minkowski distance with p=3.
knn_mi = KNeighborsClassifier(p=3, n_neighbors=2)

In [None]:
# Fit the Minkowski-distance (p=3) KNN and measure its accuracy on the test set.
knn_mi = knn_mi.fit(X_tr, y_tr)
knn_mi_p = knn_mi.predict(X_tt)
# Score this model's own predictions; the earlier version scored knn_e_p,
# which simply repeated the Euclidean model's accuracy.
knn_mi_score = accuracy_score(y_tt, knn_mi_p)
knn_mi_score

In [None]:
# Confusion matrix of the Minkowski-distance KNN predictions.
cf_matrix_knn_mi = confusion_matrix(y_true=y_tt, y_pred=knn_mi_p)
print(cf_matrix_knn_mi)

In [None]:
# Annotated heatmap of the Minkowski-distance KNN confusion matrix: each cell
# shows its name, its raw count and its share of all test predictions.
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix_knn_mi.flatten()]
# Normalise by this matrix's own total instead of the logistic-regression
# matrix, so the cell does not depend on another model's results.
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix_knn_mi.flatten()/np.sum(cf_matrix_knn_mi)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix_knn_mi, annot=labels, fmt='', cmap='Blues');

# Decision Tree

In [None]:
# Decision tree with a fixed seed, trained on the original training data.
# fit() returns the estimator itself, so `classifier` and `dt` name the same object.
dt = classifier = DecisionTreeClassifier(random_state=13).fit(X_tr, y_tr)

In [None]:
# Accuracy of the decision tree on the test set.
dt_pred = dt.predict(X_tt)
dt_score = accuracy_score(y_true=y_tt, y_pred=dt_pred)
dt_score

In [None]:
# Confusion matrix of the decision-tree predictions.
cf_matrix_dt = confusion_matrix(y_true=y_tt, y_pred=dt_pred)
print(cf_matrix_dt)

In [None]:
# Annotated heatmap of the decision-tree confusion matrix: each cell shows its
# name, its raw count and its share of all test predictions.
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix_dt.flatten()]
# Normalise by this matrix's own total instead of the logistic-regression
# matrix, so the cell does not depend on another model's results.
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix_dt.flatten()/np.sum(cf_matrix_dt)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix_dt, annot=labels, fmt='', cmap='Blues');

# Using SMOTE to create synthetic samples for class 2 (exoplanets present) because the classes are imbalanced

In [None]:
# Oversample every class except the majority one with SMOTE (seeded for
# reproducibility). Only the training data is resampled.
sm = SMOTE(sampling_strategy='not majority', random_state = 13)
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias has been removed from the library.
SM_X_tr, SM_y_tr = sm.fit_resample(X_tr, y_tr)

In [None]:
# Logistic regression retrained on the SMOTE-resampled training data,
# evaluated on the untouched test set.
logreg = LogisticRegression()
logreg.fit(SM_X_tr, SM_y_tr)
logreg_pred_sm = logreg.predict(X_tt)
acc_log_sm = accuracy_score(y_true=y_tt, y_pred=logreg_pred_sm)
acc_log_sm

In [None]:
# Confusion matrix of the SMOTE-trained logistic regression, printed and then
# drawn as an annotated heatmap (cell name, raw count, share of predictions).
cf_matrix_log_sm = confusion_matrix(y_tt, logreg_pred_sm)
print(cf_matrix_log_sm)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix_log_sm.flatten()]
# Normalise by this matrix's own total instead of the baseline
# logistic-regression matrix, so the cell stands on its own.
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix_log_sm.flatten()/np.sum(cf_matrix_log_sm)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix_log_sm, annot=labels, fmt='', cmap='Blues');

In [None]:
# 5-nearest-neighbours classifier with Euclidean distance (p=2) for the SMOTE data.
knn_sm = KNeighborsClassifier(p=2, n_neighbors=5)

In [None]:
# Train the dedicated SMOTE classifier (knn_sm). Fitting knn_e here would
# retrain the baseline Euclidean model in place and leave knn_sm unused.
knn_e_sm = knn_sm.fit(SM_X_tr, SM_y_tr)
knn_e_sm_p = knn_e_sm.predict(X_tt)
knn_e_sm_score = accuracy_score(y_tt, knn_e_sm_p)
knn_e_sm_score

In [None]:
# Confusion matrix of the SMOTE-trained KNN, printed and then drawn as an
# annotated heatmap (cell name, raw count, share of predictions).
cf_matrix_knn_sm = confusion_matrix(y_tt, knn_e_sm_p)
print(cf_matrix_knn_sm)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix_knn_sm.flatten()]
# Normalise by this matrix's own total instead of the logistic-regression
# matrix, so the cell does not depend on another model's results.
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix_knn_sm.flatten()/np.sum(cf_matrix_knn_sm)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix_knn_sm, annot=labels, fmt='', cmap='Blues');

In [None]:
# Train a separate tree on the SMOTE-resampled data. Calling classifier.fit
# again would refit the very estimator that `dt` refers to, silently
# replacing the baseline model.
dt_sm = DecisionTreeClassifier(random_state=13).fit(SM_X_tr, SM_y_tr)

In [None]:
# Accuracy of the SMOTE-trained tree on the test set. Stored under its own
# name so the baseline `dt_score` is not overwritten.
dt_sm_pred = dt_sm.predict(X_tt)  
dt_sm_score = accuracy_score(y_tt, dt_sm_pred)
dt_sm_score

In [None]:
# Class counts of the resampled training labels.
SM_y_tr.value_counts()

In [None]:
# Confusion matrix of the SMOTE-trained decision tree, printed and then drawn
# as an annotated heatmap (cell name, raw count, share of predictions).
cf_matrix_dt_sm = confusion_matrix(y_tt, dt_sm_pred)
print(cf_matrix_dt_sm)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix_dt_sm.flatten()]
# Normalise by this matrix's own total instead of the logistic-regression
# matrix, so the cell does not depend on another model's results.
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix_dt_sm.flatten()/np.sum(cf_matrix_dt_sm)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix_dt_sm, annot=labels, fmt='', cmap='Blues');