# ML Modeling
### Names: Christian Juarez, Analiese Gonzalez, Alyssa Amancio

In this Jupyter notebook, two models are created. The first, labeled `fullModel`, is trained on all of the features from the preprocessed data. The second, labeled `reducModel`, is trained on a reduced set of features from the same preprocessed data.

In [2]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# Silence every warning so cell outputs stay readable.
# NOTE(review): this also hides deprecation and data-quality warnings;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set style for better plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Project root. Defaults to the original local checkout; set the PLANET_ROOT
# environment variable to run the notebook from another location/machine.
ROOT = Path(os.environ.get("PLANET_ROOT", "/Users/Bubs/Desktop/Planet"))

# Load processed data: feature table, labels, and train/val/test assignment.
X = pd.read_parquet(ROOT / "data/processed/features/table_v1.parquet")
y = pd.read_csv(ROOT / "data/processed/labels/labels_v1.csv")
spl = pd.read_csv(ROOT / "data/processed/splits/split_v1.csv")

# Merge all three tables on the shared `kepid` key.
# validate="one_to_one" makes pandas raise a MergeError if `kepid` is
# duplicated on either side, instead of silently multiplying rows.
df = (
    X.merge(y, on="kepid", validate="one_to_one")
     .merge(spl, on="kepid", validate="one_to_one")
)

### Observing the table's features
A glimpse of the merged data table and the list of features that are available to the models.

In [3]:
# Display the merged frame; the rendered output is truncated to the first and last rows/columns.
df

Unnamed: 0,kepid,teff,logg,feh,radius,mass,kepmag,rrmscdpp03p0,rrmscdpp06p0,rrmscdpp12p0,...,detection_eff,rrmscdpp03p0_log,rrmscdpp06p0_log,rrmscdpp12p0_log,nconfp,nkoi,ntce,label_strict,label_lenient,split
0,10000785,5333.0,4.616,-1.00,0.650,0.635,15.749,445.410,499.980,589.300,...,0.000054,6.101238,6.216566,6.380631,0,0,2,0,0,train
1,10000797,6289.0,4.270,-0.44,1.195,0.968,13.994,80.767,60.264,45.939,...,0.001693,4.403874,4.115192,3.848849,0,0,0,0,0,train
2,10000800,5692.0,4.547,-0.04,0.866,0.965,15.379,226.348,184.595,158.220,...,0.000264,5.426482,5.223567,5.070287,0,0,0,0,0,test
3,10000823,6580.0,4.377,-0.16,1.169,1.191,15.558,181.468,148.879,132.140,...,0.000590,5.206575,5.009828,4.891401,0,0,0,0,0,val
4,10000827,5648.0,4.559,-0.10,0.841,0.939,14.841,124.834,92.096,67.532,...,0.000517,4.834964,4.533631,4.227301,0,0,0,0,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150757,9992816,5955.0,4.520,-0.16,0.910,0.999,14.701,192.192,159.996,144.157,...,0.000352,5.263685,5.081380,4.977816,0,0,0,0,0,train
150758,9999784,5591.0,4.559,0.07,0.860,0.977,15.606,211.123,157.659,118.948,...,0.000301,5.357166,5.066757,4.787058,0,0,0,0,0,train
150759,9999869,6110.0,4.450,-0.10,1.010,1.043,13.568,77.290,66.273,63.620,...,0.001134,4.360420,4.208759,4.168524,0,0,0,0,0,train
150760,9999901,5067.0,4.573,0.28,0.807,0.888,13.252,74.759,71.316,78.648,...,0.000689,4.327557,4.281045,4.377617,0,0,0,0,0,train


### Full Model: all features

In [14]:
from sklearn.ensemble import RandomForestClassifier
import pickle

# Feature matrix: every merged column except the split marker and both labels.
# NOTE(review): `kepid` is kept as a feature although it looks like a row
# identifier, and `nconfp`/`nkoi` dominate the printed importances — confirm
# that none of these leak the target before trusting the score.
X = df.drop(columns=['split', 'label_lenient', 'label_strict'])
y = df['label_lenient']

# Partition rows using the precomputed `split` column (computed once, reused).
split = df['split']

X_train, y_train = X[split == 'train'], y[split == 'train']
# The validation partition is prepared here but not scored in this cell.
X_val, y_val = X[split == 'val'], y[split == 'val']
X_test, y_test = X[split == 'test'], y[split == 'test']

# Fixed random_state keeps the fitted forest reproducible across runs.
fullModel = RandomForestClassifier(random_state=42)
fullModel.fit(X_train, y_train)

# Persist the fitted model; the context manager guarantees the file is
# flushed and closed before it is read back below.
filename_one = 'finalized_fullModel_M1.sav'
with open(filename_one, 'wb') as model_file:
    pickle.dump(fullModel, model_file)

# Round-trip check: reload the file just written and score it on the test split.
# SECURITY: pickle.load executes arbitrary code — only load files you created.
with open(filename_one, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

result = loaded_model.score(X_test, y_test)
print(result)


0.9918084436042848


#### Importance of Features Listed

In [12]:
# Pair each feature name with its importance in the fitted full model.
# The names are read from the model itself because the global `X` is
# reassigned by the reduced-model cell; after out-of-order execution,
# `X.columns` may no longer be the columns `fullModel` was trained on.
# Falls back to `X.columns` for estimators that do not expose
# `feature_names_in_`.
feature_names = getattr(fullModel, "feature_names_in_", X.columns)
importances = fullModel.feature_importances_

# zip() would silently truncate on a mismatch, producing mislabeled output.
if len(feature_names) != len(importances):
    raise ValueError(
        f"{len(feature_names)} feature names but {len(importances)} importances; "
        "re-run the full model cell before this one."
    )

results = list(zip(feature_names, importances))
for pair in results:
    print(pair)

('kepid', 0.019353787450185986)
('teff', 0.019706400091493513)
('logg', 0.011584818655851639)
('feh', 0.012754934324715072)
('radius', 0.012743928218487022)
('mass', 0.014800001452362307)
('kepmag', 0.01652014618448349)
('rrmscdpp03p0', 0.011565241371143712)
('rrmscdpp06p0', 0.01081321135531046)
('rrmscdpp12p0', 0.011846488569750378)
('dataspan', 0.042581911501460415)
('dutycycle', 0.014333980872650802)
('nquarters', 0.009600503870999314)
('feh_x_teff', 0.01761594463171127)
('radius_x_kepmag', 0.01339899705910118)
('transit_prob', 0.0130450564213693)
('stellar_density', 0.012105727251262946)
('noise_star_ratio', 0.012172311691087099)
('noise_consistency', 0.01780552726661906)
('obs_quality', 0.015421535773940868)
('detection_eff', 0.011645498999973107)
('rrmscdpp03p0_log', 0.01122256882050056)
('rrmscdpp06p0_log', 0.010827331012909863)
('rrmscdpp12p0_log', 0.011612615892071432)
('nconfp', 0.3312769300599852)
('nkoi', 0.25206854122510824)
('ntce', 0.06157605997546588)


### Reduced Model: selected features

In [11]:
from sklearn.ensemble import RandomForestClassifier
import pickle

# Hand-picked subset of columns used by the reduced model.
# NOTE(review): `kepid` looks like a row identifier rather than a predictive
# feature — confirm it belongs in this list.
reduced_features = [
    'nconfp', 'nkoi', 'ntce', 'dataspan', 'teff',
    'kepid', 'noise_consistency', 'feh_x_teff', 'kepmag', 'obs_quality',
]

# Explicit column selection raises KeyError on a misspelled/missing column,
# whereas DataFrame.filter would silently drop it and train on fewer features.
reduc_df = df[reduced_features]

X = reduc_df
y = df['label_lenient']

# Partition rows using the precomputed `split` column (computed once, reused).
split = df['split']

X_train, y_train = X[split == 'train'], y[split == 'train']
# The validation partition is prepared here but not scored in this cell.
X_val, y_val = X[split == 'val'], y[split == 'val']
X_test, y_test = X[split == 'test'], y[split == 'test']

# Fixed random_state keeps the fitted forest reproducible across runs.
reducModel = RandomForestClassifier(random_state=42)
reducModel.fit(X_train, y_train)

# Persist the fitted model; the context manager guarantees the file is
# flushed and closed before it is read back below.
filename_two = 'finalized_reducModel_M2.sav'
with open(filename_two, 'wb') as model_file:
    pickle.dump(reducModel, model_file)

# Round-trip check: reload the file just written and score it on the test split.
# SECURITY: pickle.load executes arbitrary code — only load files you created.
with open(filename_two, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

result = loaded_model.score(X_test, y_test)
print(result)

0.9919742645839552
