In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.externals import joblib
from multiprocessing import cpu_count

In [2]:
# Set DataFrame display option
pd.options.display.max_columns = None

In [3]:
# Show the number of CPU cores
print(cpu_count())

12


# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,615.8,19.5,-19.5,2.26,0.26,-0.15,793.0,93.59,29.45,-16.65,35.8,1.0,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443.0,9.11,2.87,-1.62,25.8,2.0,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638.0,39.3,31.04,-10.49,76.3,1.0,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395.0,891.96,668.95,-230.35,505.6,1.0,5805.0,157.0,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406.0,926.16,874.33,-314.24,40.9,1.0,6031.0,169.0,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [5]:
df.shape

(8744, 41)

In [6]:
df.describe()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
count,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0
mean,0.157136,0.241194,0.201167,0.125114,56.080618,0.001839523,-0.001839523,164.563271,0.0093,-0.0093,0.718315,1.972656,-0.330185,5.521409,0.317958,-0.317958,23556.89,119.695174,-119.695174,103.93219,17.553652,-34.046233,1087.503545,7237.635,3387.183,-3775.831,268.401315,1.240279,5698.246569,144.1035,-161.354758,4.310223,0.120733,-0.140411,1.704566,0.352884,-0.379933,292.075061,43.828259,14.273969
std,0.36395,0.427832,0.400895,0.330867,117.38528,0.007276504,0.007276504,66.476457,0.021662,0.021662,3.176618,9.464462,1.227048,6.318934,0.641691,0.641691,81879.98,4196.518567,4196.518567,3151.780687,395.040369,1223.495531,839.110779,159096.6,51204.5,88918.09,811.988767,0.654931,797.953594,47.284063,72.986448,0.431557,0.132813,0.082936,5.682429,0.906364,1.810943,4.772918,3.599786,1.343509
min,0.0,0.0,0.0,0.0,0.25982,1.1e-08,-0.1568,120.515914,9e-06,-0.569,0.0,0.0,-59.32,0.167,0.0,-20.2,4.5,0.0,-388600.0,0.14,0.0,-77180.0,92.0,0.02,0.0,-5600031.0,1.6,1.0,2661.0,0.0,-1762.0,0.047,0.0,-1.207,0.109,0.0,-103.825,279.85272,36.577381,6.966
25%,0.0,0.0,0.0,0.0,2.667824,5.28675e-06,-0.000245625,132.729408,0.0012,-0.01,0.197,0.04,-0.447,2.43775,0.050275,-0.331,160.6,9.475,-47.6,1.4,0.24,-2.0325,551.0,21.815,11.1,-294.4275,12.4,1.0,5296.0,105.0,-197.0,4.21575,0.043,-0.195,0.826,0.12575,-0.247,288.670237,40.805911,13.474
50%,0.0,0.0,0.0,0.0,8.970985,3.323e-05,-3.323e-05,136.910235,0.00402,-0.00402,0.5405,0.192,-0.207,3.778935,0.1375,-0.1375,421.85,20.2,-20.2,2.4,0.51,-0.31,884.0,144.625,78.215,-43.89,23.65,1.0,5757.0,157.0,-159.0,4.439,0.07,-0.127,0.997,0.246,-0.111,292.285005,43.703989,14.534
75%,0.0,0.0,0.0,0.0,34.190033,0.000245625,-5.28675e-06,169.975942,0.01,-0.0012,0.889,0.37885,-0.045,6.172,0.331,-0.050275,1462.875,47.6,-9.475,14.8725,2.5425,-0.14,1381.0,859.585,530.435,-6.28,79.3,1.0,6109.0,174.0,-112.0,4.544,0.149,-0.087,1.34625,0.356,-0.069,295.90051,46.722135,15.31825
max,1.0,1.0,1.0,1.0,1071.232624,0.1568,-1.1e-08,1472.522306,0.569,-9e-06,100.806,85.54,0.0,138.54,20.2,0.0,1541400.0,388600.0,0.0,200346.0,21640.0,0.0,14667.0,10947550.0,3617133.0,0.0,9054.7,8.0,15896.0,676.0,0.0,5.364,1.472,0.0,180.013,33.091,0.0,301.72076,52.33601,19.065


In [7]:
df.dtypes

koi_disposition       object
koi_fpflag_nt          int64
koi_fpflag_ss          int64
koi_fpflag_co          int64
koi_fpflag_ec          int64
koi_period           float64
koi_period_err1      float64
koi_period_err2      float64
koi_time0bk          float64
koi_time0bk_err1     float64
koi_time0bk_err2     float64
koi_impact           float64
koi_impact_err1      float64
koi_impact_err2      float64
koi_duration         float64
koi_duration_err1    float64
koi_duration_err2    float64
koi_depth            float64
koi_depth_err1       float64
koi_depth_err2       float64
koi_prad             float64
koi_prad_err1        float64
koi_prad_err2        float64
koi_teq              float64
koi_insol            float64
koi_insol_err1       float64
koi_insol_err2       float64
koi_model_snr        float64
koi_tce_plnt_num     float64
koi_steff            float64
koi_steff_err1       float64
koi_steff_err2       float64
koi_slogg            float64
koi_slogg_err1       float64
koi_slogg_err2

# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
Y = df["koi_disposition"].values
Y.shape

(8744,)

In [9]:
Y[:10]

array(['CONFIRMED', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE',
       'CONFIRMED', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED',
       'FALSE POSITIVE', 'CONFIRMED'], dtype=object)

In [10]:
X = df.drop(columns=["koi_disposition"]).values
X.shape

(8744, 40)

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=7)

In [12]:
print(X_train.shape)
print(X_test.shape)

(6995, 40)
(1749, 40)


In [13]:
X_train, X_dev, Y_train, Y_dev = train_test_split(X_train, Y_train, test_size=0.25, random_state=7)

In [14]:
print(X_train.shape)
print(X_dev.shape)

(5246, 40)
(1749, 40)


# Pre-processing

Scale the data using the MinMaxScaler

In [15]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_dev_scaled = scaler.transform(X_dev)
X_test_scaled = scaler.transform(X_test)

In [16]:
X_train_scaled[0]

array([0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.51158796e-03, 3.82716890e-05, 9.99961728e-01, 1.78492718e-01,
       8.49397873e-04, 9.99150602e-01, 9.96666865e-02, 5.77858312e-01,
       9.69757249e-01, 1.39272111e-02, 1.37128713e-03, 9.98628713e-01,
       1.06759605e-02, 3.02466263e-03, 9.96975337e-01, 4.17343372e-03,
       9.17744917e-03, 9.98965535e-01, 5.80441101e-02, 1.94552025e-05,
       4.51057688e-05, 9.99985950e-01, 9.95132875e-03, 0.00000000e+00,
       2.10502456e-01, 2.64792899e-01, 8.98410897e-01, 8.48598834e-01,
       3.06451613e-02, 8.27842721e-01, 4.05216115e-03, 6.01371974e-03,
       9.99229473e-01, 8.70295864e-01, 7.62934453e-01, 7.24687991e-01])

In [17]:
X_dev_scaled[0]

array([1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.05755244e-01, 1.66325151e-02, 9.83367485e-01, 4.86064168e-02,
       3.16196399e-02, 9.68380360e-01, 1.81338412e-03, 3.70469956e-03,
       9.96918409e-01, 1.88831986e-02, 2.77227723e-02, 9.72277228e-01,
       2.98871792e-04, 1.66123779e-03, 9.98338762e-01, 3.28432108e-05,
       6.46950092e-05, 9.99996761e-01, 4.65891605e-02, 9.22557356e-06,
       1.80775676e-05, 9.99996726e-01, 5.71258826e-04, 0.00000000e+00,
       1.64790329e-01, 2.01183432e-01, 9.48354143e-01, 5.86797066e-01,
       2.41935484e-02, 9.67056323e-01, 2.25397990e-02, 2.60493790e-02,
       9.98535998e-01, 9.06553167e-01, 5.52476995e-01, 5.24836763e-01])

In [18]:
X_test_scaled[0]

array([0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.76697024e-03, 9.19005231e-06, 9.99990810e-01, 8.65543260e-03,
       1.32216433e-03, 9.98677836e-01, 9.94980458e-03, 1.98737433e-03,
       9.97859069e-01, 2.47799375e-02, 7.37623762e-04, 9.99262376e-01,
       1.09735649e-01, 9.72545370e-03, 9.90274546e-01, 1.86577389e-04,
       2.73105360e-04, 9.99967479e-01, 8.65105146e-02, 8.03048689e-05,
       1.34273066e-04, 9.99950087e-01, 3.38299477e-02, 0.00000000e+00,
       1.99848886e-01, 2.73668639e-01, 8.95005675e-01, 8.64209140e-01,
       2.41935484e-02, 8.97980871e-01, 3.21282462e-03, 3.26372730e-03,
       9.99556947e-01, 5.10255765e-01, 7.13127392e-02, 8.00727333e-01])

# Baseline Models

In [19]:
n_folds = 10 # 10 folds cross validation
seed = 23 # To repeat the results

In [62]:
"""
Logistic Regression (using all features)
"""
c_values = [0.01, 0.05, 0.25, 0.5, 1, 10, 100, 200, 500, 1000, 1200, 1500, 2000, 5000]
penalties = ["l1", "l2"]
param_grid = dict(penalty=penalties, C=c_values)
model = LogisticRegression()
cv = KFold(n_splits=n_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = grid.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = LogisticRegression(**results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8808616088448341 with {'C': 1200, 'penalty': 'l1'}

Top 3 performances are: 

0.8808616088448341 (0.009414950805762642) with: {'C': 1200, 'penalty': 'l1'}
Dev set accuracy: 0.8816466552315609
Test set accuracy: 0.8822184105202973
------------------------------------------------------------
0.8808616088448341 (0.00914551899881242) with: {'C': 1500, 'penalty': 'l1'}
Dev set accuracy: 0.8810748999428245
Test set accuracy: 0.8822184105202973
------------------------------------------------------------
0.8808616088448341 (0.009801780994178883) with: {'C': 2000, 'penalty': 'l1'}
Dev set accuracy: 0.8810748999428245
Test set accuracy: 0.8822184105202973
------------------------------------------------------------


In [66]:
"""
SVC (using all features)
"""
c_values = [0.01, 0.05, 0.25, 0.5, 1, 10, 100, 200, 500, 800, 1000, 1200]
kernels = ["linear", "rbf"]
param_grid = dict(C=c_values, kernel=kernels)
model = SVC()
cv = KFold(n_splits=n_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = grid.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = SVC(**results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8894395730080061 with {'C': 1000, 'kernel': 'linear'}

Top 3 performances are: 

0.8894395730080061 (0.009871853021316454) with: {'C': 1000, 'kernel': 'linear'}
Dev set accuracy: 0.8902229845626072
Test set accuracy: 0.8839336763865066
------------------------------------------------------------
0.8894395730080061 (0.009332956443723362) with: {'C': 1200, 'kernel': 'linear'}
Dev set accuracy: 0.8907947398513436
Test set accuracy: 0.8833619210977701
------------------------------------------------------------
0.8890583301563095 (0.010403369814827734) with: {'C': 500, 'kernel': 'linear'}
Dev set accuracy: 0.8896512292738707
Test set accuracy: 0.8833619210977701
------------------------------------------------------------


In [52]:
"""
Decision Tree
"""
criterion = ["gini", "entropy"]
param_grid = dict(criterion=criterion)
model = DecisionTreeClassifier(random_state=seed)
cv = KFold(n_splits=n_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = grid.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 2 performances are: \n")
for i in range(2):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = DecisionTreeClassifier(random_state=seed, **results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8520777735417461 with {'criterion': 'entropy'}

Top 2 performances are: 

0.8520777735417461 (0.007234512015127407) with: {'criterion': 'entropy'}
Dev set accuracy: 0.8484848484848485
Test set accuracy: 0.8502001143510578
------------------------------------------------------------
0.85055280213496 (0.012403770053758824) with: {'criterion': 'gini'}
Dev set accuracy: 0.8479130931961121
Test set accuracy: 0.8496283590623214
------------------------------------------------------------


# Dimension Reduction

In [36]:
"""
Combine PCA and Logistic Regression
"""
# Create a pipeline
pca = PCA()
lr = LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('lr', lr)])
param_dict = {"pca__n_components": [20, 24, 28, 32, 36, 37, 38, 39],
              "lr__C": [0.01, 0.5, 1, 10, 100, 200, 500, 1000, 1200, 1500, 2000],
              "lr__penalty": ["l1", "l2"]}
cv = KFold(n_splits=n_folds, random_state=seed)
estimator = GridSearchCV(pipe, param_dict, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = estimator.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = pipe.set_params(**results_list_sorted[i][2], lr__n_jobs=10)
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8797178802897445 with {'lr__C': 1200, 'lr__penalty': 'l1', 'pca__n_components': 36}

Top 3 performances are: 

0.8797178802897445 (0.010736084662026094) with: {'lr__C': 1200, 'lr__penalty': 'l1', 'pca__n_components': 36}


  " = {}.".format(self.n_jobs))


Dev set accuracy: 0.8839336763865066
Test set accuracy: 0.8776443682104059
------------------------------------------------------------
0.8795272588638963 (0.01065703232095771) with: {'lr__C': 1500, 'lr__penalty': 'l1', 'pca__n_components': 38}


  " = {}.".format(self.n_jobs))


Dev set accuracy: 0.8833619210977701
Test set accuracy: 0.8770726129216695
------------------------------------------------------------
0.8795272588638963 (0.010951153877647574) with: {'lr__C': 2000, 'lr__penalty': 'l1', 'pca__n_components': 36}


  " = {}.".format(self.n_jobs))


Dev set accuracy: 0.8805031446540881
Test set accuracy: 0.8759291023441966
------------------------------------------------------------


In [35]:
"""
Combine PCA and SVM
"""
# Create a pipeline
pca = PCA()
svc = SVC()
pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])
param_dict = {"pca__n_components": [20, 24, 28, 32, 36, 37, 38, 39],
              "svc__C": [0.01, 0.5, 1, 10, 100, 200, 500, 1000, 1200, 1500, 2000],
              "svc__kernel": ["linear", "rbf"]}
cv = KFold(n_splits=n_folds, random_state=seed)
estimator = GridSearchCV(pipe, param_dict, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = estimator.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = pipe.set_params(**results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8902020587113991 with {'pca__n_components': 32, 'svc__C': 500, 'svc__kernel': 'linear'}

Top 3 performances are: 

0.8902020587113991 (0.010313491107285749) with: {'pca__n_components': 32, 'svc__C': 500, 'svc__kernel': 'linear'}
Dev set accuracy: 0.8890794739851343
Test set accuracy: 0.8822184105202973
------------------------------------------------------------
0.8900114372855509 (0.009829016513236158) with: {'pca__n_components': 36, 'svc__C': 1200, 'svc__kernel': 'linear'}
Dev set accuracy: 0.8902229845626072
Test set accuracy: 0.8833619210977701
------------------------------------------------------------
0.8900114372855509 (0.009829016513236158) with: {'pca__n_components': 37, 'svc__C': 1200, 'svc__kernel': 'linear'}
Dev set accuracy: 0.8902229845626072
Test set accuracy: 0.8833619210977701
------------------------------------------------------------


In [37]:
"""
Combine K Best and Logistic Regression
"""
# Create a pipeline
k_best = SelectKBest()
lr = LogisticRegression()
pipe = Pipeline(steps=[('k_best', k_best), ('lr', lr)])
param_dict = {"k_best__k": [10, 20, 24, 28, 30, 32, 36, 37, 38, 39],
              "lr__C": [0.01, 0.5, 1, 10, 100, 200, 500, 1000, 1200, 1500, 2000],
              "lr__penalty": ["l1", "l2"]}
cv = KFold(n_splits=n_folds, random_state=seed)
estimator = GridSearchCV(pipe, param_dict, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = estimator.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = pipe.set_params(**results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8810522302706825 with {'k_best__k': 38, 'lr__C': 2000, 'lr__penalty': 'l1'}

Top 3 performances are: 

0.8810522302706825 (0.010894270786154623) with: {'k_best__k': 38, 'lr__C': 2000, 'lr__penalty': 'l1'}
Dev set accuracy: 0.8805031446540881
Test set accuracy: 0.8822184105202973
------------------------------------------------------------
0.8810522302706825 (0.010342897760711256) with: {'k_best__k': 39, 'lr__C': 2000, 'lr__penalty': 'l1'}
Dev set accuracy: 0.8805031446540881
Test set accuracy: 0.8822184105202973
------------------------------------------------------------
0.8808616088448341 (0.011186227510145716) with: {'k_best__k': 38, 'lr__C': 1500, 'lr__penalty': 'l1'}
Dev set accuracy: 0.8799313893653516
Test set accuracy: 0.8827901658090337
------------------------------------------------------------


In [38]:
"""
Combine K Best and SVM
"""
# Create a pipeline
k_best = SelectKBest()
svc = SVC()
pipe = Pipeline(steps=[('k_best', k_best), ('svc', svc)])
param_dict = {"k_best__k": [10, 20, 24, 28, 30, 32, 36, 37, 38, 39],
              "svc__C": [0.01, 0.5, 1, 10, 100, 200, 500, 1000, 1200, 1500, 2000],
              "svc__kernel": ["linear", "rbf"]}
cv = KFold(n_splits=n_folds, random_state=seed)
estimator = GridSearchCV(pipe, param_dict, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = estimator.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = pipe.set_params(**results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8900114372855509 with {'k_best__k': 32, 'svc__C': 1500, 'svc__kernel': 'linear'}

Top 3 performances are: 

0.8900114372855509 (0.012211591301931831) with: {'k_best__k': 32, 'svc__C': 1500, 'svc__kernel': 'linear'}
Dev set accuracy: 0.8839336763865066
Test set accuracy: 0.8805031446540881
------------------------------------------------------------
0.8900114372855509 (0.012115945994352775) with: {'k_best__k': 32, 'svc__C': 2000, 'svc__kernel': 'linear'}
Dev set accuracy: 0.884505431675243
Test set accuracy: 0.8810748999428245
------------------------------------------------------------
0.8900114372855509 (0.010180096102591496) with: {'k_best__k': 36, 'svc__C': 1000, 'svc__kernel': 'linear'}
Dev set accuracy: 0.8867924528301887
Test set accuracy: 0.8833619210977701
------------------------------------------------------------


# Ensemble Methods

In [43]:
"""
Bagged Decision Trees (using all features)
"""
num_trees = [5, 10, 20, 30, 40, 50, 60, 80, 100]
param_grid = dict(n_estimators=num_trees)
cart = DecisionTreeClassifier()
model = BaggingClassifier(base_estimator=cart, random_state=seed)
cv = KFold(n_splits=n_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = grid.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = BaggingClassifier(base_estimator=cart, random_state=seed, **results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8961113229126954 with {'n_estimators': 40}

Top 3 performances are: 

0.8961113229126954 (results_list_sorted[i][1]) with: {'n_estimators': 40}
Dev set accuracy: 0.89937106918239
Test set accuracy: 0.8890794739851343
------------------------------------------------------------
0.8959207014868471 (results_list_sorted[i][1]) with: {'n_estimators': 50}
Dev set accuracy: 0.8982275586049171
Test set accuracy: 0.8925100057175529
------------------------------------------------------------
0.8953488372093024 (results_list_sorted[i][1]) with: {'n_estimators': 60}
Dev set accuracy: 0.8999428244711264
Test set accuracy: 0.8902229845626072
------------------------------------------------------------


In [50]:
"""
Random Forest
"""
num_trees = [5, 10, 20, 30, 40, 50, 60, 80, 100]
max_features = [5, 10, 15, 20, 25, 30, 32, 35, 36, 37, 38, 39, 40]
param_grid = dict(n_estimators=num_trees, max_features=max_features)
model = RandomForestClassifier(random_state=seed)
cv = KFold(n_splits=n_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = grid.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = RandomForestClassifier(**results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.8985894014487228 with {'max_features': 15, 'n_estimators': 60}

Top 3 performances are: 

0.8985894014487228 (0.009812785857074733) with: {'max_features': 15, 'n_estimators': 60}
Dev set accuracy: 0.906232132647227
Test set accuracy: 0.8953687821612349
------------------------------------------------------------
0.8978269157453298 (0.009878256363240153) with: {'max_features': 10, 'n_estimators': 80}
Dev set accuracy: 0.9045168667810177
Test set accuracy: 0.8959405374499714
------------------------------------------------------------
0.8976362943194816 (0.012719441389811258) with: {'max_features': 15, 'n_estimators': 100}
Dev set accuracy: 0.902229845626072
Test set accuracy: 0.8965122927387078
------------------------------------------------------------


In [49]:
"""
AdaBoost
"""
num_trees = [5, 10, 20, 30, 40, 50, 60, 80, 100]
param_grid = dict(n_estimators=num_trees)
model = AdaBoostClassifier(random_state=seed)
cv = KFold(n_splits=n_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=cv, n_jobs=10)
grid_result = grid.fit(X_train_scaled, Y_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}\n")
means = grid_result.cv_results_['mean_test_score'] 
stds = grid_result.cv_results_['std_test_score'] 
params = grid_result.cv_results_['params']
results_list = []
for mean, stdev, param in zip(means, stds, params):
    results_list.append([mean, stdev, param])

results_list_sorted = sorted(results_list, key=lambda x: x[0], reverse=True)

# Show the top 3 best performances
print("Top 3 performances are: \n")
for i in range(3):
    print(f"{results_list_sorted[i][0]} ({results_list_sorted[i][1]}) with: {results_list_sorted[i][2]}")
    model = AdaBoostClassifier(random_state=seed, **results_list_sorted[i][2])
    model.fit(X_train_scaled, Y_train)
    print(f"Dev set accuracy: {accuracy_score(Y_dev, model.predict(X_dev_scaled))}")
    print(f"Test set accuracy: {accuracy_score(Y_test, model.predict(X_test_scaled))}")
    print("---" * 20)

Best: 0.7878383530308807 with {'n_estimators': 30}

Top 3 performances are: 

0.7878383530308807 (0.03656549261744601) with: {'n_estimators': 30}
Dev set accuracy: 0.773013150371641
Test set accuracy: 0.7821612349914236
------------------------------------------------------------
0.7764010674799847 (0.030015679470027375) with: {'n_estimators': 100}
Dev set accuracy: 0.7524299599771298
Test set accuracy: 0.7684391080617495
------------------------------------------------------------
0.7731605032405643 (0.027904982160973597) with: {'n_estimators': 80}
Dev set accuracy: 0.7347055460263008
Test set accuracy: 0.7272727272727273
------------------------------------------------------------


# Final Evaluation

In [58]:
models = []
models.append(SVC(C=1000, kernel="linear"))
models.append(SVC(C=1200, kernel="linear"))
models.append(Pipeline(steps=[('pca', PCA(n_components=32)), ('svc', SVC(C=500, kernel="linear"))]))
models.append(Pipeline(steps=[('pca', PCA(n_components=36)), ('svc', SVC(C=1200, kernel="linear"))]))
models.append(RandomForestClassifier(max_features=15, n_estimators=60, random_state=seed))
models.append(RandomForestClassifier(max_features=10, n_estimators=80, random_state=seed))
models.append(RandomForestClassifier(max_features=15, n_estimators=100, random_state=seed))

for i in range(len(models)):
    model = models[i]
    model.fit(X_train_scaled, Y_train)
    print(f"Model no. {i}")
    print(f"Final average accuracy: {(model.score(X_dev_scaled, Y_dev) + model.score(X_test_scaled, Y_test)) / 2}")
    print("---" * 40) 

Model no. 0
Final average accuracy: 0.8870783304745569
------------------------------------------------------------------------------------------------------------------------
Model no. 1
Final average accuracy: 0.8870783304745569
------------------------------------------------------------------------------------------------------------------------
Model no. 2
Final average accuracy: 0.8856489422527158
------------------------------------------------------------------------------------------------------------------------
Model no. 3
Final average accuracy: 0.8867924528301887
------------------------------------------------------------------------------------------------------------------------
Model no. 4
Final average accuracy: 0.8987993138936535
------------------------------------------------------------------------------------------------------------------------
Model no. 5
Final average accuracy: 0.9002287021154946
-----------------------------------------------------------------

In [63]:
"""
Save the best model
"""
# Create a pipeline
scaler = MinMaxScaler()
rf = RandomForestClassifier(max_features=10, n_estimators=80, random_state=seed)
pipe = Pipeline(steps=[('scaler', scaler), ('rf', rf)])
pipe.fit(X_train, Y_train)
print(f"Final average accuracy: {(pipe.score(X_dev, Y_dev) + pipe.score(X_test, Y_test)) / 2}")

Final average accuracy: 0.9002287021154946


In [64]:
# Save the final model
joblib.dump(pipe, "model/clf.pkl")

['model/clf.pkl']

In [65]:
# Load model and test again
model = joblib.load("model/clf.pkl")
print(f"Final average accuracy: {(model.score(X_dev, Y_dev) + model.score(X_test, Y_test)) / 2}")

Final average accuracy: 0.9002287021154946
