**Import feature variables that i just prepared for the model experiments from preprocess.py file**

In [2]:
import preprocess as pp

In [3]:
# tcas data
X1_train = pp.BOWtcas_train
X1_test = pp.BOWtcas_test
y1_train = pp.ytcas_train
y1_test = pp.ytcas_test

# combined data
X2_train = pp.BOWdf_train
X2_test = pp.BOWdf_test
y2_train = pp.ydf_train
y2_test = pp.ydf_test

-------------------------------------------------------------------------------------------------------------

# **Modelling**

now i'll instantiate the model that can works with my features, for easy to build i choose sklearn for instantiatation.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [5]:
from sklearn.model_selection import KFold, GridSearchCV

In [6]:
kf = KFold(n_splits=5, shuffle=True)

-------------------------------------------------------------------------------------------------------------

In [7]:
from sklearn.pipeline import Pipeline

**1. Start with  tcas data**

* Logistic Regression

In [9]:
%time
lr_pipeline = Pipeline([('lr', LogisticRegression())])
param_lr = {
    'lr__C': [0.1, 1, 10],
    'lr__penalty': ['l2'],
    'lr__solver': ['lbfgs'] 
}
# Initialize GridSearchCV
lr_grid_search = GridSearchCV(lr_pipeline, param_lr, cv=kf, scoring='accuracy')
# Perform GridSearchCV to find the best combination of hyperparameters
lr_grid_search.fit(X1_train, y1_train)

# Get the best hyperparameters
print(lr_grid_search.best_params_)
best_lr_pipeline = lr_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'lr__C': 1, 'lr__penalty': 'l2', 'lr__solver': 'lbfgs'}


* Naive Bayes

In [10]:
%time
nb_pipeline = Pipeline([('nb', MultinomialNB())])
param_nb = {
    'nb__alpha': [0.1, 1.0, 10.0]
}
# Initialize GridSearchCV
nb_grid_search = GridSearchCV(nb_pipeline, param_nb, cv=kf, scoring='accuracy')
# Perform GridSearchCV to find the best combination of hyperparameters
nb_grid_search.fit(X1_train, y1_train)
# Get the best hyperparameters
print(nb_grid_search.best_params_)
best_nb_pipeline = nb_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'nb__alpha': 1.0}


* Random Forest

In [11]:
%time
rf_pipeline = Pipeline([('rf', RandomForestClassifier())])
param_rf = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20],
}
rf_grid_search = GridSearchCV(rf_pipeline, param_rf, cv=kf, scoring='accuracy')
# Perform GridSearchCV to find the best combination of hyperparameters
rf_grid_search.fit(X1_train, y1_train)
# Get the best hyperparameters
print(rf_grid_search.best_params_)
best_rf_pipeline = rf_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'rf__max_depth': None, 'rf__n_estimators': 300}


* XGBoost

In [12]:
%time
xg_pipeline = Pipeline([('xg', xgb.XGBClassifier())])
param_xg = {
    'xg__n_estimators': [100, 200, 300],
    'xg__max_depth': [3, 5, 7],
    'xg__learning_rate': [0.1, 0.01, 0.001]
}
xg_grid_search = GridSearchCV(xg_pipeline, param_xg, cv=kf, scoring='accuracy')
# Perform GridSearchCV to find the best combination of hyperparameters
xg_grid_search.fit(X1_train, y1_train)

# Get the best hyperparameters
print(xg_grid_search.best_params_)
best_xg_pipeline = xg_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'xg__learning_rate': 0.1, 'xg__max_depth': 3, 'xg__n_estimators': 100}


* Neural networks

In [13]:
%time
nn_pipeline = Pipeline([('nn', MLPClassifier())])
param_nn = {
    'nn__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (50, 50, 50)],
    'nn__activation': ['relu', 'tanh', 'logistic'],
    'nn__solver': ['adam', 'sgd'],
    'nn__max_iter': [200, 500, 1000],
}
nn_grid_search = GridSearchCV(nn_pipeline, param_nn, cv=kf, scoring='accuracy')
nn_grid_search.fit(X1_train, y1_train)

# Get the best hyperparameters
print(nn_grid_search.best_params_)
best_nn_pipeline = nn_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'nn__activation': 'logistic', 'nn__hidden_layer_sizes': (50, 50, 50), 'nn__max_iter': 200, 'nn__solver': 'adam'}


-------------------------------------------------------------------------------------------------------------

**2. Lastly does with combined data**

* Logistic Regression

In [14]:
%time
lrc_pipeline = Pipeline([('lrc', LogisticRegression())])
param_lrc = {
    'lrc__C': [0.1, 1, 10],
    'lrc__penalty': ['l2'],
    'lrc__solver': ['lbfgs'] 
}
# Initialize GridSearchCV
lrc_grid_search = GridSearchCV(lrc_pipeline, param_lrc, cv=kf, scoring='accuracy')
# Perform GridSearchCV to find the best combination of hyperparameters
lrc_grid_search.fit(X2_train, y2_train)

# Get the best hyperparameters
print(lrc_grid_search.best_params_)
best_lrc_pipeline = lrc_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'lrc__C': 10, 'lrc__penalty': 'l2', 'lrc__solver': 'lbfgs'}


* Naive Bayes

In [15]:
%time
nbc_pipeline = Pipeline([('nbc', MultinomialNB())])
param_nbc = {
    'nbc__alpha': [0.1, 1.0, 10.0]
}
# Initialize GridSearchCV
nbc_grid_search = GridSearchCV(nbc_pipeline, param_nbc, cv=kf, scoring='accuracy')
# Perform GridSearchCV to find the best combination of hyperparameters
nbc_grid_search.fit(X2_train, y2_train)
# Get the best hyperparameters
print(nbc_grid_search.best_params_)
best_nbc_pipeline = nbc_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'nbc__alpha': 1.0}


* Random Forest

In [16]:
%time
rfc_pipeline = Pipeline([('rfc', RandomForestClassifier())])
param_rfc = {
    'rfc__n_estimators': [100, 200, 300],
    'rfc__max_depth': [None, 10, 20],
}
rfc_grid_search = GridSearchCV(rfc_pipeline, param_rfc, cv=kf, scoring='accuracy')
# Perform GridSearchCV to find the best combination of hyperparameters
rfc_grid_search.fit(X2_train, y2_train)
# Get the best hyperparameters
print(rfc_grid_search.best_params_)
best_rfc_pipeline = rfc_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'rfc__max_depth': None, 'rfc__n_estimators': 300}


* XGBoost

In [17]:
%time
xgc_pipeline = Pipeline([('xgc', xgb.XGBClassifier())])
param_xgc = {
    'xgc__n_estimators': [100, 200, 300],
    'xgc__max_depth': [3, 5, 7],
    'xgc__learning_rate': [0.1, 0.01, 0.001]
}
xgc_grid_search = GridSearchCV(xgc_pipeline, param_xgc, cv=kf, scoring='accuracy')
# Perform GridSearchCV to find the best combination of hyperparameters
xgc_grid_search.fit(X2_train, y2_train)

# Get the best hyperparameters
print(xgc_grid_search.best_params_)
best_xgc_pipeline = xgc_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns


{'xgc__learning_rate': 0.1, 'xgc__max_depth': 3, 'xgc__n_estimators': 300}


* Neural networks

In [18]:
%time
nnc_pipeline = Pipeline([('nnc', MLPClassifier())])
param_nnc = {
    'nnc__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (50, 50, 50)],
    'nnc__activation': ['relu', 'tanh', 'logistic'],
    'nnc__solver': ['adam', 'sgd'],
    'nnc__max_iter': [200, 500, 1000],
}
                        
nnc_grid_search = GridSearchCV(nnc_pipeline, param_nnc, cv=kf, scoring='accuracy')
nnc_grid_search.fit(X2_train, y2_train)

# Get the best hyperparameters
print(nnc_grid_search.best_params_)
best_nnc_pipeline = nnc_grid_search.best_estimator_

CPU times: total: 0 ns
Wall time: 0 ns
{'nnc__activation': 'relu', 'nnc__hidden_layer_sizes': (50,), 'nnc__max_iter': 500, 'nnc__solver': 'adam'}


-------------------------------------------------------------------------------------------------------------

*End of Modelling*

In [20]:
import joblib

# Save the trained pipeline to a file

# model pipeline wit tcas trained 
joblib.dump(best_lr_pipeline, 'best_lr_pipeline.pkl')
joblib.dump(best_nb_pipeline, 'best_nb_pipeline.pkl')
joblib.dump(best_rf_pipeline, 'best_rf_pipeline.pkl')
joblib.dump(best_xg_pipeline, 'best_xg_pipeline.pkl')
joblib.dump(best_nn_pipeline, 'best_nn_pipeline.pkl')

# model pipeline wit combined trained
joblib.dump(best_lrc_pipeline, 'best_lrc_pipeline.pkl')
joblib.dump(best_nbc_pipeline, 'best_nbc_pipeline.pkl')
joblib.dump(best_rfc_pipeline, 'best_rfc_pipeline.pkl')
joblib.dump(best_xgc_pipeline, 'best_xgc_pipeline.pkl')
joblib.dump(best_nnc_pipeline, 'best_nnc_pipeline.pkl')

['best_nnc_pipeline.pkl']