In [2]:
import pandas as pd
%run preprocess.py

# Hyperparameter Tuning Function for Random Forest and Logistic Regression Models

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

model_params = {
    'random_forest': {
        'model': RandomForestClassifier(random_state=0),
        'params' : {
            'n_estimators': [1,10,50,70]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

# Train the Models

In [4]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train_scaled, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
result_df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
result_df

Unnamed: 0,model,best_score,best_params
0,random_forest,0.895202,{'n_estimators': 70}
1,logistic_regression,0.82099,{'C': 5}


Random Forest with n_estimators=70 gives 89.5% Accuracy

In [5]:
model_rf = RandomForestClassifier(n_estimators=70)
model_rf = model_rf.fit(X_train, y_train)

# Save the Model

In [6]:
import joblib
filename = 'model_rf.sav'
joblib.dump(model_rf, filename)

['model_rf.sav']

# Evaluate the Model

In [8]:
# load the trained model from saved file
model_clf = joblib.load(filename)


# Make predictions with the model
predictions = model_clf.predict(X_test)

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["CONFIRMED", "FALSE POSITIVE", "CANDIDATE"]))

                precision    recall  f1-score   support

     CONFIRMED       0.81      0.79      0.80       327
FALSE POSITIVE       0.85      0.82      0.83       393
     CANDIDATE       0.97      1.00      0.98       679

      accuracy                           0.90      1399
     macro avg       0.88      0.87      0.87      1399
  weighted avg       0.90      0.90      0.90      1399



I can conclude that Random Forest with n_estimators=70 is the best model for solving my problem of Exoplanet classification