<a href="https://colab.research.google.com/github/Nur-E-Anika/evolutionary-algorithm/blob/main/HO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount the user's Google Drive at /content/drive (the `google.colab` module
# is only available when running inside Colab).
# NOTE(review): none of the cells visible here read from /content/drive --
# the dataset is loaded from ./titanic -- so confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install opendatasets
!pip install pandas

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)


In [None]:
import opendatasets as od
import pandas

# Download the Kaggle "titanic" competition files; the loading cell further
# down reads ./titanic/train.csv and ./titanic/test.csv.
# NOTE(review): presumably this prompts for Kaggle credentials on first use --
# confirm, and never hard-code them in the notebook.
od.download("https://www.kaggle.com/competitions/titanic")

In [None]:
# !pip install numpy==1.24.4
# !pip install pandas --upgrade
# !pip install matplotlib --upgrade

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
!pip install seaborn --upgrade
!pip install sklearn --upgrade

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the Titanic train and test CSV files from ./titanic into DataFrames.
titanic_train_df, titanic_test_df = map(
    pd.read_csv,
    ('./titanic/train.csv', './titanic/test.csv'),
)

In [None]:
# The target is the column at position 1; candidate features are all columns
# from position 2 onward, with 'Name' and 'Ticket' removed.
titanic_Y_col = titanic_train_df.columns[1]
titanic_X_col = titanic_train_df.columns[2:].drop(['Name', 'Ticket'])

In [None]:
# Work on copies so the preprocessing below never writes into titanic_train_df.
titanic_X = titanic_train_df[titanic_X_col].copy()
titanic_Y = titanic_train_df[titanic_Y_col].copy()

In [None]:
# Partition the feature columns by dtype: numeric ones get imputed/scaled,
# everything else gets one-hot encoded.
numeric_cols = list(
    titanic_train_df[titanic_X_col].select_dtypes(include=np.number).columns
)
categorical_cols = list(
    titanic_train_df[titanic_X_col].select_dtypes(exclude=np.number).columns
)


In [None]:
# Numeric preprocessing: fill missing values with SimpleImputer's default
# strategy, then rescale every numeric column with MinMaxScaler.
# NOTE(review): both transformers are fitted on the full training frame before
# the train/test split performed later in the notebook, so statistics of the
# hold-out rows leak into the features.
imputer = SimpleImputer()
imputer.fit(titanic_train_df[numeric_cols])
titanic_X[numeric_cols] = imputer.transform(titanic_X[numeric_cols])

scaler = MinMaxScaler()
titanic_X[numeric_cols] = scaler.fit_transform(titanic_X[numeric_cols])

In [None]:
# One-hot encode categorical columns into dense 0/1 indicator columns.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and later releases dropped the old name, so try the current
# spelling first and fall back to the old one only on older installs.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(titanic_X[categorical_cols])

# One generated column name per (categorical column, category) pair.
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
titanic_X[encoded_cols] = encoder.transform(titanic_X[categorical_cols])

In [None]:
# Keep only model-ready columns: the scaled numeric features followed by the
# one-hot indicator columns (the raw categorical columns are left out).
titanic_X = titanic_X.loc[:, numeric_cols + encoded_cols]

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
(titanic_X_Train, titanic_X_Test,
 titanic_Y_Train, titanic_Y_Test) = train_test_split(
    titanic_X,
    titanic_Y,
    test_size=0.30,
    random_state=42,
)

In [None]:
!pip install -U pymoo

In [None]:
from pymoo.algorithms.moo.nsga3 import NSGA3
# `pymoo.factory` is gone in pymoo >= 0.6 (the same release line that provides
# `pymoo.termination.default`, imported below), so importing from it aborts
# this cell. The helpers that still exist are imported from their current
# modules. `get_sampling`, `get_crossover` and `get_mutation` have no
# replacement: the operator classes (LHS, SBX, PolynomialMutation) are
# instantiated directly instead.
from pymoo.problems import get_problem
from pymoo.termination import get_termination
from pymoo.util.ref_dirs import get_reference_directions
from pymoo.operators.selection.rnd import RandomSelection
from pymoo.operators.crossover.sbx import SBX
from pymoo.operators.mutation.pm import PolynomialMutation
from pymoo.termination.default import DefaultMultiObjectiveTermination
from pymoo.core.problem import Problem
from pymoo.optimize import minimize
from pymoo.operators.sampling.lhs import LHS

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'darkgrid' style for the plots drawn after this point.
sns.set_style('darkgrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
def test_params(**params):
    """Fit a random forest with the given settings and score it.

    Args:
        **params: Extra keyword arguments forwarded to
            ``RandomForestClassifier`` (``random_state=42`` and ``n_jobs=-1``
            are always supplied here).

    Returns:
        tuple: ``(train_accuracy, val_accuracy)`` -- accuracy on
        ``titanic_X_Train``/``titanic_Y_Train`` and on
        ``titanic_X_Test``/``titanic_Y_Test`` respectively.
    """
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **params)
    forest.fit(titanic_X_Train, titanic_Y_Train)

    # Score the fitted model on the training split first, then on the
    # held-out split.
    splits = (
        (titanic_X_Train, titanic_Y_Train),
        (titanic_X_Test, titanic_Y_Test),
    )
    train_accuracy_score, val_accuracy_score = (
        accuracy_score(labels, forest.predict(features))
        for features, labels in splits
    )
    return train_accuracy_score, val_accuracy_score

In [None]:
# define the hyperparameter optimization problem
class HyperparameterOptimizationProblem(Problem):
    """Random-forest hyperparameter search expressed as a pymoo ``Problem``.

    Decision vector (one row of ``x`` per candidate):

    0. ``n_estimators``             -- number of trees (truncated to int)
    1. ``max_depth``                -- maximum depth of each tree (truncated to int)
    2. ``min_samples_split``        -- minimum samples needed to split a node (truncated to int)
    3. ``min_samples_leaf``         -- minimum samples required at a leaf (truncated to int)
    4. ``min_weight_fraction_leaf`` -- minimum weighted fraction at a leaf (float)

    Objectives, all negated because the optimizer minimizes: accuracy, F1 and
    ROC AUC, each measured on ``titanic_X_Test`` / ``titanic_Y_Test``.

    The class reads the notebook-level globals ``titanic_X_Train``,
    ``titanic_Y_Train``, ``titanic_X_Test`` and ``titanic_Y_Test``.
    """

    def __init__(self, level):
        """Create the problem using the bounds of search level ``level``.

        Args:
            level: Index into ``self.var_ranges``; 0 selects the narrower
                box, 1 the wider one.
        """
        self.level = level
        # One (lower, upper) pair per decision variable, per search level.
        self.var_ranges = [
            [(10, 500), (2, 8), (2, 30), (1, 5), (0, 0.3)],
            [(10, 600), (2, 12), (2, 40), (1, 9), (0, 0.5)]
        ]
        bounds = self.var_ranges[level]

        # initialize the problem with 5 variables and 3 objectives
        super().__init__(n_var=5, n_obj=3,
                         xl=[low for low, _ in bounds],
                         xu=[high for _, high in bounds]
            )

    def _evaluate(self, x, out, *args, **kwargs):
        """Train one forest per row of ``x`` and store the objectives in ``out["F"]``."""
        f = np.zeros((x.shape[0], self.n_obj))
        for i, row in enumerate(x):
            # decode the hyperparameters; the first four are integer-valued
            n_estimators = int(row[0])
            max_depth = int(row[1])
            min_samples_split = int(row[2])
            min_samples_leaf = int(row[3])
            # This one is a fraction in [0, 0.5]: casting it to int (as was
            # done before) always yields 0 and makes the variable meaningless.
            # The clip only guards against tiny floating-point overshoot.
            min_weight_fraction_leaf = float(np.clip(row[4], 0.0, 0.5))

            # build and train the random forest model
            model = RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf,
                                           min_weight_fraction_leaf=min_weight_fraction_leaf,
                                           max_features="sqrt",
                                           random_state=42,
                                           n_jobs=-1)
            model.fit(titanic_X_Train, titanic_Y_Train)

            # hard labels for accuracy/F1, positive-class probabilities for
            # ROC AUC (AUC computed from hard labels ignores the ranking of
            # the predictions)
            y_pred = model.predict(titanic_X_Test)
            y_score = model.predict_proba(titanic_X_Test)[:, 1]

            # negate every score because we want to maximize it
            f[i, 0] = -accuracy_score(titanic_Y_Test, y_pred)
            f[i, 1] = -f1_score(titanic_Y_Test, y_pred)
            f[i, 2] = -roc_auc_score(titanic_Y_Test, y_score)

        # assign the objectives to the output dictionary
        out["F"] = f

In [None]:
%%time
# create an instance of the problem
problem = HyperparameterOptimizationProblem(level=0)
problem1 = HyperparameterOptimizationProblem(level=1)

# create an instance of NSGA-III algorithm
algorithm = NSGA3(
    pop_size= 100,
    ref_dirs=get_reference_directions("das-dennis", 3, n_partitions=12),
    # sampling=get_sampling("int_random"),
    sampling=LHS(),
    selection = RandomSelection(),
    # crossover=get_crossover("int_sbx", prob=0.9, eta=15),
    crossover = SBX(prob=0.6, prob_var=0.5),
    mutation=PolynomialMutation(prob=0.5),
    eliminate_duplicates=True)

algorithm1 = NSGA3(
    pop_size= 50,
    ref_dirs=get_reference_directions("das-dennis", 3, n_partitions=12),
    # sampling=get_sampling("int_random"),
    sampling=LHS(),
    selection = RandomSelection(),
    # crossover=get_crossover("int_sbx", prob=0.9, eta=15),
    crossover = SBX(prob=0.9, prob_var=0.8),
    mutation=PolynomialMutation(prob=0.8),
    eliminate_duplicates=True)

# create an instance of termination criterion
# termination = get_termination("n_gen", 50)

# early stop
termination = DefaultMultiObjectiveTermination(
    xtol=1e-8,           # movement in the design space xtol
    cvtol=1e-6,          # the convergence in the constraint cv_tol
    ftol=0.0025,         # objective space f_tol.
    period=30,
    n_max_gen=50,        # maximum number of generations n_max_gen
    n_max_evals=100000   # function evaluations n_max_evals
)

# perform the optimization
res = minimize(problem,
               algorithm,
               termination,
               seed=42,
               save_history=True,
               verbose=True)
res1 = minimize(problem1,
               algorithm1,
               termination,
               seed=42,
               seed_population=res.pop,
               save_history=True,
               verbose=True)

# print the results
print(f"Best solution found: \nX = {res1.X.astype(int)} \nF = {-res1.F}") # negate F because we maximized
