# Random Forest with scikit-learn library_GridSearch_Optuna

In [8]:
import pandas as pd
import numpy as np
from pprint import pprint



data = pd.read_csv(r"train.csv")

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data["satisfaction"] = le.fit_transform(data["satisfaction"])
data["Customer Type"] = le.fit_transform(data["Customer Type"])



data["Class"] = data["Class"].map({ "Eco":1,"Eco Plus":2,"Business":3})


"""data["JobType"] = data["JobType"].map({" ?":np.nan, " Never-worked":0,
                                       " Without-pay":1," Self-emp-not-inc":2,
                                       " Self-emp-inc":3," Private":4,
                                         "Local-gov":5," State-gov":6," Federal-gov":7," Without-pay":8})"""




Y = data.iloc[:,-1]
X = data[["Class", "Online boarding","Inflight entertainment"]]

from sklearn.impute import SimpleImputer
imputer  = SimpleImputer(missing_values = np.nan, strategy = "mean")
missing_data = X.values
missing_data = missing_data.reshape(len(missing_data),X.shape[1])
imputer = imputer.fit(missing_data)
X = imputer.transform(missing_data)
X1 = pd.DataFrame(X, columns = ["Class", "Online boarding","Inflight entertainment"])


from sklearn.model_selection import train_test_split


x_train, x_test,y_train,y_test = train_test_split(X1,Y,test_size=0.33, random_state=0)

x_train = x_train.sort_index().values
y_train = y_train.sort_index().values
x_test = x_test.sort_index().values
y_test = y_test.sort_index().values

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

x_train = sc.fit_transform(x_train)

x_test = sc.transform(x_test)


In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                            random_state=0, max_depth= 13 )
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)

In [10]:
from sklearn.metrics import confusion_matrix

cm2 = confusion_matrix(y_test,y_pred)

cm2

array([[16989,  2432],
       [ 3007, 11861]], dtype=int64)

In [11]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds, average = 'macro')))
    print("Recall Score: {}".format(recall_score(labels, preds, average = 'macro')))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds, average = 'macro')))
    
print_metrics(y_test, y_pred)

Precision Score: 0.839733351063972
Recall Score: 0.8362641465447576
Accuracy Score: 0.8413777013036251
F1 Score: 0.8377488072254495


In [12]:
from sklearn.model_selection import GridSearchCV, cross_val_score

print(cross_val_score(RandomForestClassifier(n_estimators=100, 
                                 n_jobs=-1, random_state=0, max_depth= 13), 
                              x_train, y_train, cv=5))

print('mean of cv-scores: {0}'.format(round(np.mean(cross_val_score(RandomForestClassifier(n_estimators=100, 
                                 n_jobs=-1, random_state=0,max_depth= 13), 
                              x_train, y_train, cv=5)),4)))

[0.84284996 0.84522014 0.8381096  0.84622567 0.84414279]
mean of cv-scores: 0.8433


In [13]:
import time 


start = time.time()
rfc1= RandomForestClassifier(n_jobs=-1, random_state=0)

rf_params = {'max_depth': range(10,30,10),'n_estimators': range(50,200,50)}

rf_grid = GridSearchCV(rfc1, rf_params, cv=3, scoring = 'accuracy', n_jobs=-1)

rf_grid.fit(x_train, y_train)

rf_grid.predict(x_test)

print(rf_grid.best_params_)

print("best cv mean ", rf_grid.best_score_)

print('Best holdout result', accuracy_score(y_test, y_pred))

end = time.time()
print("GridSearchCV çalışma süresi " + str(end-start) + " sn")

{'max_depth': 10, 'n_estimators': 150}
best cv mean  0.8434245493069022
Best holdout result 0.8413777013036251
GridSearchCV çalışma süresi 37.920539140701294 sn


In [14]:
from optuna import Trial, visualization
import optuna
start = time.time()

def tune(objective):
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)

    params = study.best_params
    best_score = study.best_value
    print(f"Best score: {best_score}\n")
    print(f"Optimized parameters: {params}\n")
    return params


def rf_objective(trial):
    maxdepth_number  = trial.suggest_int("max_depth", 10, 30,10)
    estimators_number = trial.suggest_int("n_estimators", 50, 200,50)
    rf = RandomForestClassifier(n_estimators = estimators_number,max_depth = maxdepth_number,n_jobs=-1)
    
    scores = cross_val_score(rf,x_train,y_train,cv=3, scoring = 'accuracy', n_jobs = -1)
    
    return scores.mean()

randomforest_params = tune(rf_objective)

print(randomforest_params)

end = time.time()


print("Optuna çalışma süresi " + str(end-start) + " sn")
    



[32m[I 2021-09-21 22:12:55,272][0m A new study created in memory with name: no-name-387259d3-a15f-45e6-b514-cd1491c052b8[0m
[32m[I 2021-09-21 22:13:02,756][0m Trial 0 finished with value: 0.8434820081878905 and parameters: {'max_depth': 20, 'n_estimators': 50}. Best is trial 0 with value: 0.8434820081878905.[0m
[32m[I 2021-09-21 22:13:10,889][0m Trial 1 finished with value: 0.8434245493069022 and parameters: {'max_depth': 10, 'n_estimators': 100}. Best is trial 0 with value: 0.8434820081878905.[0m
[32m[I 2021-09-21 22:13:18,693][0m Trial 2 finished with value: 0.843180349062702 and parameters: {'max_depth': 30, 'n_estimators': 100}. Best is trial 0 with value: 0.8434820081878905.[0m
[32m[I 2021-09-21 22:13:26,844][0m Trial 3 finished with value: 0.8431228901817137 and parameters: {'max_depth': 20, 'n_estimators': 100}. Best is trial 0 with value: 0.8434820081878905.[0m
[32m[I 2021-09-21 22:13:33,290][0m Trial 4 finished with value: 0.8431516196222079 and parameters: {'

Best score: 0.8434820081878905

Optimized parameters: {'max_depth': 20, 'n_estimators': 50}

{'max_depth': 20, 'n_estimators': 50}
Optuna çalışma süresi 168.68983602523804 sn
