In [49]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
sns.set_style('ticks')

In [51]:
path_data_lama = "/content/drive/My Drive/skripsi/dataset/bisbul_ihsan.csv"
data_lama = pd.read_csv(path_data_lama)

In [52]:
def map_e(y_true, y_pred):
  y_true, y_pred = np.array(y_true), np.array(y_pred)
  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def eval(y_actual, y_pred):
  rmse = np.sqrt(metrics.mean_squared_error(y_actual, y_pred))
  mape = map_e(y_actual, y_pred)
  r2 = metrics.r2_score(y_actual, y_pred)
  print(f"rmse : {rmse}\nmape : {mape}\nr2 : {r2}")
  return rmse,mape,r2

In [53]:
# function for creating a feature importance dataframe
def imp_df(column_names, importances):
    df = pd.DataFrame({'feature': column_names,
                       'feature_importance': importances}) \
           .sort_values('feature_importance', ascending = False) \
           .reset_index(drop = True)
    return df

# plotting a feature importance dataframe (horizontal barchart)
def var_imp_plot(imp_df, title):
    imp_df.columns = ['feature', 'feature_importance']
    sns.barplot(x = 'feature_importance', y = 'feature', data = imp_df, orient = 'h', color = 'royalblue') \
       .set_title(title, fontsize = 20) 


# rf biasa

In [54]:
y = data_lama["Carotenoid"]
x = data_lama.drop("Carotenoid", axis=1)

In [55]:
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size = 0.25, 
                                                    random_state=23)

In [56]:
x_train.shape

(553, 224)

In [57]:
import xgboost

In [58]:
rf = xgboost.XGBRegressor(random_state=123)

In [59]:
start_rf = time.time()
rf.fit(x_train,y_train)
stop_rf = time.time()
waktu = stop_rf - start_rf



In [60]:
rf_y_pred_test = rf.predict(x_test)
rf_y_pred_train = rf.predict(x_train)

In [61]:
rf_test_rmse, rf_test_mape, rf_test_r2 = eval(y_test,rf_y_pred_test)

rmse : 39.352438069072505
mape : 8.403697374805065
r2 : 0.9528001615294731


In [62]:
a = {'nama' : ["rf default"],
    'rmse':[rf_test_rmse], 
        'r2': [rf_test_r2], 
        'waktu':[waktu]} 
  
df = pd.DataFrame(a)

In [63]:
base_imp = imp_df(x_train.columns, rf.feature_importances_)

In [64]:
base_imp_d = base_imp.copy()

In [65]:
y = data_lama["Carotenoid"]
x = data_lama.drop("Carotenoid", axis=1)

In [66]:
w = x[base_imp.feature[:100]]

In [67]:
x_train, x_test, y_train, y_test = train_test_split(w, 
                                                    y, 
                                                    test_size = 0.25, 
                                                    random_state=23)

In [68]:
x_train.shape

(553, 100)

In [69]:
rf = xgboost.XGBRegressor(random_state=123)

In [70]:
start_rf = time.time()
rf.fit(x_train,y_train)
stop_rf = time.time()
waktu = stop_rf - start_rf



In [71]:
rf_y_pred_test = rf.predict(x_test)
rf_y_pred_train = rf.predict(x_train)

In [72]:
rf_test_rmse, rf_test_mape, rf_test_r2 = eval(y_test,rf_y_pred_test)

rmse : 40.09553031301504
mape : 8.637274720384843
r2 : 0.9510007821114702


In [73]:
a = {'nama' : ["rf default 70"],
    'rmse':[rf_test_rmse], 
        'r2': [rf_test_r2], 
        'waktu':[waktu]} 
  
df2 = pd.DataFrame(a)

# rf tunning

In [74]:
y = data_lama["Carotenoid"]
x = data_lama.drop("Carotenoid", axis=1)

In [75]:
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size = 0.25, 
                                                    random_state=17)

In [76]:
x_train.shape

(553, 224)

In [77]:
best_params_ = {'colsample_bytree': 0.7,
 'eta': 0.13,
 'learning_rate': 0.1,
 'max_depth': 10,
 'min_child_weight': 5,
 'n_estimators': 200,
 'subsample': 0.8,
'objective': 'reg:squarederror',
}

In [78]:
rf = xgboost.XGBRegressor(**best_params_, random_state=123)

In [79]:
start_rf = time.time()
rf.fit(x_train,y_train)
stop_rf = time.time()
waktu = stop_rf - start_rf

rf_y_pred_test = rf.predict(x_test)
rf_y_pred_train = rf.predict(x_train)

In [80]:
rf_test_rmse, rf_test_mape, rf_test_r2 = eval(y_test,rf_y_pred_test)

rmse : 33.001878210506725
mape : 7.218395721437373
r2 : 0.9633556337712551


In [81]:
a = {'nama' : ["rf tuning"],
    'rmse':[rf_test_rmse], 
        'r2': [rf_test_r2], 
        'waktu':[waktu]} 
  
df5 = pd.DataFrame(a)

In [82]:
base_imp = imp_df(x_train.columns, rf.feature_importances_)

In [83]:
base_imp_t = base_imp.copy()

In [84]:
y = data_lama["Carotenoid"]
x = data_lama.drop("Carotenoid", axis=1)

In [85]:
w = x[base_imp.feature[:121]]

In [86]:
x_train, x_test, y_train, y_test = train_test_split(w, 
                                                    y, 
                                                    test_size = 0.25, 
                                                    random_state=17)

In [87]:
x_train.shape

(553, 121)

In [88]:
rf = xgboost.XGBRegressor(**best_params_, random_state=123)

In [89]:
start_rf = time.time()
rf.fit(x_train,y_train)
stop_rf = time.time()
waktu = stop_rf - start_rf

In [90]:
rf_y_pred_test = rf.predict(x_test)
rf_y_pred_train = rf.predict(x_train)

In [91]:
rf_test_rmse, rf_test_mape, rf_test_r2 = eval(y_test,rf_y_pred_test)

rmse : 33.31645964683692
mape : 7.4701563232918415
r2 : 0.9626536992132811


In [92]:
a = {'nama' : ["rf tuning 120"],
    'rmse':[rf_test_rmse], 
        'r2': [rf_test_r2], 
        'waktu':[waktu]} 
  
df6 = pd.DataFrame(a)

In [93]:
df_total = pd.concat([df, df2, df5, df6])
df_total

Unnamed: 0,nama,rmse,r2,waktu
0,rf default,39.352438,0.9528,0.87868
0,rf default 70,40.09553,0.951001,0.393501
0,rf tuning,33.001878,0.963356,3.249404
0,rf tuning 120,33.31646,0.962654,1.837859


In [94]:
df_total.to_excel('/content/drive/My Drive/skripsi/perbandingan/perbandingan xgb.xlsx')

In [95]:
df_base_imp = pd.concat([base_imp_d, base_imp_t])

In [96]:
df_base_imp.to_excel('/content/drive/My Drive/skripsi/perbandingan/base imp xgb.xlsx')