참고
https://practicaldatascience.co.uk/machine-learning/how-to-tune-an-xgbregressor-model-with-optuna
https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html#optuna.trial.Trial.suggest_float
https://xgboost.readthedocs.io/en/stable/python/python_api.html

In [13]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

In [14]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [15]:
test_df = pd.read_csv('./data/rear_bumper_df.csv')
test_df

Unnamed: 0,id,car_name,Breakage_3,Crushed_2,Scratch_0,Seperated_1,repair_cost
0,sc-216666,올뉴모닝,0,1972,3771,0,83750
1,as-0056675,올뉴투싼,0,0,552,0,84320
2,sc-172370,아반떼AD,0,0,0,0,89280
3,sc-172370,아반떼AD,0,0,0,2278,89280
4,sc-172370,아반떼AD,9790,5,2689,3133,89280
...,...,...,...,...,...,...,...
57498,sc-1037200,아반떼,0,0,241,0,138580
57499,sc-1037200,아반떼,0,0,75,0,138580
57500,sc-1037200,아반떼,0,3020,318,607,138580
57501,sc-1023899,레이,561,0,2437,0,137500


In [16]:
train = pd.read_csv('./data/rear_bumper_df.csv')
train = train.dropna(axis=0)
train = train.reset_index(drop=True)
TARGET = 'repair_cost'
FEATURES = train.columns.difference([TARGET])

In [17]:
train

Unnamed: 0,id,car_name,Breakage_3,Crushed_2,Scratch_0,Seperated_1,repair_cost
0,sc-216666,올뉴모닝,0,1972,3771,0,83750
1,as-0056675,올뉴투싼,0,0,552,0,84320
2,sc-172370,아반떼AD,0,0,0,0,89280
3,sc-172370,아반떼AD,0,0,0,2278,89280
4,sc-172370,아반떼AD,9790,5,2689,3133,89280
...,...,...,...,...,...,...,...
57496,sc-1037200,아반떼,0,0,241,0,138580
57497,sc-1037200,아반떼,0,0,75,0,138580
57498,sc-1037200,아반떼,0,3020,318,607,138580
57499,sc-1023899,레이,561,0,2437,0,137500


In [18]:
for feature in ['id','car_name'] :
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature])

In [19]:
train['id'].max()

18245

In [20]:
# check corr
# import matplotlib.pyplot as plt
# import seaborn as sns
# df_copy = train.copy()

# #Correlation Analysis
# plt.figure(figsize=(18,18))
# sns.heatmap(df_copy.corr(), annot=True)
# plt.title('Correlation Analysis')
# plt.show()

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train[FEATURES], train[TARGET], test_size=0.3, random_state=42)

X_train

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Unnamed: 0,Breakage_3,Crushed_2,Scratch_0,Seperated_1,car_name,id
53933,0,0,1066,0,192,12270
35286,284,3482,5,449,186,5916
50860,0,0,0,3618,218,1329
55270,76,1631,4458,9772,58,11001
11070,0,0,558,148,59,837
...,...,...,...,...,...,...
54343,0,2867,28,22,88,17704
38158,0,3516,4290,378,253,13345
860,0,590,1270,0,192,15355
15795,0,5511,73,1508,214,2027


In [53]:
import optuna
import xgboost

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xgboost.XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=100)

[32m[I 2023-01-25 13:52:27,401][0m A new study created in memory with name: regression[0m
[32m[I 2023-01-25 13:52:28,109][0m Trial 0 finished with value: 3096034687.3911285 and parameters: {'max_depth': 4, 'learning_rate': 0.22476675053402115, 'n_estimators': 138, 'min_child_weight': 7, 'gamma': 0.3940737006595624, 'subsample': 0.8997509068605473, 'colsample_bytree': 0.1522930230775918, 'reg_alpha': 0.22885848164193023, 'reg_lambda': 0.6722952019590545, 'random_state': 527}. Best is trial 0 with value: 3096034687.3911285.[0m
[32m[I 2023-01-25 13:52:31,023][0m Trial 1 finished with value: 2553395836.2634745 and parameters: {'max_depth': 3, 'learning_rate': 0.07930349185566776, 'n_estimators': 632, 'min_child_weight': 10, 'gamma': 0.9843748760281986, 'subsample': 0.9442616871636202, 'colsample_bytree': 0.5296164525600527, 'reg_alpha': 0.2693344020043925, 'reg_lambda': 0.1695645524616851, 'random_state': 496}. Best is trial 1 with value: 2553395836.2634745.[0m
[32m[I 2023-01-25 

In [54]:
from optuna import visualization
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))
hist = study.trials_dataframe()
hist.head()

Best trial: score 924405856.1195315,
params {'max_depth': 10, 'learning_rate': 0.16407638550902337, 'n_estimators': 417, 'min_child_weight': 1, 'gamma': 0.07691296612173734, 'subsample': 0.9790885421941999, 'colsample_bytree': 0.5082188996877985, 'reg_alpha': 0.22584159955674005, 'reg_lambda': 0.7487868406002102, 'random_state': 125}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_gamma,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_random_state,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,3096035000.0,2023-01-25 13:52:27.402766,2023-01-25 13:52:28.109842,0 days 00:00:00.707076,0.152293,0.394074,0.224767,4,7,138,527,0.228858,0.672295,0.899751,COMPLETE
1,1,2553396000.0,2023-01-25 13:52:28.111345,2023-01-25 13:52:31.022680,0 days 00:00:02.911335,0.529616,0.984375,0.079303,3,10,632,496,0.269334,0.169565,0.944262,COMPLETE
2,2,1803139000.0,2023-01-25 13:52:31.024678,2023-01-25 13:52:31.557541,0 days 00:00:00.532863,0.201739,0.880033,0.340053,8,2,78,362,0.827211,0.065612,0.842184,COMPLETE
3,3,1575879000.0,2023-01-25 13:52:31.558541,2023-01-25 13:52:33.272445,0 days 00:00:01.713904,0.129889,0.72784,0.697046,9,6,233,326,0.446525,0.679258,0.734702,COMPLETE
4,4,2963391000.0,2023-01-25 13:52:33.274449,2023-01-25 13:52:35.753208,0 days 00:00:02.478759,0.668281,0.984619,0.514499,5,9,499,866,0.503414,0.362842,0.211126,COMPLETE


In [55]:
optuna.visualization.plot_param_importances(study)

In [58]:
train[FEATURES].shape
train[TARGET].shape

(897, 5)

(897,)

In [56]:
#  {'max_depth': 10, 'learning_rate': 0.16407638550902337,
# 'n_estimators': 417, 'min_child_weight': 1, 'gamma': 0.07691296612173734, 
# 'subsample': 0.9790885421941999, 'colsample_bytree': 0.5082188996877985, 
# 'reg_alpha': 0.22584159955674005, 'reg_lambda': 0.7487868406002102, 'random_state': 125}
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(train[FEATURES], train[TARGET], test_size=0.3,random_state=42)
clf=XGBRegressor()
parametres={'max_depth': ['10'], 'learning_rate':[0.16407638550902337], 'n_estimators': [417], 
'min_child_weight': [1], 'gamma': [0.07691296612173734], 'subsample': [0.9790885421941999], 
'colsample_bytree': [0.5082188996877985], 'reg_alpha': [0.22584159955674005], 'reg_lambda': [0.7487868406002102],'random_state': [125]}
grid_search_cv_clf=GridSearchCV(clf,parametres,cv=5)
grid_search_cv_clf.fit(X_train,y_train)
best_clf2=grid_search_cv_clf.best_estimator_
r2_score = best_clf2.score(X_test,y_test)

### 모델 저장

In [62]:
X_train.columns

Index(['Breakage_3', 'Crushed_2', 'Scratch_0', 'Seperated_1', 'car_name',
       'id'],
      dtype='object')

In [57]:
import joblib
model_path = './' + 'models/' + 'test_XGB03' +'.model'
# model.save_model(model_path)
joblib.dump(best_clf2, open(model_path, 'wb'))

In [59]:
import joblib
model_path = './' + 'models/' + 'test_XGB03' +'.model'
new_model = joblib.load(model_path)

In [68]:
X_test.iloc[1]
y_test.iloc[1]

Breakage_3        9
Crushed_2         0
Scratch_0      1562
Seperated_1       0
car_name        208
id             3843
Name: 5394, dtype: int64

146010

In [60]:
# 실제값 146010 인데 157412.22로 예측
print(new_model.predict(X_test.iloc[1].values.reshape(1,6)))

[157412.22]


In [15]:
X_test.shape
X_test.iloc[1].values.reshape(1,6).shape

(17251, 6)

(1, 6)

In [16]:
X_test.iloc[1].values.reshape(1,6)
print(model.predict(X_test.iloc[1].values.reshape(1,6)))

array([[ 300, 1145,    0,    0,  223, 3746]], dtype=int64)

[154320.7]


In [17]:
X_test.iloc[1].values.reshape(1,6).shape
X_test.iloc[1].shape

(1, 6)

(6,)

In [18]:
print(model.predict(X_test))

[155435.55  154320.7   135639.58  ... 109060.945 145241.6   137817.33 ]
