## Import

In [1]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [3]:
import pandas as pd
import numpy as np


train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

train_time_df = train_df['TIMESTAMP']
test_time_df = test_df['TIMESTAMP']

def split_TIMESTEPS(train_df, test_df, train_time_df, test_time_df):

    train_col_year = []
    train_col_month = []
    train_col_day = []

    test_col_year = []
    test_col_month = []
    test_col_day = []


split_TIMESTEPS(train_df, test_df, train_time_df, test_time_df)

In [4]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Class', 'Y_Quality', 'TIMESTAMP'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Data Pre-processing

In [5]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

In [6]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


## Split train / valid

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(train_x, train_y, random_state=42, test_size=0.2)

print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(478, 2877)
(478,)
(120, 2877)
(120,)


## Classification Model Fit

### ✅Valid

In [8]:
### 나중에 optuna 적용해보기 ###


from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

def model_fit_pred(x_train, y_train, x_val, y_val):
    
    ### RandomForestClassifier ###
    RF = RandomForestClassifier(random_state=42).fit(x_train, y_train)
    print('RandomForestClassifier >>>>> Done', '\n\n')
    rf_pred = RF.predict(x_val)
    print(classification_report(y_val, rf_pred))

    ### XGBClassifier ###
    XGB = XGBClassifier(n_estimators=1000, max_depth=2, learning_rate=0.01, objective='multi:softmax').fit(x_train, y_train)
    print('\n\n', 'XGBClassifier >>>>> Done', '\n\n')
    xgb_pred = XGB.predict(x_val)
    print(classification_report(y_val, xgb_pred))   



In [9]:
model_fit_pred(x_train, y_train, x_val, y_val)

RandomForestClassifier >>>>> Done 


              precision    recall  f1-score   support

           0       0.71      0.22      0.33        23
           1       0.71      0.99      0.83        76
           2       0.88      0.33      0.48        21

    accuracy                           0.73       120
   macro avg       0.77      0.51      0.55       120
weighted avg       0.74      0.72      0.67       120



 XGBClassifier >>>>> Done 


              precision    recall  f1-score   support

           0       0.86      0.26      0.40        23
           1       0.70      0.96      0.81        76
           2       0.78      0.33      0.47        21

    accuracy                           0.72       120
   macro avg       0.78      0.52      0.56       120
weighted avg       0.74      0.72      0.67       120



#### 👉 RandomForestClassifier

In [10]:
### RF ###

RF = RandomForestClassifier(random_state=42).fit(x_train, y_train)
print('Done.')

from sklearn.metrics import classification_report

pred = RF.predict(x_val)
print(classification_report(y_val, pred))

Done.
              precision    recall  f1-score   support

           0       0.71      0.22      0.33        23
           1       0.71      0.99      0.83        76
           2       0.88      0.33      0.48        21

    accuracy                           0.73       120
   macro avg       0.77      0.51      0.55       120
weighted avg       0.74      0.72      0.67       120



In [12]:
import sklearn

classifier_obj = RandomForestClassifier(random_state=42,
                                        max_depth=8,
                                        n_estimators=11,
                                        min_samples_split=5).fit(x_train, y_train)

preds = classifier_obj.predict(x_val)
pred_labels = np.rint(preds)
accuracy = sklearn.metrics.accuracy_score(y_val, pred_labels)
accuracy

0.7

##### optuna

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets
import optuna


n_trials = 2000

def objective_rfc(trial):

    rf_max_depth = trial.suggest_int("max_depth", 2, 8, log=True)
    rf_n_estimators = trial.suggest_int("n_estimators", 1, 500, log=True)
    rf_min_samples_split = trial.suggest_int("min_samples_split", 2, 5, log=True)
    #rf_random_state = trial.suggest_int("random_state", 1, 100, log=True)

    classifier_obj = RandomForestClassifier(random_state=42,
                                            max_depth=rf_max_depth,
                                            n_estimators=rf_n_estimators,
                                            min_samples_split=rf_min_samples_split)

    bst = classifier_obj.fit(x_train, y_train)
    preds = bst.predict(x_val)
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(y_val, pred_labels)
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_rfc, n_trials=n_trials, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2023-02-06 18:41:20,248][0m A new study created in memory with name: no-name-dfc7e5b7-1c04-4c1f-aca9-99f78a06e2de[0m
[32m[I 2023-02-06 18:41:20,270][0m Trial 0 finished with value: 0.6583333333333333 and parameters: {'max_depth': 4, 'n_estimators': 1, 'min_samples_split': 2}. Best is trial 0 with value: 0.6583333333333333.[0m
[32m[I 2023-02-06 18:41:20,318][0m Trial 1 finished with value: 0.6833333333333333 and parameters: {'max_depth': 2, 'n_estimators': 35, 'min_samples_split': 4}. Best is trial 1 with value: 0.6833333333333333.[0m
[32m[I 2023-02-06 18:41:20,342][0m Trial 2 finished with value: 0.625 and parameters: {'max_depth': 3, 'n_estimators': 2, 'min_samples_split': 2}. Best is trial 1 with value: 0.6833333333333333.[0m
[32m[I 2023-02-06 18:41:20,372][0m Trial 3 finished with value: 0.7 and parameters: {'max_depth': 5, 'n_estimators': 6, 'min_samples_split': 2}. Best is trial 3 with value: 0.7.[0m
[32m[I 2023-02-06 18:41:20,413][0m Trial 4 finished with 

Number of finished trials:  2000
Best trial:
  Value: 0.7666666666666667
  Params: 
    max_depth: 7
    n_estimators: 16
    min_samples_split: 2


###### score

0.75  

Number of finished trials:  100
Best trial:
  Value: 0.775
  Params: 
    rf_max_depth: 12
    n_estimators: 23

#### 🎯 Stubmit

In [14]:
classifier_obj = RandomForestClassifier(random_state=42,
                                        max_depth=8,
                                        n_estimators=11,
                                        min_samples_split=5)

bst = classifier_obj.fit(train_x, train_y)
preds = bst.predict(test_x)
rf_pred = np.rint(preds)

submit = pd.read_csv('./sample_submission.csv')
submit['Y_Class'] = rf_pred
submit.to_csv('./rf_42_optuna_submission.csv', index=False)

#### 👉 XGB

In [15]:
# xgb (1)

import xgboost as xgb

dtrain = xgb.DMatrix(x_train, y_train)
dval = xgb.DMatrix(x_val, y_val)

# specify parameters via map
param = {"max_depth": 100, "eta": 1, "objective": "multi:softmax"}

# specify validations set to watch performance
watchlist = [(dval, "eval"), (dtrain, "train")]

# number of boosting rounds
num_round = 2
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
print('XGBClassifier >>>>> Done', '\n\n')

# run prediction
preds = bst.predict(dval)
labels = dval.get_label()


XGBoostError: value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.

In [16]:
# xgb (2)

XGB = XGBClassifier(n_estimators=1000, max_depth=3, learning_rate=0.01, objective='multi:softmax').fit(x_train, y_train)
print('\n\n', 'XGBClassifier >>>>> Done', '\n\n')
xgb_pred = XGB.predict(x_val)
print(classification_report(y_val, xgb_pred))  

# submit = pd.read_csv('./sample_submission.csv')
# submit['Y_Class'] = xgb_pred
# submit.to_csv('./xgb_submission.csv', index=False)



 XGBClassifier >>>>> Done 


              precision    recall  f1-score   support

           0       0.83      0.22      0.34        23
           1       0.72      0.96      0.82        76
           2       0.75      0.43      0.55        21

    accuracy                           0.73       120
   macro avg       0.77      0.54      0.57       120
weighted avg       0.74      0.72      0.68       120



##### optuna

In [17]:
import pandas as pd
import xgboost as xgb
import sklearn.metrics
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets

n_trials = 5000 # <<<<<

# 아래 함수에서 파라미터 조정
def objective_XGB(trial):
    
    dtrain = xgb.DMatrix(x_train, y_train)
    dvalid = xgb.DMatrix(x_val, y_val)

    param = {
        "verbosity": 0,
        
        "objective": "multi:softmax",   # 👈  binary:logistic >> multi:softmax 으로 변경
        "num_class": '3',  # 👈 "objective": "multi:softmax" 일 시 num_class 파라미터 지정해야 해서 추가함
        
        #  https://romg2.github.io/mlguide/03_머신러닝-완벽가이드-04.-분류-XGBoost/  👈 참고사이트


        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 2, 32, log=True)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(y_val, pred_labels)
    return accuracy

# 아래 함수에서 실행
if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_XGB, n_trials=n_trials, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2023-02-06 18:46:26,940][0m A new study created in memory with name: no-name-99a6fcc4-18b7-4d62-beba-663d4567111b[0m
[32m[I 2023-02-06 18:46:27,101][0m Trial 0 finished with value: 0.7083333333333334 and parameters: {'booster': 'gblinear', 'lambda': 0.0059908382077577755, 'alpha': 3.863019119754019e-06, 'subsample': 0.37009476420639126, 'colsample_bytree': 0.22506025112915407}. Best is trial 0 with value: 0.7083333333333334.[0m
[32m[I 2023-02-06 18:46:27,246][0m Trial 1 finished with value: 0.7 and parameters: {'booster': 'gblinear', 'lambda': 0.08168865896995572, 'alpha': 0.0004414262669652733, 'subsample': 0.690874034628846, 'colsample_bytree': 0.28714128628994207}. Best is trial 0 with value: 0.7083333333333334.[0m
[32m[I 2023-02-06 18:46:27,397][0m Trial 2 finished with value: 0.675 and parameters: {'booster': 'gbtree', 'lambda': 1.0030412438232532e-05, 'alpha': 0.3330760366858005, 'subsample': 0.3863098944000003, 'colsample_bytree': 0.9354767610689342, 'max_depth

Number of finished trials:  1562
Best trial:
  Value: 0.7833333333333333
  Params: 
    booster: dart
    lambda: 0.017102894311724036
    alpha: 1.971844340310387e-06
    subsample: 0.7424305498423763
    colsample_bytree: 0.3090103415678434
    max_depth: 19
    min_child_weight: 8
    eta: 0.009561876624569008
    gamma: 0.002600386057468784
    grow_policy: depthwise
    sample_type: uniform
    normalize_type: forest
    rate_drop: 0.002897536439363007
    skip_drop: 0.000644203748104513


Number of finished trials:  1022
Best trial:
  Value: 0.7916666666666666
  Params: 
    booster: dart
    lambda: 0.2553839284082131
    alpha: 2.963365197931572e-05
    subsample: 0.5476984339603794
    colsample_bytree: 0.5739754370625748
    max_depth: 11
    min_child_weight: 3
    eta: 6.065247920269048e-06
    gamma: 0.7336338498338038
    grow_policy: lossguide
    sample_type: weighted
    normalize_type: forest
    rate_drop: 3.9072533559090704e-07
    skip_drop: 0.00014872786300187534

In [18]:
#Number of finished trials:  1022
#Best trial:
#  Value: 0.7916666666666666

dtrain = xgb.DMatrix(train_x, train_y)
dvalid = xgb.DMatrix(test_x)

param={'Pbooster': 'dart',
'lambda': 0.2553839284082131,
'alpha': 2.963365197931572e-05,
'subsample': 0.5476984339603794,
'colsample_bytree': 0.5739754370625748,
'max_depth': 11,
'min_child_weight': 3,
'eta': 6.065247920269048e-06,
'gamma': 0.7336338498338038,
'grow_policy': 'lossguide',
'sample_type': 'weighted',
'normalize_type': 'forest',
'rate_drop': 3.9072533559090704e-07,
'skip_drop': 0.00014872786300187534}

bst = xgb.train(param, dtrain)
preds = bst.predict(dvalid)
xgb_optuna_labels = np.rint(preds)
#accuracy = sklearn.metrics.accuracy_score(y_val, xgb_optuna_labels)

In [19]:
submit = pd.read_csv('./sample_submission.csv')
submit['Y_Class'] = xgb_optuna_labels
submit.to_csv('./xgb_optuna_submission.csv', index=False)

### 👉 XGBClassifier

In [20]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

XGBClass = XGBClassifier(n_estimators=500, max_depth=2, learning_rate=0.01, objective='multi:softmax').fit(x_train, y_train)
print('\n\n', 'XGBClassifier >>>>> Done', '\n\n')
xgb_pred = XGBClass.predict(x_val)



 XGBClassifier >>>>> Done 




#### optuna

In [23]:
import sklearn.metrics
from sklearn.model_selection import train_test_split
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

n_trials = 10000

def objective_xgbc(trial):

    xgb_max_depth = trial.suggest_int("rf_max_depth", 2, 10, log=True)
    xgb_n_estimators = trial.suggest_int("n_estimators", 1, 50, log=True)
    xgb_learning_rate = trial.suggest_float("learning_rate", low=0.001, high=0.1, step=0.01)

    classifier_obj = XGBClassifier(max_depth=xgb_max_depth,
                                n_estimators=xgb_n_estimators,
                                learning_rate = xgb_learning_rate,
                                objective='multi:softmax')
    
    bst = classifier_obj.fit(x_train, y_train)
    preds = bst.predict(x_val)
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(y_val, pred_labels)
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_xgbc, n_trials=n_trials, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2023-02-06 18:57:52,299][0m A new study created in memory with name: no-name-90408d8f-ae1d-476d-9760-c75465628486[0m
[32m[I 2023-02-06 18:57:52,522][0m Trial 0 finished with value: 0.6833333333333333 and parameters: {'rf_max_depth': 2, 'n_estimators': 11, 'learning_rate': 0.091}. Best is trial 0 with value: 0.6833333333333333.[0m
[32m[I 2023-02-06 18:57:53,620][0m Trial 1 finished with value: 0.7 and parameters: {'rf_max_depth': 5, 'n_estimators': 36, 'learning_rate': 0.031}. Best is trial 1 with value: 0.7.[0m
[32m[I 2023-02-06 18:57:54,116][0m Trial 2 finished with value: 0.6833333333333333 and parameters: {'rf_max_depth': 6, 'n_estimators': 11, 'learning_rate': 0.041}. Best is trial 1 with value: 0.7.[0m
[32m[I 2023-02-06 18:57:54,305][0m Trial 3 finished with value: 0.6916666666666667 and parameters: {'rf_max_depth': 3, 'n_estimators': 7, 'learning_rate': 0.081}. Best is trial 1 with value: 0.7.[0m
[32m[I 2023-02-06 18:57:55,088][0m Trial 4 finished with val

Number of finished trials:  309
Best trial:
  Value: 0.7416666666666667
  Params: 
    rf_max_depth: 10
    n_estimators: 50
    learning_rate: 0.091


0.75  

Number of finished trials:  100
Best trial:
  Value: 0.75
  Params: 
    rf_max_depth: 7
    n_estimators: 18
    learning_rate: 0.051000000000000004

#### feature importance : 의미 X

In [22]:
import seaborn as sns
import matplotlib.pyplot as plt

ft_importance_values = XGBClassifier.feature_importances_

# 정렬과 시각화를 쉽게 하기 위해 series 전환
ft_series = pd.Series(ft_importance_values, index = train_x.columns)
ft_top141 = ft_series.sort_values(ascending=False)[:141]

# 시각화
plt.figure(figsize=(8,6))
plt.title('Feature Importance Top 141')
sns.barplot(x=ft_top141, y=ft_top141.index)
plt.show()

TypeError: '<' not supported between instances of 'property' and 'property'

In [None]:
fi_col_top141 = ft_top141.index.tolist()

x_train_fi = x_train[fi_col_top141]
x_val_fi = x_val[fi_col_top141]

In [None]:
XGBClass = XGBClassifier(n_estimators=500, max_depth=2, learning_rate=0.01, objective='multi:softmax').fit(x_train, y_train)
print('\n\n', 'XGBClassifier : origin >>>>> Done', '\n\n')
xgb_pred = XGBClass.predict(x_val)
print(classification_report(xgb_pred, y_val))

XGBClass_fi = XGBClassifier(n_estimators=500, max_depth=2, learning_rate=0.01, objective='multi:softmax').fit(x_train_fi, y_train)
print('\n\n', 'XGBClassifier : feature importance >>>>> Done', '\n\n')
xgb_pred = XGBClass_fi.predict(x_val_fi)
print(classification_report(xgb_pred, y_val))



 XGBClassifier : origin >>>>> Done 


              precision    recall  f1-score   support

           0       0.26      0.86      0.40         7
           1       0.96      0.70      0.81       104
           2       0.33      0.78      0.47         9

    accuracy                           0.72       120
   macro avg       0.52      0.78      0.56       120
weighted avg       0.87      0.72      0.76       120



 XGBClassifier : feature importance >>>>> Done 


              precision    recall  f1-score   support

           0       0.26      0.86      0.40         7
           1       0.96      0.70      0.81       104
           2       0.33      0.78      0.47         9

    accuracy                           0.72       120
   macro avg       0.52      0.78      0.56       120
weighted avg       0.87      0.72      0.76       120



### ✅Test

In [None]:


XGB = XGBClassifier(n_estimators=500, max_depth=2, learning_rate=0.01, objective='multi:softmax').fit(train_x, train_y)
print('\n\n', 'XGBClassifier >>>>> Done', '\n\n')
xgb_pred = XGB.predict(test_x)



 XGBClassifier >>>>> Done 




In [None]:
submit = pd.read_csv('./sample_submission.csv')
submit['Y_Class'] = xgb_pred
submit.to_csv('./xgb_submission.csv', index=False)

## Inference

In [None]:
preds = RF.predict(test_x)
print('Done.')

Done.


## Submit

In [None]:
submit = pd.read_csv('./sample_submission.csv')

submit['Y_Class'] = preds

submit.to_csv('./baseline_submission.csv', index=False)