## Load Data

In [213]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

#train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Class', 'TIMESTAMP'])
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP', 'Y_Class'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Preprocessing

파생변수 생성

In [214]:
"""
new features 
"""

train_x['LINE_PRODUCT_CODE'] = train_x[['LINE','PRODUCT_CODE']].apply(lambda x: '-'.join(x.astype(str)),axis=1)
test_x['LINE_PRODUCT_CODE'] = test_x[['LINE','PRODUCT_CODE']].apply(lambda x: '-'.join(x.astype(str)),axis=1)

train_x.drop(['LINE','PRODUCT_CODE'], axis=1, inplace=True)
test_x.drop(['LINE','PRODUCT_CODE'], axis=1, inplace=True)

LabelEncoder

In [215]:
from sklearn.preprocessing import LabelEncoder

qual_col = ['LINE_PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])

    for label in np.unique(test_x[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    
    test_x[i] = le.transform(test_x[i])

print('done')

done


transformation

In [216]:
'''train_x_mun = train_x.drop(['LINE', 'PRODUCT_CODE', 'LINE_PRODUCT_CODE'], axis=1)

for var in train_x_mun.columns:
    
    train_x[var+'_log1'] = np.log(train_x[var]+1)
    test_x[var+'_log1'] = np.log(test_x[var]+1)


train_x.drop(train_x_mun.columns, axis=1, inplace=True)
test_x.drop(train_x_mun.columns, axis=1, inplace=True)

print(train_x.shape)
print(test_x.shape)'''

"train_x_mun = train_x.drop(['LINE', 'PRODUCT_CODE', 'LINE_PRODUCT_CODE'], axis=1)\n\nfor var in train_x_mun.columns:\n    \n    train_x[var+'_log1'] = np.log(train_x[var]+1)\n    test_x[var+'_log1'] = np.log(test_x[var]+1)\n\n\ntrain_x.drop(train_x_mun.columns, axis=1, inplace=True)\ntest_x.drop(train_x_mun.columns, axis=1, inplace=True)\n\nprint(train_x.shape)\nprint(test_x.shape)"

normalization

In [217]:
# XGBClassifier : up
# XGBRFClassifier : -
# LGBMClassifier : down

train_x_mun = train_x.drop(['LINE_PRODUCT_CODE'], axis=1)

for var in train_x_mun.columns:
    
    X_min = train_x[var].min()
    X_max = train_x[var].max()
    train_x[var] = (train_x[var] - X_min) / (X_max-X_min)
    test_x[var] = (test_x[var] - X_min) / (X_max-X_min)


print(train_x.shape)
print(train_y.shape)

(598, 2876)
(598,)


over sampling

In [218]:
'''from imblearn.over_sampling import SMOTE

oversmapling_instance = SMOTE(k_neighbors = 3)

# apply
o_train_x, o_train_y = oversmapling_instance.fit(train_x, train_y)

# dataframe
o_train_x = pd.DataFrame(o_train_x, columns=train_x.columns)
o_train_y = pd.DataFrame(o_train_y, columns=train_y.columns)'''

'from imblearn.over_sampling import SMOTE\n\noversmapling_instance = SMOTE(k_neighbors = 3)\n\n# apply\no_train_x, o_train_y = oversmapling_instance.fit(train_x, train_y)\n\n# dataframe\no_train_x = pd.DataFrame(o_train_x, columns=train_x.columns)\no_train_y = pd.DataFrame(o_train_y, columns=train_y.columns)'

size up

In [219]:
'''train_x_copy = train_x.copy()
train_y_copy = train_y.copy()

train_x = pd.concat([train_x, train_x_copy], axis=0)
train_y= pd.concat([train_y, train_y_copy], axis=0)


print(train_x.shape)
print(train_y.shape)'''

'train_x_copy = train_x.copy()\ntrain_y_copy = train_y.copy()\n\ntrain_x = pd.concat([train_x, train_x_copy], axis=0)\ntrain_y= pd.concat([train_y, train_y_copy], axis=0)\n\n\nprint(train_x.shape)\nprint(train_y.shape)'

train / valid split

In [220]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [221]:
'''x_t_copy = x_t.copy()
y_t_copy = y_t.copy()

x_t = pd.concat([x_t, x_t_copy], axis=0)
y_t= pd.concat([y_t, y_t_copy], axis=0)


print(x_t.shape)
print(y_t.shape)'''

'x_t_copy = x_t.copy()\ny_t_copy = y_t.copy()\n\nx_t = pd.concat([x_t, x_t_copy], axis=0)\ny_t= pd.concat([y_t, y_t_copy], axis=0)\n\n\nprint(x_t.shape)\nprint(y_t.shape)'

## Model

XGBClassifier

In [224]:
from sklearn.metrics import accuracy_score, confusion_matrix


from xgboost import XGBClassifier # 회귀트리
xgb = XGBClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBClassifier :',accuracy_score(pred, y_v))
print(confusion_matrix(pred, y_v))




from xgboost import XGBRFClassifier
xgb = XGBRFClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBRFClassifier :',accuracy_score(pred, y_v))
print(confusion_matrix(pred, y_v))



from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(x_t, y_t)
pred = lgbm.predict(x_v)
print('LGBMClassifier :',accuracy_score(pred, y_v))
print(confusion_matrix(pred, y_v))

XGBClassifier : 0.7333333333333333
[[ 8  1  2]
 [14 72 11]
 [ 1  3  8]]
XGBRFClassifier : 0.7666666666666667
[[10  1  0]
 [11 72 11]
 [ 2  3 10]]
LGBMClassifier : 0.725
[[ 8  1  0]
 [15 71 13]
 [ 0  4  8]]


## submit

In [18]:
from xgboost import XGBClassifier # 회귀트리

xgb = XGBClassifier()
xgb.fit(train_x, train_y)
pred = xgb.predict(test_x)


submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('XGBoost_notnull_submission.csv', index=False)



In [8]:
from xgboost import XGBRFClassifier

xgb = XGBRFClassifier()
xgb.fit(train_x, train_y)
pred = xgb.predict(test_x)


submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('XGBoost_notnull_XGBRFC.csv', index=False)