## Load Data

In [98]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

#train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Class', 'TIMESTAMP'])
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Preprocessing

### LabelEncoder

In [99]:
from sklearn.preprocessing import LabelEncoder

qual_col = ['LINE','PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])

    for label in np.unique(test_x[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    
    test_x[i] = le.transform(test_x[i])

print('done')

done


In [100]:
train_x_drop = train_x.drop('Y_Class', axis=1)

from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x_drop, train_y, test_size=0.2, random_state=42)



from sklearn.metrics import accuracy_score, confusion_matrix


from xgboost import XGBClassifier # 회귀트리
xgb = XGBClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBClassifier :',accuracy_score(pred, y_v))


from xgboost import XGBRFClassifier
xgb = XGBRFClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBRFClassifier :',accuracy_score(pred, y_v))


from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(x_t, y_t)
pred = lgbm.predict(x_v)
print('LGBMClassifier :',accuracy_score(pred, y_v))

XGBClassifier : 0.7166666666666667
XGBRFClassifier : 0.7583333333333333
LGBMClassifier : 0.7333333333333333


### split

In [101]:
## PRODUCT_CODE

# train
train_x_1 = train_x[train_x['PRODUCT_CODE'] == 0].drop('Y_Class', axis=1)
train_x_2 = train_x[train_x['PRODUCT_CODE'] == 1].drop('Y_Class', axis=1)
train_x_3 = train_x[train_x['PRODUCT_CODE'] == 2].drop('Y_Class', axis=1)

train_y_1 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 0]
train_y_2 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 1]
train_y_3 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 2]

print('train_x_1 shape :', train_x_1.shape,
      '\ntrain_x_2 shape :', train_x_2.shape,
      '\ntrain_x_3 shape :', train_x_3.shape)

# test
test_x_1 = test_x[test_x['PRODUCT_CODE'] == 0]
test_x_2 = test_x[test_x['PRODUCT_CODE'] == 1]
test_x_3 = test_x[test_x['PRODUCT_CODE'] == 2]



# LINE

## TRAIN

# line 1
test_x_1_1 = test_x_1[test_x_1['LINE'] == 0]

# line 2
test_x_1_2 = test_x_1[test_x_1['LINE'] == 1]

# line 3
test_x_1_3 = test_x_1[test_x_1['LINE'] == 2]

# line 4
test_x_1_4 = test_x_1[test_x_1['LINE'] == 3]

# line 5
test_x_2_5 = test_x_2[test_x_2['LINE'] == 4]
test_x_3_5 = test_x_3[test_x_3['LINE'] == 4]

# line 6
test_x_2_6 = test_x_2[test_x_2['LINE'] == 5]
test_x_3_6 = test_x_3[test_x_3['LINE'] == 5]




## TRAIN

# line 1
train_x_1_1 = train_x_1[train_x_1['LINE'] == 0]

# line 2
train_x_1_2 = train_x_1[train_x_1['LINE'] == 1]

# line 3
train_x_1_3 = train_x_1[train_x_1['LINE'] == 2]

# line 4
train_x_1_4 = train_x_1[train_x_1['LINE'] == 3]

# line 5
train_x_2_5 = train_x_2[train_x_2['LINE'] == 4]
train_x_3_5 = train_x_3[train_x_3['LINE'] == 4]

# line 6
train_x_2_6 = train_x_2[train_x_2['LINE'] == 5]
train_x_3_6 = train_x_3[train_x_3['LINE'] == 5]


train_set = [train_x_1_1, train_x_1_2, train_x_1_3, train_x_1_4, train_x_2_5, train_x_3_5, train_x_2_6, train_x_3_6]
test_set = [test_x_1_1, test_x_1_2, test_x_1_3, test_x_1_4, test_x_2_5, test_x_3_5, test_x_2_6, test_x_3_6]

print('train')
for set in train_set:
    print(set.shape)

print('\n\ntest')
for set in test_set:
    print(set.shape)    

train_x_1 shape : (249, 2877) 
train_x_2 shape : (6, 2877) 
train_x_3 shape : (343, 2877)
train
(59, 2877)
(70, 2877)
(78, 2877)
(42, 2877)
(3, 2877)
(172, 2877)
(3, 2877)
(171, 2877)


test
(14, 2877)
(14, 2877)
(13, 2877)
(26, 2877)
(3, 2877)
(108, 2877)
(1, 2877)
(131, 2877)


#### null - mean

In [102]:
train_set_mean = train_set
test_set_mean = test_set

for set in train_set_mean:
    cols = set.columns
    for col in cols:
        set[col] = set[col].fillna(set[col].mean())

for set in test_set_mean:
    cols = set.columns
    for col in cols:
        set[col] = set[col].fillna(set[col].mean())

train_x = pd.concat(train_set_mean, axis=0).sort_index()
test_x = pd.concat(test_set_mean, axis=0).sort_index()

In [103]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x_drop, train_y, test_size=0.2, random_state=42)



from sklearn.metrics import accuracy_score, confusion_matrix


from xgboost import XGBClassifier # 회귀트리
xgb = XGBClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBClassifier :',accuracy_score(pred, y_v))


from xgboost import XGBRFClassifier
xgb = XGBRFClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBRFClassifier :',accuracy_score(pred, y_v))


from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(x_t, y_t)
pred = lgbm.predict(x_v)
print('LGBMClassifier :',accuracy_score(pred, y_v))

XGBClassifier : 0.7166666666666667
XGBRFClassifier : 0.7583333333333333
LGBMClassifier : 0.7333333333333333


#### null - mode

In [104]:
train_set_mode = train_set
test_set_mode = test_set

for set in train_set_mode:
    cols = set.columns
    for col in cols:
        set[col] = set[col].fillna(set[col].mode())

for set in test_set_mode:
    cols = set.columns
    for col in cols:
        set[col] = set[col].fillna(set[col].mode())

train_x = pd.concat(train_set_mode, axis=0).sort_index()
test_x = pd.concat(test_set_mode, axis=0).sort_index()

In [106]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x, train_y, test_size=0.2, random_state=42)



from sklearn.metrics import accuracy_score, confusion_matrix


from xgboost import XGBClassifier # 회귀트리
xgb = XGBClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBClassifier :',accuracy_score(pred, y_v))


from xgboost import XGBRFClassifier
xgb = XGBRFClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBRFClassifier :',accuracy_score(pred, y_v))


from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(x_t, y_t)
pred = lgbm.predict(x_v)
print('LGBMClassifier :',accuracy_score(pred, y_v))

XGBClassifier : 0.7583333333333333
XGBRFClassifier : 0.7333333333333333
LGBMClassifier : 0.7416666666666667


## train / valid split

In [71]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

## Model

In [72]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x, train_y, test_size=0.2, random_state=42)



from sklearn.metrics import accuracy_score, confusion_matrix


from xgboost import XGBClassifier # 회귀트리
xgb = XGBClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBClassifier :',accuracy_score(pred, y_v))




from xgboost import XGBRFClassifier
xgb = XGBRFClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)
print('XGBRFClassifier :',accuracy_score(pred, y_v))



from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(x_t, y_t)
pred = lgbm.predict(x_v)
print('LGBMClassifier :',accuracy_score(pred, y_v))

XGBClassifier : 0.7583333333333333
XGBRFClassifier : 0.7333333333333333
LGBMClassifier : 0.7416666666666667


## submit

In [18]:
from xgboost import XGBClassifier # 회귀트리

xgb = XGBClassifier()
xgb.fit(train_x, train_y)
pred = xgb.predict(test_x)


submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('XGBoost_notnull_submission.csv', index=False)



In [8]:
from xgboost import XGBRFClassifier

xgb = XGBRFClassifier()
xgb.fit(train_x, train_y)
pred = xgb.predict(test_x)


submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('XGBoost_notnull_XGBRFC.csv', index=False)