## Load Data

In [137]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

#train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Class', 'TIMESTAMP'])
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [138]:
train_x.shape

(598, 2878)

## Preprocessing

In [139]:
from sklearn.preprocessing import LabelEncoder

qual_col = ['LINE','PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])

    for label in np.unique(test_x[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    
    test_x[i] = le.transform(test_x[i])

print('done')

done


In [140]:
train_x.shape

(598, 2878)

### split

In [141]:
train_x

Unnamed: 0,Y_Class,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,1,2,0,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,2,3,0,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,1,2,0,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,2,3,0,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,1,2,0,,,,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,1,5,2,2.0,95.0,0.0,45.0,10.0,0.0,50.0,...,,,,,,,,,,
594,0,2,0,,,,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,0,2,0,,,,,,,,...,,,,,,1.0,,,,
596,1,4,1,40.0,94.0,0.0,45.0,11.0,0.0,45.0,...,,,,,,,,,,


In [142]:
## PRODUCT_CODE

# train
train_x_1 = train_x[train_x['PRODUCT_CODE'] == 0].drop('Y_Class', axis=1)
train_x_2 = train_x[train_x['PRODUCT_CODE'] == 1].drop('Y_Class', axis=1)
train_x_3 = train_x[train_x['PRODUCT_CODE'] == 2].drop('Y_Class', axis=1)

train_y_1 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 0]
train_y_2 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 1]
train_y_3 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 2]

print('train_x_1 shape :', train_x_1.shape,
      '\ntrain_x_2 shape :', train_x_2.shape,
      '\ntrain_x_3 shape :', train_x_3.shape)

# test
test_x_1 = test_x[test_x['PRODUCT_CODE'] == 0]
test_x_2 = test_x[test_x['PRODUCT_CODE'] == 1]
test_x_3 = test_x[test_x['PRODUCT_CODE'] == 2]



# LINE

## TRAIN

# line 1
test_x_1_1 = test_x_1[test_x_1['LINE'] == 0]

# line 2
test_x_1_2 = test_x_1[test_x_1['LINE'] == 1]

# line 3
test_x_1_3 = test_x_1[test_x_1['LINE'] == 2]

# line 4
test_x_1_4 = test_x_1[test_x_1['LINE'] == 3]

# line 5
test_x_2_5 = test_x_2[test_x_2['LINE'] == 4]
test_x_3_5 = test_x_3[test_x_3['LINE'] == 4]

# line 6
test_x_2_6 = test_x_2[test_x_2['LINE'] == 5]
test_x_3_6 = test_x_3[test_x_3['LINE'] == 5]




## TRAIN

# line 1
train_x_1_1 = train_x_1[train_x_1['LINE'] == 0]

# line 2
train_x_1_2 = train_x_1[train_x_1['LINE'] == 1]

# line 3
train_x_1_3 = train_x_1[train_x_1['LINE'] == 2]

# line 4
train_x_1_4 = train_x_1[train_x_1['LINE'] == 3]

# line 5
train_x_2_5 = train_x_2[train_x_2['LINE'] == 4]
train_x_3_5 = train_x_3[train_x_3['LINE'] == 4]

# line 6
train_x_2_6 = train_x_2[train_x_2['LINE'] == 5]
train_x_3_6 = train_x_3[train_x_3['LINE'] == 5]


train_set = [train_x_1_1, train_x_1_2, train_x_1_3, train_x_1_4, train_x_2_5, train_x_3_5, train_x_2_6, train_x_3_6]
test_set = [test_x_1_1, test_x_1_2, test_x_1_3, test_x_1_4, test_x_2_5, test_x_3_5, test_x_2_6, test_x_3_6]

print('train')
for set in train_set:
    print(set.shape)

print('\n\ntest')
for set in test_set:
    print(set.shape)    

train_x_1 shape : (249, 2877) 
train_x_2 shape : (6, 2877) 
train_x_3 shape : (343, 2877)
train
(59, 2877)
(70, 2877)
(78, 2877)
(42, 2877)
(3, 2877)
(172, 2877)
(3, 2877)
(171, 2877)


test
(14, 2877)
(14, 2877)
(13, 2877)
(26, 2877)
(3, 2877)
(108, 2877)
(1, 2877)
(131, 2877)


In [143]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

train_scale = []

for train in train_set:
    scaler.fit_transform(train)
    train = pd.DataFrame(data=scaler.fit_transform(train), columns=train.columns, index=train.index)
    train_scale.append(train)


for set in train_scale:
    print(set.shape)

test_scale = []

for test in test_set:
    scaler.fit_transform(test)
    train = pd.DataFrame(data=scaler.fit_transform(test), columns=test.columns, index=test.index)
    test_scale.append(test)


for set in test_scale:
    print(set.shape)

(59, 2877)
(70, 2877)
(78, 2877)
(42, 2877)
(3, 2877)
(172, 2877)
(3, 2877)
(171, 2877)
(14, 2877)
(14, 2877)
(13, 2877)
(26, 2877)
(3, 2877)
(108, 2877)
(1, 2877)
(131, 2877)


In [144]:
train_x_scale = pd.concat(train_scale, axis=0).sort_index()
test_scale = pd.concat(test_scale, axis=0).sort_index()


print(train_x_scale.shape)
print(test_scale.shape)

(598, 2877)
(310, 2877)


In [145]:
"""
new features 
"""

train_x_scale['LINE_PRODUCT_CODE'] = train_x_scale[['LINE','PRODUCT_CODE']].apply(lambda x: '-'.join(x.astype(str)),axis=1)
test_scale['LINE_PRODUCT_CODE'] = test_scale[['LINE','PRODUCT_CODE']].apply(lambda x: '-'.join(x.astype(str)),axis=1)

encoder = LabelEncoder()

train_x_scale['LINE_PRODUCT_CODE'] = encoder.fit_transform(train_x_scale['LINE_PRODUCT_CODE'])
test_scale['LINE_PRODUCT_CODE'] = encoder.fit(test_scale['LINE_PRODUCT_CODE'])



train_x_scale.drop(['LINE','PRODUCT_CODE'], axis=1, inplace=True)
test_scale.drop(['LINE','PRODUCT_CODE'], axis=1, inplace=True)

In [146]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x_scale, train_y, test_size=0.2, random_state=42)

## Model

XGBClassifier

In [None]:
'''from xgboost import XGBClassifier # 회귀트리

xgb = XGBClassifier()
xgb.fit(x_t, y_t)
pred = xgb.predict(x_v)

from sklearn.metrics import accuracy_score

accuracy_score(pred, y_v)'''

'from xgboost import XGBClassifier # 회귀트리\n\nxgb = XGBClassifier()\nxgb.fit(x_t, y_t)\npred = xgb.predict(x_v)\n\nfrom sklearn.metrics import accuracy_score\n\naccuracy_score(pred, y_v)'

In [147]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_estimators=200, boost_from_average=False)
lgbm.fit(x_t, y_t)
pred = lgbm.predict(x_v)

from sklearn.metrics import accuracy_score

accuracy_score(pred, y_v)

0.7416666666666667

## submit

In [46]:
from xgboost import XGBClassifier # 회귀트리


lgbm.fit(train_x, train_y)
pred = lgbm.predict(test_x)

submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('LGBM_notnull_submission.csv', index=False)