## Load Data

In [36]:
import random
import os
import warnings
import numpy as np
import pandas as pd

import time
import datetime 

# datetime 라이브러리 import
#from keras.utils import np_utils
#from sklearn.preprocessing import LabelEncoder


warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed = 42

seed_everything(seed) # Seed 고정

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

#train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Class', 'TIMESTAMP'])
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Preprocessing

### LabelEncoder

In [37]:
from sklearn.preprocessing import LabelEncoder

qual_col = ['LINE','PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])

    for label in np.unique(test_x[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    
    test_x[i] = le.transform(test_x[i])

print('done')

done


### split

In [3]:
## PRODUCT_CODE

# train
train_x_1 = train_x[train_x['PRODUCT_CODE'] == 0].drop('Y_Class', axis=1)
train_x_2 = train_x[train_x['PRODUCT_CODE'] == 1].drop('Y_Class', axis=1)
train_x_3 = train_x[train_x['PRODUCT_CODE'] == 2].drop('Y_Class', axis=1)

train_y_1 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 0]
train_y_2 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 1]
train_y_3 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 2]

# test
test_x_1 = test_x[test_x['PRODUCT_CODE'] == 0]
test_x_2 = test_x[test_x['PRODUCT_CODE'] == 1]
test_x_3 = test_x[test_x['PRODUCT_CODE'] == 2]



# LINE

## TRAIN

# line 1
test_x_1_1 = test_x_1[test_x_1['LINE'] == 0]

# line 2
test_x_1_2 = test_x_1[test_x_1['LINE'] == 1]

# line 3
test_x_1_3 = test_x_1[test_x_1['LINE'] == 2]

# line 4
test_x_1_4 = test_x_1[test_x_1['LINE'] == 3]

# line 5
test_x_2_5 = test_x_2[test_x_2['LINE'] == 4]
test_x_3_5 = test_x_3[test_x_3['LINE'] == 4]

# line 6
test_x_2_6 = test_x_2[test_x_2['LINE'] == 5]
test_x_3_6 = test_x_3[test_x_3['LINE'] == 5]




## TRAIN

# line 1
train_x_1_1 = train_x_1[train_x_1['LINE'] == 0]

# line 2
train_x_1_2 = train_x_1[train_x_1['LINE'] == 1]

# line 3
train_x_1_3 = train_x_1[train_x_1['LINE'] == 2]

# line 4
train_x_1_4 = train_x_1[train_x_1['LINE'] == 3]

# line 5
train_x_2_5 = train_x_2[train_x_2['LINE'] == 4]
train_x_3_5 = train_x_3[train_x_3['LINE'] == 4]

# line 6
train_x_2_6 = train_x_2[train_x_2['LINE'] == 5]
train_x_3_6 = train_x_3[train_x_3['LINE'] == 5]


train_set = [train_x_1_1, train_x_1_2, train_x_1_3, train_x_1_4, train_x_2_5, train_x_3_5, train_x_2_6, train_x_3_6]
test_set = [test_x_1_1, test_x_1_2, test_x_1_3, test_x_1_4, test_x_2_5, test_x_3_5, test_x_2_6, test_x_3_6]

print('done')

done


#### null - mean

In [4]:
train_set_mean = train_set
test_set_mean = test_set

for set in train_set_mean:
    cols = set.columns
    for col in cols:
        set[col] = set[col].fillna(set[col].mean())

for set in test_set_mean:
    cols = set.columns
    for col in cols:
        set[col] = set[col].fillna(set[col].mean())

train_x = pd.concat(train_set_mean, axis=0).sort_index()
test_x = pd.concat(test_set_mean, axis=0).sort_index()

### 파생변수 생성

In [5]:
train_x['LINE_PRODUCT_CODE'] = train_x[['LINE','PRODUCT_CODE']].apply(lambda x: '-'.join(x.astype(str)),axis=1)
test_x['LINE_PRODUCT_CODE'] = test_x[['LINE','PRODUCT_CODE']].apply(lambda x: '-'.join(x.astype(str)),axis=1)

train_x.drop(['LINE','PRODUCT_CODE'], axis=1, inplace=True)
test_x.drop(['LINE','PRODUCT_CODE'], axis=1, inplace=True)

le = LabelEncoder()
train_x['LINE_PRODUCT_CODE'] = le.fit_transform(train_x['LINE_PRODUCT_CODE'])
test_x['LINE_PRODUCT_CODE'] = le.transform(test_x['LINE_PRODUCT_CODE'])

### z-score

In [6]:
'''from scipy.stats import zscore

train_cols = train_x.columns.drop('LINE_PRODUCT_CODE')
test_cols = test_x.columns.drop('LINE_PRODUCT_CODE')

for col in train_cols:
    train_x[col] = zscore(train_x[col])

for col in test_cols:
    test_x[col] = zscore(test_x[col])'''

"from scipy.stats import zscore\n\ntrain_cols = train_x.columns.drop('LINE_PRODUCT_CODE')\ntest_cols = test_x.columns.drop('LINE_PRODUCT_CODE')\n\nfor col in train_cols:\n    train_x[col] = zscore(train_x[col])\n\nfor col in test_cols:\n    test_x[col] = zscore(test_x[col])"

### Normalize

In [8]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
train_x = scaler.fit_transform(train_x)
train_x = pd.DataFrame(train_x)


test_x = scaler.transform(test_x)
test_x = pd.DataFrame(train_x)

### fillna(0)

In [9]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

In [10]:
print(train_x.shape)
print(test_x.shape)

(598, 2876)
(598, 2876)


In [26]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [35]:
train_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2866,2867,2868,2869,2870,2871,2872,2873,2874,2875
0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.248647,0.000000,0.122283,0.890487,0.0,0.0,0.0,0.0,0.0,0.285714
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.300866,0.407899,0.164742,0.601770,0.0,0.0,0.0,0.0,0.0,0.428571
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.133929,0.355835,0.205163,0.922566,0.0,0.0,0.0,0.0,0.0,0.285714
3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.202110,0.704129,0.003057,0.559181,0.0,0.0,0.0,0.0,0.0,0.428571
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.275703,0.515978,0.088315,0.846239,0.0,0.0,0.0,0.0,0.0,0.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,0.009804,0.533333,0.0,0.0,0.0,0.0,0.294118,0.0,1.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.000000
594,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.578193,0.658169,0.835938,0.266593,0.0,0.0,0.0,0.0,0.0,0.285714
595,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.609582,0.578285,0.741934,0.277432,0.0,0.0,0.0,0.0,0.0,0.285714
596,0.382353,0.466667,0.0,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.571429


## Model

### tensor1

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [42]:
# For Keras, convert dataframe to array values (Inbuilt requirement of Keras)
X = train_x.values
Y = train_y.values

In [43]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(X, Y, test_size=0.2, random_state=42)

In [50]:
# First define baseline model. Then use it in Keras Classifier for the training
def baseline_model():
    # Create model here
    model = Sequential()
    model.add(Dense(128, input_dim = 2878, activation = 'relu')) # Rectified Linear Unit Activation Function
    model.add(Dense(128, activation = 'relu'))
    model.add(Dense(128, activation = 'relu'))
    model.add(Dropout(.1))
    model.add(Dense(3, activation = 'softmax')) # Softmax for multi-class classification
    # Compile model here
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [31]:
'''model = Sequential()
model.add(Dense(128, input_dim = 2876, activation = 'relu')) # Rectified Linear Unit Activation Function
model.add(Dense(128, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
#model.add(Dropout(.2))
model.add(Dense(3, activation = 'softmax')) # Softmax for multi-class classification
# Compile model here
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

model.summary()'''

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 128)               368256    
                                                                 
 dense_21 (Dense)            (None, 128)               16512     
                                                                 
 dense_22 (Dense)            (None, 128)               16512     
                                                                 
 dense_23 (Dense)            (None, 3)                 387       
                                                                 
Total params: 401,667
Trainable params: 401,667
Non-trainable params: 0
_________________________________________________________________


In [41]:
y_t   

145    1
9      2
374    1
521    1
188    2
      ..
71     1
106    1
270    1
435    2
102    1
Name: Y_Class, Length: 478, dtype: int64

In [49]:
model = baseline_model()
hist = model.fit(x_t, y_t, epochs=10, batch_size=10, validation_data=(x_v, y_v))

TypeError: Dimension value must be integer or None or have an __index__ method, got value '(2878, 1)' with type '<class 'tuple'>'

In [20]:
epoch = 100
batch_size = 10
n_splits = 5

estimator = KerasClassifier(build_fn = baseline_model, epochs = epoch, batch_size =batch_size, verbose = 0)
kfold = KFold(n_splits = n_splits, shuffle = True, random_state = seed)
results = cross_val_score(estimator, X, Y, cv = kfold)

print("Result: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

2023-02-18 18:35:22.023107: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-18 18:35:59.382754: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-18 18:35:59.672663: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-18 18:36:37.302792: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-18 18:36:37.581300: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-18 18:37:15.517615: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-18 18:37:15.798299: I tensorflow/core/grappler/optimizers/cust

Result: 73.24% (2.24%)


2023-02-18 18:38:30.879603: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


### tensor 2

In [None]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [None]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# For Keras, convert dataframe to array values (Inbuilt requirement of Keras)
X = train_x.values
Y = train_y.values

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [None]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

## submit

In [18]:
from xgboost import XGBClassifier # 회귀트리

xgb = XGBClassifier()
xgb.fit(train_x, train_y)
pred = xgb.predict(test_x)


submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('XGBoost_notnull_submission.csv', index=False)



In [8]:
from xgboost import XGBRFClassifier

xgb = XGBRFClassifier()
xgb.fit(train_x, train_y)
pred = xgb.predict(test_x)


submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('XGBoost_notnull_XGBRFC.csv', index=False)