## Import

In [9]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 파이썬 ≥3.5 필수
import sys
assert sys.version_info >= (3, 5)

# 사이킷런 ≥0.20 필수
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version은 코랩 명령입니다.
    %tensorflow_version 2.x
    %pip install -q -U tfx
    print("패키지 호환 에러는 무시해도 괜찮습니다.")
except Exception:
    pass

# 텐서플로 ≥2.0 필수
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# 공통 모듈 임포트
import numpy as np
import os

# 노트북 실행 결과를 동일하게 유지하기 위해
np.random.seed(42)

# 깔끔한 그래프 출력을 위해
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [10]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [11]:
import pandas as pd
import numpy as np


train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')


# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_df[i])
    train_df[i] = le.transform(train_df[i])
    
    for label in np.unique(test_df[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_df[i] = le.transform(test_df[i]) 
print('Done.')


train_df = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP'])
#train_y = train_df['Y_Class']

test_df = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

Done.


In [12]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
train_df = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP'])
test_df = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_df[i])
    train_df[i] = le.transform(train_df[i])
    
    for label in np.unique(test_df[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_df[i] = le.transform(test_df[i]) 
print('Done.')

train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

train_df_val = train_df.values
test_df_val = test_df.values

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


X_train_full, X_test, y_train_full, y_test = train_test_split(
   train_df_val[:, 1:],train_df_val[:, 1].reshape(-1,1), random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

Done.


## Data Pre-processing

## Split train / valid

In [13]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "LG")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [25]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = train_df.columns.drop('Y_Class') + ["MedianValue"]
header = ",".join(header_cols)

In [26]:
train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=10)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=5)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=5)

In [27]:
import pandas as pd

pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,LINEMedianValue,PRODUCT_CODEMedianValue,X_1MedianValue,X_2MedianValue,X_3MedianValue,X_4MedianValue,X_5MedianValue,X_6MedianValue,X_7MedianValue,X_8MedianValue,...,X_2866MedianValue,X_2867MedianValue,X_2868MedianValue,X_2869MedianValue,X_2870MedianValue,X_2871MedianValue,X_2872MedianValue,X_2873MedianValue,X_2874MedianValue,X_2875MedianValue
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,52.24,55.33,57.49,67.31,1.0,0.0,0.0,0.0,0.0,2.0
4.0,2.0,2.0,101.0,0.0,45.0,11.0,0.0,45.0,10.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4.0,2.0,2.0,102.0,0.0,45.0,11.0,0.0,45.0,10.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,57.1,51.49,56.08,64.4,1.0,0.0,0.0,0.0,0.0,2.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,59.32,40.6,53.06,65.26,1.0,0.0,0.0,0.0,0.0,2.0


In [28]:
train_filepaths

['datasets/LG/my_train_00.csv',
 'datasets/LG/my_train_01.csv',
 'datasets/LG/my_train_02.csv',
 'datasets/LG/my_train_03.csv',
 'datasets/LG/my_train_04.csv',
 'datasets/LG/my_train_05.csv',
 'datasets/LG/my_train_06.csv',
 'datasets/LG/my_train_07.csv',
 'datasets/LG/my_train_08.csv',
 'datasets/LG/my_train_09.csv']

## 입력 파이프라인 만들기

In [29]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [30]:
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'datasets/LG/my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_03.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_04.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_08.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_06.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/LG/my_train_09.csv', shape=(), dtype=string)


In [31]:
n_readers = 2
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)

In [32]:
@tf.function
def preprocess(line):

    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y

In [33]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

In [35]:
tf.random.set_seed(42)

n_inputs = 5

train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

NotImplementedError: in user code:

    File "/var/folders/wk/gpgcc1357zsgh9w3qbfjsl0m0000gn/T/ipykernel_94720/2291767887.py", line 8, in preprocess  *
        return (x - X_mean) / X_std, y

    NotImplementedError: Cannot convert a symbolic tf.Tensor (stack:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported.


In [None]:
import pandas as pd

pd.read_csv(train_filepaths[0]).head()

## Classification Model Fit

### ✅Test

In [None]:
XGB = XGBClassifier(n_estimators=500, max_depth=2, learning_rate=0.01, objective='multi:softmax').fit(train_x, train_y)
print('\n\n', 'XGBClassifier >>>>> Done', '\n\n')
xgb_pred = XGB.predict(test_x)

In [None]:
submit = pd.read_csv('./sample_submission.csv')
submit['Y_Class'] = xgb_pred
submit.to_csv('./xgb_submission.csv', index=False)

## Inference

In [None]:
preds = RF.predict(test_x)
print('Done.')

## Submit

In [None]:
submit = pd.read_csv('./sample_submission.csv')

submit['Y_Class'] = preds

submit.to_csv('./baseline_submission.csv', index=False)