## Load Data

In [100]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

#train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Class', 'TIMESTAMP'])
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Preprocessing

### LabelEncoder

In [101]:
from sklearn.preprocessing import LabelEncoder

qual_col = ['LINE','PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])

    for label in np.unique(test_x[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    
    test_x[i] = le.transform(test_x[i])

print('done')

done


### split

In [102]:
## PRODUCT_CODE

# train
train_x_1 = train_x[train_x['PRODUCT_CODE'] == 0].drop('Y_Class', axis=1)
train_x_2 = train_x[train_x['PRODUCT_CODE'] == 1].drop('Y_Class', axis=1)
train_x_3 = train_x[train_x['PRODUCT_CODE'] == 2].drop('Y_Class', axis=1)

train_y_1 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 0]
train_y_2 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 1]
train_y_3 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 2]

print('train_x_1 shape :', train_x_1.shape,
      '\ntrain_x_2 shape :', train_x_2.shape,
      '\ntrain_x_3 shape :', train_x_3.shape)

# test
test_x_1 = test_x[test_x['PRODUCT_CODE'] == 0]
test_x_2 = test_x[test_x['PRODUCT_CODE'] == 1]
test_x_3 = test_x[test_x['PRODUCT_CODE'] == 2]



# LINE

## TRAIN

# line 1
test_x_1_1 = test_x_1[test_x_1['LINE'] == 0]

# line 2
test_x_1_2 = test_x_1[test_x_1['LINE'] == 1]

# line 3
test_x_1_3 = test_x_1[test_x_1['LINE'] == 2]

# line 4
test_x_1_4 = test_x_1[test_x_1['LINE'] == 3]

# line 5
test_x_2_5 = test_x_2[test_x_2['LINE'] == 4]
test_x_3_5 = test_x_3[test_x_3['LINE'] == 4]

# line 6
test_x_2_6 = test_x_2[test_x_2['LINE'] == 5]
test_x_3_6 = test_x_3[test_x_3['LINE'] == 5]




## TRAIN

# line 1
train_x_1_1 = train_x_1[train_x_1['LINE'] == 0]

# line 2
train_x_1_2 = train_x_1[train_x_1['LINE'] == 1]

# line 3
train_x_1_3 = train_x_1[train_x_1['LINE'] == 2]

# line 4
train_x_1_4 = train_x_1[train_x_1['LINE'] == 3]

# line 5
train_x_2_5 = train_x_2[train_x_2['LINE'] == 4]
train_x_3_5 = train_x_3[train_x_3['LINE'] == 4]

# line 6
train_x_2_6 = train_x_2[train_x_2['LINE'] == 5]
train_x_3_6 = train_x_3[train_x_3['LINE'] == 5]


train_set = [train_x_1_1, train_x_1_2, train_x_1_3, train_x_1_4, train_x_2_5, train_x_3_5, train_x_2_6, train_x_3_6]
test_set = [test_x_1_1, test_x_1_2, test_x_1_3, test_x_1_4, test_x_2_5, test_x_3_5, test_x_2_6, test_x_3_6]

print('train')
for set in train_set:
    print(set.shape)

print('\n\ntest')
for set in test_set:
    print(set.shape)    

train_x_1 shape : (249, 2877) 
train_x_2 shape : (6, 2877) 
train_x_3 shape : (343, 2877)
train
(59, 2877)
(70, 2877)
(78, 2877)
(42, 2877)
(3, 2877)
(172, 2877)
(3, 2877)
(171, 2877)


test
(14, 2877)
(14, 2877)
(13, 2877)
(26, 2877)
(3, 2877)
(108, 2877)
(1, 2877)
(131, 2877)


#### null - mean

In [103]:
from scipy.stats import zscore

train_set_mean = train_set
test_set_mean = test_set

for set in train_set_mean:
    cols = set.columns
    for col in cols:
        set[col] = set[col].fillna(set[col].mean())

for set in test_set_mean:
    cols = set.columns
    for col in cols:
        set[col] = set[col].fillna(set[col].mean())

train_x = pd.concat(train_set_mean, axis=0).sort_index()
test_x = pd.concat(test_set_mean, axis=0).sort_index()

### 파생변수 생성

In [104]:
"""
new features 
"""

train_x['LINE_PRODUCT_CODE'] = train_x[['LINE','PRODUCT_CODE']].apply(lambda x: '-'.join(x.astype(str)),axis=1)
test_x['LINE_PRODUCT_CODE'] = test_x[['LINE','PRODUCT_CODE']].apply(lambda x: '-'.join(x.astype(str)),axis=1)

train_x.drop(['LINE','PRODUCT_CODE'], axis=1, inplace=True)
test_x.drop(['LINE','PRODUCT_CODE'], axis=1, inplace=True)

In [105]:
from sklearn.preprocessing import LabelEncoder

qual_col = ['LINE_PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])

    for label in np.unique(test_x[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    
    test_x[i] = le.transform(test_x[i])

print('done')

done


### z-score

In [106]:
from scipy.stats import zscore

train_cols = train_x.columns.drop('LINE_PRODUCT_CODE')
test_cols = test_x.columns.drop('LINE_PRODUCT_CODE')

for col in train_cols:
    train_x[col] = zscore(train_x[col])

for col in test_cols:
    test_x[col] = zscore(test_x[col])

### fillna(0)

In [107]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

## Model

### Pytorch1

In [72]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

x_torch = train_x
y_torch = train_y

print(x_torch.shape)
print(y_torch.shape)

(598, 2876)
(598,)


In [73]:
x_columns = x_torch.columns
x = x_torch[x_columns].values
y = np.array(y_torch)

x.shape

(598, 2876)

In [74]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
y = le.fit_transform(y_torch)

products = le.classes_
products

array([0, 1, 2])

In [71]:
import torch
import io
import copy

try:
    COLAB = True
    print("Note: using Google CoLab")
except:
    print("Note: not using Google CoLab")
    COLAB = False

# Early stopping (see Module 3.4)
class EarlyStopping():
  def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
    self.patience = patience
    self.min_delta = min_delta
    self.restore_best_weights = restore_best_weights
    self.best_model = None
    self.best_loss = None
    self.counter = 0
    self.status = ""
    
  def __call__(self, model, val_loss):
    if self.best_loss == None:
      self.best_loss = val_loss
      self.best_model = copy.deepcopy(model)
    elif self.best_loss - val_loss > self.min_delta:
      self.best_loss = val_loss
      self.counter = 0
      self.best_model.load_state_dict(model.state_dict())
    elif self.best_loss - val_loss < self.min_delta:
      self.counter += 1
      if self.counter >= self.patience:
        self.status = f"Stopped on {self.counter}"
        if self.restore_best_weights:
          model.load_state_dict(self.best_model.state_dict())
        return True
    self.status = f"{self.counter}/{self.patience}"
    return False

# Make use of a GPU or MPS (Apple) if one is available. (see Module 3.2)
device = "mps" if getattr(torch,'has_mps',False) \
    else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Note: using Google CoLab
Using device: mps


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
from sklearn import preprocessing
from torch.utils.data import DataLoader, TensorDataset
import tqdm
import time

# Define the PyTorch Neural Network
class Net(nn.Module):
    def __init__(self, in_count, out_count):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(in_count, 50)
        self.fc2 = nn.Linear(50, 25)
        self.fc3 = nn.Linear(25, out_count)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.softmax(self.fc3(x))

# Split into validation and training sets
x_train, x_test, y_train, y_test = train_test_split(    
    x, y, test_size=0.2, random_state=42)

# Numpy to Torch Tensor
x_train = torch.Tensor(x_train).float()
y_train = torch.Tensor(y_train).long()

x_test = torch.Tensor(x_test).float().to(device)
y_test = torch.Tensor(y_test).long().to(device)


# Create datasets

BATCH_SIZE = 1

dataset_train = TensorDataset(x_train, y_train)
dataloader_train = DataLoader(dataset_train,\
  batch_size=BATCH_SIZE, shuffle=True)

dataset_test = TensorDataset(x_test, y_test)
dataloader_test = DataLoader(dataset_test,\
  batch_size=BATCH_SIZE, shuffle=True)


# Create model
model = Net(x.shape[1],len(products)).to(device)

loss_fn = nn.CrossEntropyLoss()# cross entropy loss

optimizer = torch.optim.Adam(model.parameters()) # , lr=0.01
es = EarlyStopping()

epoch = 0
done = False
while epoch<1000 and not done:
  epoch += 1
  steps = list(enumerate(dataloader_train))
  pbar = tqdm.tqdm(steps)
  model.train()
  for i, (x_batch, y_batch) in pbar:
    y_batch_pred = model(x_batch.to(device))
    loss = loss_fn(y_batch_pred, y_batch.to(device))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss, current = loss.item(), (i + 1)* len(x_batch)
    if i == len(steps)-1:
      model.eval()
      pred = model(x_test)
      vloss = loss_fn(pred, y_test)
      if es(model,vloss): done = True
      pbar.set_description(f"Epoch: {epoch}, tloss: {loss}, vloss: {vloss:>7f}, EStop:[{es.status}]")
    else:
      pbar.set_description(f"Epoch: {epoch}, tloss {loss:}")


pred = model(x_test)
_, predict_classes = torch.max(pred, 1)

predict_classes = predict_classes.tolist()
y_test = y_test.tolist()


In [None]:
from sklearn import metrics

score = metrics.accuracy_score(y_test, predict_classes)
print("Accuracy score: {}".format(score))

Accuracy score: 0.6333333333333333


### Pytorch 2

In [41]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

In [52]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from numpy import genfromtxt

train_x.to_csv('train_torch.csv', index_label=False)


data = genfromtxt('train_torch.csv', delimiter=',', skip_header=1)[:,1:]
input = data
target = np.array(train_y)

print(input.shape)
print(target.shape)

(598, 2876)
(598,)


In [53]:
input = torch.FloatTensor(input)
target = torch.LongTensor(target).squeeze()

print(input.shape)
print(target.shape)

torch.Size([598, 2876])
torch.Size([598])


In [54]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
y = le.fit_transform(train_y)

products = le.classes_
products

array([0, 1, 2])

In [56]:
model = nn.Sequential(nn.Linear(input.shape[1],len(products)),
                      nn.Softmax(dim=1))

pred = model(input)
print('before pred :', pred)
loss = F.cross_entropy(pred, target)
print('before loss :', loss)
optimizer = optim.SGD(model.parameters(), lr=0.1)
epoches = 1000

for epoch in range(epoches + 1):
    pred = model(input)

    loss = F.cross_entropy(pred, target=target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

pred = model(input)
print('after pred :', pred)
loss = F.cross_entropy(pred, target)
print('after loss :', loss)
print(list(model.parameters()))

before pred : tensor([[0.3372, 0.3335, 0.3293],
        [0.3379, 0.3330, 0.3291],
        [0.3372, 0.3335, 0.3293],
        ...,
        [0.3372, 0.3335, 0.3293],
        [0.3386, 0.3325, 0.3288],
        [0.3400, 0.3316, 0.3284]], grad_fn=<SoftmaxBackward0>)
before loss : tensor(1.0994, grad_fn=<NllLossBackward0>)
after pred : tensor([[3.5530e-02, 9.0778e-01, 5.6694e-02],
        [1.4328e-02, 9.5916e-01, 2.6510e-02],
        [3.5530e-02, 9.0778e-01, 5.6694e-02],
        ...,
        [3.5530e-02, 9.0778e-01, 5.6694e-02],
        [5.6006e-03, 9.8238e-01, 1.2016e-02],
        [8.2772e-04, 9.9678e-01, 2.3876e-03]], grad_fn=<SoftmaxBackward0>)
after loss : tensor(0.8729, grad_fn=<NllLossBackward0>)
[Parameter containing:
tensor([[ 1.5704e-02, -1.2264e-02,  1.5727e-02,  ...,  1.6666e-02,
         -4.9007e-04, -3.7153e-01],
        [-6.1633e-03, -1.5590e-03,  9.1266e-03,  ..., -1.5089e-02,
         -1.2696e-02,  5.9172e-01],
        [ 5.2213e-03,  1.4187e-02,  1.2306e-02,  ...,  1.4734e-02,


#### Iris

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from numpy import genfromtxt

data = genfromtxt('./IRIS_tiny.csv', delimiter=',', skip_header=1)
input = data[:, 0:4]
target = data[:, -1:]

print(input.shape)
print(target.shape)

(30, 4)
(30, 1)


In [49]:
input = torch.FloatTensor(input)
target = torch.LongTensor(target).squeeze()

print(input.shape)
print(target.shape)

torch.Size([30, 4])
torch.Size([30])


In [97]:
input = torch.FloatTensor(input)
target = torch.LongTensor(target).squeeze()

model = nn.Sequential(nn.Linear(4,3),
                      nn.Softmax(dim=1))

pred = model(input)
print('before pred :', pred)
loss = F.cross_entropy(pred, target)
print('before loss :', loss)
optimizer = optim.SGD(model.parameters(), lr=0.1)
epoches = 100

for epoch in range(epoches + 1):
    pred = model(input)

    loss = F.cross_entropy(pred, target=target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

pred = model(input)
print('after pred :', pred)
loss = F.cross_entropy(pred, target)
print('after loss :', loss)
print(list(model.parameters()))

before pred : tensor([[0.1395, 0.3082, 0.5523],
        [0.1371, 0.3188, 0.5441],
        [0.1474, 0.3176, 0.5350],
        [0.1521, 0.3189, 0.5289],
        [0.1453, 0.3073, 0.5473],
        [0.1315, 0.2775, 0.5910],
        [0.1527, 0.3044, 0.5429],
        [0.1426, 0.3102, 0.5471],
        [0.1545, 0.3248, 0.5207],
        [0.1445, 0.3265, 0.5290],
        [0.0688, 0.1698, 0.7615],
        [0.0779, 0.1698, 0.7523],
        [0.0684, 0.1640, 0.7676],
        [0.0926, 0.2068, 0.7005],
        [0.0720, 0.1721, 0.7559],
        [0.0985, 0.1974, 0.7041],
        [0.0798, 0.1623, 0.7580],
        [0.1167, 0.2413, 0.6419],
        [0.0769, 0.1854, 0.7377],
        [0.1022, 0.1979, 0.6999],
        [0.0613, 0.1051, 0.8336],
        [0.0784, 0.1511, 0.7705],
        [0.0539, 0.1215, 0.8246],
        [0.0758, 0.1499, 0.7743],
        [0.0618, 0.1217, 0.8164],
        [0.0493, 0.1157, 0.8350],
        [0.1025, 0.1781, 0.7194],
        [0.0594, 0.1380, 0.8026],
        [0.0644, 0.1484, 0.7872],


### torch3

In [121]:
from sklearn.model_selection import train_test_split

x_t, x_v, y_t, y_v = train_test_split(train_x, train_y, test_size=0.2, random_state=42)



dataframe = x_t
dataframe['target'] = y_t

print(dataframe.shape)

(478, 2877)


In [122]:
import torch
import torch.nn as nn
from torch.optim.adam import Adam

# 모델 정의
model = nn.Sequential(
    nn.Linear(dataframe.shape[1]-1, 100),
    nn.ReLU(),
    nn.Linear(100,1)
)

X = dataframe.iloc[:, :-1].values
Y = dataframe.iloc[:, -1].values

batch_size = 16
learning_rate = 0.001

# 가중치 수정하는 최적화함수 정의
optim = Adam(model.parameters(), lr=learning_rate)

# epoch 반복
epoches = 100

for epoch in range(epoches):

    # 배치 반복
    for i in range(len(x) // batch_size):
        start = i * batch_size
        end = start + batch_size

        # 파이토치 실수형 텐서로 변환
        x = torch.FloatTensor(X[start:end])
        y = torch.FloatTensor(Y[start:end])

        optim.zero_grad()
        preds = model(x)
        loss = nn.MSELoss()(preds, y)
        loss.backward()
        optim.step()


    if epoch % 1 == 0:
        print(f"epoch:{epoch},  loss:{loss.item()}")

epoch:0,  loss:0.1875
epoch:1,  loss:0.1875
epoch:2,  loss:0.1875
epoch:3,  loss:0.1875
epoch:4,  loss:0.1875
epoch:5,  loss:0.1875
epoch:6,  loss:0.1875
epoch:7,  loss:0.1875
epoch:8,  loss:0.1875
epoch:9,  loss:0.1875
epoch:10,  loss:0.1875
epoch:11,  loss:0.1875
epoch:12,  loss:0.1875
epoch:13,  loss:0.1875
epoch:14,  loss:0.1875
epoch:15,  loss:0.1875
epoch:16,  loss:0.1875
epoch:17,  loss:0.1875
epoch:18,  loss:0.1875
epoch:19,  loss:0.1875
epoch:20,  loss:0.1875
epoch:21,  loss:0.1875
epoch:22,  loss:0.1875
epoch:23,  loss:0.1875
epoch:24,  loss:0.1875
epoch:25,  loss:0.1875
epoch:26,  loss:0.1875
epoch:27,  loss:0.1875
epoch:28,  loss:0.1875
epoch:29,  loss:0.1875
epoch:30,  loss:0.1875
epoch:31,  loss:0.1875
epoch:32,  loss:0.1875
epoch:33,  loss:0.1875
epoch:34,  loss:0.1875
epoch:35,  loss:0.1875
epoch:36,  loss:0.1875
epoch:37,  loss:0.1875
epoch:38,  loss:0.1875
epoch:39,  loss:0.1875
epoch:40,  loss:0.1875
epoch:41,  loss:0.1875
epoch:42,  loss:0.1875
epoch:43,  loss:0.187

In [120]:
prediction = model(torch.FloatTensor(X[0, :-1]))
real = Y[0]

Y[0]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x2875 and 2876x100)

In [None]:
from torch.utils.data.dataloader import DataLoader


## submit

In [18]:
from xgboost import XGBClassifier # 회귀트리

xgb = XGBClassifier()
xgb.fit(train_x, train_y)
pred = xgb.predict(test_x)


submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('XGBoost_notnull_submission.csv', index=False)



In [8]:
from xgboost import XGBRFClassifier

xgb = XGBRFClassifier()
xgb.fit(train_x, train_y)
pred = xgb.predict(test_x)


submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] = pred
submit_csv.to_csv('XGBoost_notnull_XGBRFC.csv', index=False)