# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train = pd.read_csv(os.path.join(ROOT_DIR, "train_junhyeok.csv"))
test = pd.read_csv('./data/test_junhyeok.csv')

### 필요하거나 묶일 수 있는 변수 가져오기

In [3]:
# 이 셀은 한번만 클릭
train['Equipment_Dam'] = train['Equipment_Dam'].str.slice(15, 16)
train['Equipment_Fill1'] = train['Equipment_Fill1'].str.slice(17, 18)
train['Equipment_Fill2'] = train['Equipment_Fill2'].str.slice(17, 18)

test['Equipment_Dam'] = test['Equipment_Dam'].str.slice(15, 16)
test['Equipment_Fill1'] = test['Equipment_Fill1'].str.slice(17, 18)
test['Equipment_Fill2'] = test['Equipment_Fill2'].str.slice(17, 18)

In [4]:
# 새로운 영역 지정
use_train = pd.DataFrame(train['target'], columns = ['target'])
use_test = pd.DataFrame(test['Set ID'], columns = ['Set ID'])

### Inconsistant

In [5]:
# Dam, Fill1, Fill2에서 지정된 값이 다를 경우 Abnormal 
def inconsistant(data, columnname, iwantthiscolumnsname, is_train = True):
    # 장비 번호가 다르면 불일치
    if is_train:
        cri = [
            train[columnname + '_Dam'] != train[columnname + '_Fill1'],
            train[columnname + '_Dam'] != train[columnname + '_Fill2'],
            train[columnname + '_Fill1'] != train[columnname + '_Fill1'],
            data[iwantthiscolumnsname] == 1
        ]
        
    else:
        cri = [
            test[columnname + '_Dam'] != test[columnname + '_Fill1'],
            test[columnname + '_Dam'] != test[columnname + '_Fill2'],
            test[columnname + '_Fill1'] != test[columnname + '_Fill1'],
            data[iwantthiscolumnsname] == 1
        ]
    con = [1, 1, 1, 1]

    data[iwantthiscolumnsname] = np.select(cri, con, default = 0)

In [6]:
# 불일치 변수
use_train['inconsistant'] = 0
use_test['inconsistant'] = 0

# 기준
columnname = ['Equipment', 'Receip No Collect Result', 'Production Qty Collect Result', 'PalletID Collect Result', ]

# 장착
for i in columnname:
    inconsistant(use_train, i, 'inconsistant', True)
    inconsistant(use_test, i, 'inconsistant', False)

In [7]:
# 시간이 0이하, 900이상인 값은 이상치로 분류
for j in ['Machine Tact time Collect Result_Dam', 'Machine Tact time Collect Result_Fill1', 'Machine Tact time Collect Result_Fill2']:
    cri = [
        train[j] <= 0,
        train[j] > 900
    ]
    cri2 = [
        test[j] <= 0,
        test[j] > 900
    ]
    con = [
        1, 1
    ]
    use_train['inconsistant'] = np.select(cri, con, default = use_train['inconsistant'])
    use_test['inconsistant'] = np.select(cri2, con, default = use_test['inconsistant'])

### Distance of head normal

In [8]:
# 유클리드 거리 계산
def euclide_distance(data, first_dot, second_dot):
    
    # 값 반환
    iwantcalculateeuclidedistance = np.array([0]*len(data))
    
    # 제곱값 더해주기
    for i, j in zip(first_dot, second_dot):
        iwantcalculateeuclidedistance += (data[i] - data[j])**2
        
    # 반환
    return iwantcalculateeuclidedistance**0.5

In [9]:
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'
] 
second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'
]

use_train['dist_head_stage1'] = euclide_distance(train, first_dot, second_dot)
use_test['dist_head_stage1'] = euclide_distance(test, first_dot, second_dot)

In [10]:
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'
] 
second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'
]

use_train['dist_head_stage2'] = euclide_distance(train, first_dot, second_dot)
use_test['dist_head_stage2'] = euclide_distance(test, first_dot, second_dot)

In [11]:
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'
] 
second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'
]

use_train['dist_head_stage3'] = euclide_distance(train, first_dot, second_dot)
use_test['dist_head_stage3'] = euclide_distance(test, first_dot, second_dot)

### adjusted volume

In [12]:
### 각 Stage별 새로운 Resin 값을 제시하기 위한 코드
## train
# Stage별 토출량, 토출 속도, 토출 소요시간 데이터
train['RESIN Predicted_Volume Stage1 Dam'] = train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
train['RESIN Predicted_Volume Stage2 Dam'] = train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
train['RESIN Predicted_Volume Stage3 Dam'] = train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

# Stage별 실제 토출량과 예측 토출량의 비율(조정 계수) 계산
train['Stage1 Scaling_Factor'] = train['Dispense Volume(Stage1) Collect Result_Dam'] / train['RESIN Predicted_Volume Stage1 Dam']
train['Stage2 Scaling_Factor'] = train['Dispense Volume(Stage2) Collect Result_Dam'] / train['RESIN Predicted_Volume Stage2 Dam']
train['Stage3 Scaling_Factor'] = train['Dispense Volume(Stage3) Collect Result_Dam'] / train['RESIN Predicted_Volume Stage3 Dam']

# Stage별 조정된 예측 토출량 계산
use_train['RESIN Adjusted_Predicted_Volume Stage1 Dam'] = train['RESIN Predicted_Volume Stage1 Dam'] * train['Stage1 Scaling_Factor'].mean()
use_train['RESIN Adjusted_Predicted_Volume Stage2 Dam'] = train['RESIN Predicted_Volume Stage2 Dam'] * train['Stage2 Scaling_Factor'].mean()
use_train['RESIN Adjusted_Predicted_Volume Stage3 Dam'] = train['RESIN Predicted_Volume Stage3 Dam'] * train['Stage3 Scaling_Factor'].mean()

## test
# Stage별 토출량, 토출 속도, 토출 소요시간 데이터
test['RESIN Predicted_Volume Stage1 Dam'] = test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
test['RESIN Predicted_Volume Stage2 Dam'] = test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
test['RESIN Predicted_Volume Stage3 Dam'] = test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

# Stage별 실제 토출량과 예측 토출량의 비율(조정 계수) 계산
test['Stage1 Scaling_Factor'] = test['Dispense Volume(Stage1) Collect Result_Dam'] / test['RESIN Predicted_Volume Stage1 Dam']
test['Stage2 Scaling_Factor'] = test['Dispense Volume(Stage2) Collect Result_Dam'] / test['RESIN Predicted_Volume Stage2 Dam']
test['Stage3 Scaling_Factor'] = test['Dispense Volume(Stage3) Collect Result_Dam'] / test['RESIN Predicted_Volume Stage3 Dam']

# Stage별 조정된 예측 토출량 계산
use_test['RESIN Adjusted_Predicted_Volume Stage1 Dam'] = test['RESIN Predicted_Volume Stage1 Dam'] * test['Stage1 Scaling_Factor'].mean()
use_test['RESIN Adjusted_Predicted_Volume Stage2 Dam'] = test['RESIN Predicted_Volume Stage2 Dam'] * test['Stage2 Scaling_Factor'].mean()
use_test['RESIN Adjusted_Predicted_Volume Stage3 Dam'] = test['RESIN Predicted_Volume Stage3 Dam'] * test['Stage3 Scaling_Factor'].mean()

In [13]:
### 각 Stage별 새로운 Resin 값을 제시하기 위한 코드
## train
# Stage별 토출량, 토출 속도, 토출 소요시간 데이터
train['RESIN Predicted_Volume Stage1 Fill1'] = train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
train['RESIN Predicted_Volume Stage2 Fill1'] = train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
train['RESIN Predicted_Volume Stage3 Fill1'] = train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

# Stage별 실제 토출량과 예측 토출량의 비율(조정 계수) 계산
train['Stage1 Scaling_Factor'] = train['Dispense Volume(Stage1) Collect Result_Fill1'] / train['RESIN Predicted_Volume Stage1 Fill1']
train['Stage2 Scaling_Factor'] = train['Dispense Volume(Stage2) Collect Result_Fill1'] / train['RESIN Predicted_Volume Stage2 Fill1']
train['Stage3 Scaling_Factor'] = train['Dispense Volume(Stage3) Collect Result_Fill1'] / train['RESIN Predicted_Volume Stage3 Fill1']

# Stage별 조정된 예측 토출량 계산
use_train['RESIN Adjusted_Predicted_Volume Stage1 Fill1'] = train['RESIN Predicted_Volume Stage1 Fill1'] * train['Stage1 Scaling_Factor'].mean()
use_train['RESIN Adjusted_Predicted_Volume Stage2 Fill1'] = train['RESIN Predicted_Volume Stage2 Fill1'] * train['Stage2 Scaling_Factor'].mean()
use_train['RESIN Adjusted_Predicted_Volume Stage3 Fill1'] = train['RESIN Predicted_Volume Stage3 Fill1'] * train['Stage3 Scaling_Factor'].mean()

## test
# Stage별 토출량, 토출 속도, 토출 소요시간 데이터
test['RESIN Predicted_Volume Stage1 Fill1'] = test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
test['RESIN Predicted_Volume Stage2 Fill1'] = test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
test['RESIN Predicted_Volume Stage3 Fill1'] = test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

# Stage별 실제 토출량과 예측 토출량의 비율(조정 계수) 계산
test['Stage1 Scaling_Factor'] = test['Dispense Volume(Stage1) Collect Result_Fill1'] / test['RESIN Predicted_Volume Stage1 Fill1']
test['Stage2 Scaling_Factor'] = test['Dispense Volume(Stage2) Collect Result_Fill1'] / test['RESIN Predicted_Volume Stage2 Fill1']
test['Stage3 Scaling_Factor'] = test['Dispense Volume(Stage3) Collect Result_Fill1'] / test['RESIN Predicted_Volume Stage3 Fill1']

# Stage별 조정된 예측 토출량 계산
use_test['RESIN Adjusted_Predicted_Volume Stage1 Fill1'] = test['RESIN Predicted_Volume Stage1 Fill1'] * test['Stage1 Scaling_Factor'].mean()
use_test['RESIN Adjusted_Predicted_Volume Stage2 Fill1'] = test['RESIN Predicted_Volume Stage2 Fill1'] * test['Stage2 Scaling_Factor'].mean()
use_test['RESIN Adjusted_Predicted_Volume Stage3 Fill1'] = test['RESIN Predicted_Volume Stage3 Fill1'] * test['Stage3 Scaling_Factor'].mean()

In [14]:
# pressure * unit time
use_train['time_per_pressure_1st'] = train['1st Pressure Collect Result_AutoClave'] * train['1st Pressure 1st Pressure Unit Time_AutoClave']
use_train['time_per_pressure_2nd'] = train['2nd Pressure Collect Result_AutoClave'] * train['2nd Pressure Unit Time_AutoClave']
use_train['time_per_pressure_3rd'] = train['3rd Pressure Collect Result_AutoClave'] * train['3rd Pressure Unit Time_AutoClave']

use_test['time_per_pressure_1st'] = test['1st Pressure Collect Result_AutoClave'] * test['1st Pressure 1st Pressure Unit Time_AutoClave']
use_test['time_per_pressure_2nd'] = test['2nd Pressure Collect Result_AutoClave'] * test['2nd Pressure Unit Time_AutoClave']
use_test['time_per_pressure_3rd'] = test['3rd Pressure Collect Result_AutoClave'] * test['3rd Pressure Unit Time_AutoClave']

### circle, line sum

In [15]:
col = [
    'Stage1 Circle1 Distance Speed Collect Result_Dam',
    'Stage1 Circle2 Distance Speed Collect Result_Dam',
    'Stage1 Circle3 Distance Speed Collect Result_Dam',
    'Stage1 Circle4 Distance Speed Collect Result_Dam'
]

use_train['circle_stage1'] = train[col].sum(axis = 1)
use_test['circle_stage1'] = train[col].sum(axis = 1)

In [16]:
col = [
    'Stage2 Circle1 Distance Speed Collect Result_Dam',
    'Stage2 Circle2 Distance Speed Collect Result_Dam',
    'Stage2 Circle3 Distance Speed Collect Result_Dam',
    'Stage2 Circle4 Distance Speed Collect Result_Dam'
]

use_train['circle_stage2'] = train[col].sum(axis = 1)
use_test['circle_stage2'] = train[col].sum(axis = 1)

In [17]:
col = [
    'Stage3 Circle1 Distance Speed Collect Result_Dam',
    'Stage3 Circle2 Distance Speed Collect Result_Dam',
    'Stage3 Circle3 Distance Speed Collect Result_Dam',
    'Stage3 Circle4 Distance Speed Collect Result_Dam'
]

use_train['circle_stage3'] = train[col].sum(axis = 1)
use_test['circle_stage3'] = train[col].sum(axis = 1)

In [18]:
col = [
    'Stage1 Line1 Distance Speed Collect Result_Dam',
    'Stage1 Line2 Distance Speed Collect Result_Dam',
    'Stage1 Line3 Distance Speed Collect Result_Dam',
    'Stage1 Line4 Distance Speed Collect Result_Dam'
]

use_train['line_stage1'] = train[col].sum(axis = 1)
use_test['line_stage1'] = train[col].sum(axis = 1)

In [19]:
col = [
    'Stage2 Line1 Distance Speed Collect Result_Dam',
    'Stage2 Line2 Distance Speed Collect Result_Dam',
    'Stage2 Line3 Distance Speed Collect Result_Dam',
    'Stage2 Line4 Distance Speed Collect Result_Dam'
]

use_train['line_stage2'] = train[col].sum(axis = 1)
use_test['line_stage2'] = train[col].sum(axis = 1)

In [20]:
col = [
    'Stage3 Line1 Distance Speed Collect Result_Dam',
    'Stage3 Line1 Distance Speed Collect Result_Dam',
    'Stage3 Line1 Distance Speed Collect Result_Dam',
    'Stage3 Line1 Distance Speed Collect Result_Dam'
]

use_train['line_stage3'] = train[col].sum(axis = 1)
use_test['line_stage3'] = train[col].sum(axis = 1)

### machine time - cure time 비율

In [21]:
train['fill_time'] = train['Machine Tact time Collect Result_Fill1'] + train['Machine Tact time Collect Result_Fill2']
use_train['cure_time_ratio_fill'] = round(train['Machine Tact time Collect Result_Fill2'] / train['fill_time'], 2)

test['fill_time'] = test['Machine Tact time Collect Result_Fill1'] + test['Machine Tact time Collect Result_Fill2']
use_test['cure_time_ratio_fill'] = round(test['Machine Tact time Collect Result_Fill2'] / test['fill_time'], 2)

In [22]:
use_train['cure_time_ratio_dam'] = round((train['Machine Tact time Collect Result_Dam'] - train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] - train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] - train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'])/train['Machine Tact time Collect Result_Dam'], 2)
use_test['cure_time_ratio_dam'] = round((test['Machine Tact time Collect Result_Dam'] - test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] - test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] - test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'])/test['Machine Tact time Collect Result_Dam'], 2)

### workorder, receip 합체

In [23]:
# workorder별로 불량률 계산
a = pd.crosstab([train['Workorder_Dam'], train['Receip No Collect Result_Dam']], train['target']).reset_index()
a['total'] = a['AbNormal'] + a['Normal']
a['ratio'] = a['AbNormal'] / (a['total'])

In [24]:
# 기준
cri = [
    (a['ratio'] == 0),
    (a['ratio'] >= 0.076923)
]

con = [
    -1, 1
]

a['select_workorder'] = np.select(cri, con, default = 0)

In [25]:
# 결합 시킬 변수만 뽑기
a2 = a[['Workorder_Dam', 'Receip No Collect Result_Dam', 'select_workorder']]

In [26]:
# receip과 workorder 혼합
train = pd.merge(train, a2, how = 'left', on = ['Workorder_Dam', 'Receip No Collect Result_Dam'])
test = pd.merge(test, a2, how = 'left', on = ['Workorder_Dam', 'Receip No Collect Result_Dam'])

In [27]:
# train에 정보가 없는 test는 기본 값인 0으로 채우기
test['select_workorder'][test['select_workorder'].isna()] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['select_workorder'][test['select_workorder'].isna()] = 0


In [28]:
# 대입 시키기
use_train['select_workorder'] = train['select_workorder']
use_test['select_workorder'] = test['select_workorder']

### 그대로 가져올 변수

In [29]:
col = [
    'CURE SPEED Collect Result_Dam',
    'CURE SPEED Collect Result_Fill2',
    'PalletID Collect Result_Dam',
    'Production Qty Collect Result_Dam',
    'Chamber Temp. Collect Result_AutoClave',
    'DISCHARGED SPEED OF RESIN Collect Result_Dam',
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
    'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
    'Dispense Volume(Stage1) Collect Result_Dam',
    'Dispense Volume(Stage2) Collect Result_Dam',
    'Dispense Volume(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',
    'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
    'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage2) Collect Result_Fill1',
    'Dispense Volume(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
    'THICKNESS 1 Collect Result_Dam',
    'THICKNESS 2 Collect Result_Dam',
    'THICKNESS 3 Collect Result_Dam',
    '1st Pressure Collect Result_AutoClave',
    '1st Pressure 1st Pressure Unit Time_AutoClave',
    '2nd Pressure Collect Result_AutoClave',
    '2nd Pressure Unit Time_AutoClave',
    '3rd Pressure Collect Result_AutoClave',
    '3rd Pressure Unit Time_AutoClave'
]

use_train[col] = train[col]
use_test[col] = test[col]

### LabelEncoder

In [30]:
# modelsuffix
le = LabelEncoder()

In [31]:
# 변환
use_train['model_suffix'] = le.fit_transform(train['Model.Suffix_Dam'])
use_test['model_suffix'] = le.transform(test['Model.Suffix_Dam'])

### Setting

In [32]:
# 1, 0 으로 바꿔주기
use_train['target'] = np.where(train['target'] == 'Normal', 0, 1)

In [33]:
# set id 추가
use_test['Set ID'] = test['Set ID']

### Save

In [34]:
# 저장
use_train.to_csv('./data/train_0818_variable.csv', index = False)
use_test.to_csv('./data/test_0818_variable.csv', index = False)