# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train = pd.read_csv(os.path.join(ROOT_DIR, "train_junhyeok.csv"))
test = pd.read_csv('./data/test_junhyeok.csv')

### 필요하거나 묶일 수 있는 변수 가져오기

In [4]:
# 이 셀은 한번만 클릭
train['Equipment_Dam'] = train['Equipment_Dam'].str.slice(15, 16)
train['Equipment_Fill1'] = train['Equipment_Fill1'].str.slice(17, 18)
train['Equipment_Fill2'] = train['Equipment_Fill2'].str.slice(17, 18)

test['Equipment_Dam'] = test['Equipment_Dam'].str.slice(15, 16)
test['Equipment_Fill1'] = test['Equipment_Fill1'].str.slice(17, 18)
test['Equipment_Fill2'] = test['Equipment_Fill2'].str.slice(17, 18)

In [5]:
# 새로운 영역 지정
use_train = pd.DataFrame(train['target'], columns = ['target'])
use_test = pd.DataFrame(test['Set ID'], columns = ['Set ID'])

In [6]:
# Dam, Fill1, Fill2에서 지정된 값이 다를 경우 Abnormal 
def inconsistant(data, columnname, iwantthiscolumnsname, is_train = True):
    # 장비 번호가 다르면 불일치
    if is_train:
        cri = [
            train[columnname + '_Dam'] != train[columnname + '_Fill1'],
            train[columnname + '_Dam'] != train[columnname + '_Fill2'],
            train[columnname + '_Fill1'] != train[columnname + '_Fill1'],
            data[iwantthiscolumnsname] == 1
        ]
        
    else:
        cri = [
            test[columnname + '_Dam'] != test[columnname + '_Fill1'],
            test[columnname + '_Dam'] != test[columnname + '_Fill2'],
            test[columnname + '_Fill1'] != test[columnname + '_Fill1'],
            data[iwantthiscolumnsname] == 1
        ]
    con = [1, 1, 1, 1]

    data[iwantthiscolumnsname] = np.select(cri, con, default = 0)

In [7]:
# 불일치 변수
use_train['inconsistant'] = 0
use_test['inconsistant'] = 0

# 기준
columnname = ['Equipment', 'Receip No Collect Result', 'Production Qty Collect Result', 'PalletID Collect Result', ]

# 장착
for i in columnname:
    inconsistant(use_train, i, 'inconsistant', True)
    inconsistant(use_test, i, 'inconsistant', False)

In [8]:
# 시간이 0이하, 900이상인 값은 이상치로 분류
for j in ['Machine Tact time Collect Result_Dam', 'Machine Tact time Collect Result_Fill1', 'Machine Tact time Collect Result_Fill2']:
    cri = [
        train[j] <= 0,
        train[j] > 900
    ]
    cri2 = [
        test[j] <= 0,
        test[j] > 900
    ]
    con = [
        1, 1
    ]
    use_train['inconsistant'] = np.select(cri, con, default = use_train['inconsistant'])
    use_test['inconsistant'] = np.select(cri2, con, default = use_test['inconsistant'])

In [9]:
# 유클리드 거리 계산
def euclide_distance(data, first_dot, second_dot):
    
    # 값 반환
    iwantcalculateeuclidedistance = np.array([0]*len(data))
    
    # 제곱값 더해주기
    for i, j in zip(first_dot, second_dot):
        iwantcalculateeuclidedistance += (data[i] - data[j])**2
        
    # 반환
    return iwantcalculateeuclidedistance**0.5

### stage by head

In [10]:
# dam , fill1
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_dam_fill1_stage1'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_dam_fill1_stage1'] = euclide_distance(test, first_dot, second_dot)

In [11]:
# fill1, fill2
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_dam_fill2_stage1'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_dam_fill2_stage1'] = euclide_distance(test, first_dot, second_dot)

In [12]:
# dam fill2
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_fill1_fill2_stage1'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_fill1_fill2_stage1'] = euclide_distance(test, first_dot, second_dot)

In [13]:
# 거리 범주를 반영시키자
cri = [
    (train['distance_diff_dam_fill1_stage1'] < train['distance_diff_dam_fill2_stage1']) & (train['distance_diff_dam_fill2_stage1'] < train['distance_diff_fill1_fill2_stage1']),
    (train['distance_diff_dam_fill1_stage1'] < train['distance_diff_fill1_fill2_stage1']) & (train['distance_diff_fill1_fill2_stage1'] < train['distance_diff_dam_fill2_stage1']),
    (train['distance_diff_dam_fill2_stage1'] < train['distance_diff_dam_fill1_stage1']) & (train['distance_diff_dam_fill1_stage1'] < train['distance_diff_fill1_fill2_stage1']),
    (train['distance_diff_fill1_fill2_stage1'] < train['distance_diff_dam_fill2_stage1']) & (train['distance_diff_dam_fill2_stage1'] < train['distance_diff_dam_fill1_stage1']),
    (train['distance_diff_dam_fill2_stage1'] < train['distance_diff_fill1_fill2_stage1']) & (train['distance_diff_dam_fill1_stage1'] > train['distance_diff_fill1_fill2_stage1']),
    (train['distance_diff_fill1_fill2_stage1'] < train['distance_diff_dam_fill1_stage1']) & (train['distance_diff_dam_fill2_stage1'] > train['distance_diff_dam_fill1_stage1'])
]
cri2 = [
    (test['distance_diff_dam_fill1_stage1'] < test['distance_diff_dam_fill2_stage1']) & (test['distance_diff_dam_fill2_stage1'] < test['distance_diff_fill1_fill2_stage1']),
    (test['distance_diff_dam_fill1_stage1'] < test['distance_diff_fill1_fill2_stage1']) & (test['distance_diff_fill1_fill2_stage1'] < test['distance_diff_dam_fill2_stage1']),
    (test['distance_diff_dam_fill2_stage1'] < test['distance_diff_dam_fill1_stage1']) & (test['distance_diff_dam_fill1_stage1'] < test['distance_diff_fill1_fill2_stage1']),
    (test['distance_diff_fill1_fill2_stage1'] < test['distance_diff_dam_fill2_stage1']) & (test['distance_diff_dam_fill2_stage1'] < test['distance_diff_dam_fill1_stage1']),
    (test['distance_diff_dam_fill2_stage1'] < test['distance_diff_fill1_fill2_stage1']) & (test['distance_diff_dam_fill1_stage1'] > test['distance_diff_fill1_fill2_stage1']),
    (test['distance_diff_fill1_fill2_stage1'] < test['distance_diff_dam_fill1_stage1']) & (test['distance_diff_dam_fill2_stage1'] > test['distance_diff_dam_fill1_stage1'])
]
con = [
    1, 2, 3, 4, 5, 6
]
use_train['stage1_dist_cat'] = np.select(cri, con, default = 0)
use_test['stage1_dist_cat'] = np.select(cri2, con, default = 0)

In [14]:
# dam , fill1
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_dam_fill1_stage2'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_dam_fill1_stage2'] = euclide_distance(test, first_dot, second_dot)

In [15]:
# fill1, fill2
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_dam_fill2_stage2'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_dam_fill2_stage2'] = euclide_distance(test, first_dot, second_dot)

In [16]:
# dam fill2
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_fill1_fill2_stage2'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_fill1_fill2_stage2'] = euclide_distance(test, first_dot, second_dot)

In [17]:
# 거리 범주를 반영시키자
cri = [
    (train['distance_diff_dam_fill1_stage2'] < train['distance_diff_dam_fill2_stage2']) & (train['distance_diff_dam_fill2_stage2'] < train['distance_diff_fill1_fill2_stage2']),
    (train['distance_diff_dam_fill1_stage2'] < train['distance_diff_fill1_fill2_stage2']) & (train['distance_diff_fill1_fill2_stage2'] < train['distance_diff_dam_fill2_stage2']),
    (train['distance_diff_dam_fill2_stage2'] < train['distance_diff_dam_fill1_stage2']) & (train['distance_diff_dam_fill1_stage2'] < train['distance_diff_fill1_fill2_stage2']),
    (train['distance_diff_fill1_fill2_stage2'] < train['distance_diff_dam_fill2_stage2']) & (train['distance_diff_dam_fill2_stage2'] < train['distance_diff_dam_fill1_stage2']),
    (train['distance_diff_dam_fill2_stage2'] < train['distance_diff_fill1_fill2_stage2']) & (train['distance_diff_dam_fill1_stage2'] > train['distance_diff_fill1_fill2_stage2']),
    (train['distance_diff_fill1_fill2_stage2'] < train['distance_diff_dam_fill1_stage2']) & (train['distance_diff_dam_fill2_stage2'] > train['distance_diff_dam_fill1_stage2'])
]
cri2 = [
    (test['distance_diff_dam_fill1_stage2'] < test['distance_diff_dam_fill2_stage2']) & (test['distance_diff_dam_fill2_stage2'] < test['distance_diff_fill1_fill2_stage2']),
    (test['distance_diff_dam_fill1_stage2'] < test['distance_diff_fill1_fill2_stage2']) & (test['distance_diff_fill1_fill2_stage2'] < test['distance_diff_dam_fill2_stage2']),
    (test['distance_diff_dam_fill2_stage2'] < test['distance_diff_dam_fill1_stage2']) & (test['distance_diff_dam_fill1_stage2'] < test['distance_diff_fill1_fill2_stage2']),
    (test['distance_diff_fill1_fill2_stage2'] < test['distance_diff_dam_fill2_stage2']) & (test['distance_diff_dam_fill2_stage2'] < test['distance_diff_dam_fill1_stage2']),
    (test['distance_diff_dam_fill2_stage2'] < test['distance_diff_fill1_fill2_stage2']) & (test['distance_diff_dam_fill1_stage2'] > test['distance_diff_fill1_fill2_stage2']),
    (test['distance_diff_fill1_fill2_stage2'] < test['distance_diff_dam_fill1_stage2']) & (test['distance_diff_dam_fill2_stage2'] > test['distance_diff_dam_fill1_stage2'])
]
con = [
    1, 2, 3, 4, 5, 6
]
use_train['stage2_dist_cat'] = np.select(cri, con, default = 0)
use_test['stage2_dist_cat'] = np.select(cri2, con, default = 0)

In [18]:
# dam , fill1
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_dam_fill1_stage3'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_dam_fill1_stage3'] = euclide_distance(test, first_dot, second_dot)

In [19]:
# fill1, fill2
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_dam_fill2_stage3'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_dam_fill2_stage3'] = euclide_distance(test, first_dot, second_dot)

In [20]:
# dam fill2
first_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'
]

second_dot = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['distance_diff_fill1_fill2_stage3'] = euclide_distance(train, first_dot, second_dot)
test['distance_diff_fill1_fill2_stage3'] = euclide_distance(test, first_dot, second_dot)

In [21]:
# 거리 범주를 반영시키자
cri = [
    (train['distance_diff_dam_fill1_stage3'] < train['distance_diff_dam_fill2_stage3']) & (train['distance_diff_dam_fill2_stage3'] < train['distance_diff_fill1_fill2_stage3']),
    (train['distance_diff_dam_fill1_stage3'] < train['distance_diff_fill1_fill2_stage3']) & (train['distance_diff_fill1_fill2_stage3'] < train['distance_diff_dam_fill2_stage3']),
    (train['distance_diff_dam_fill2_stage3'] < train['distance_diff_dam_fill1_stage3']) & (train['distance_diff_dam_fill1_stage3'] < train['distance_diff_fill1_fill2_stage3']),
    (train['distance_diff_fill1_fill2_stage3'] < train['distance_diff_dam_fill2_stage3']) & (train['distance_diff_dam_fill2_stage3'] < train['distance_diff_dam_fill1_stage3']),
    (train['distance_diff_dam_fill2_stage3'] < train['distance_diff_fill1_fill2_stage3']) & (train['distance_diff_dam_fill1_stage3'] > train['distance_diff_fill1_fill2_stage3']),
    (train['distance_diff_fill1_fill2_stage3'] < train['distance_diff_dam_fill1_stage3']) & (train['distance_diff_dam_fill2_stage3'] > train['distance_diff_dam_fill1_stage3'])
]
cri2 = [
    (test['distance_diff_dam_fill1_stage3'] < test['distance_diff_dam_fill2_stage3']) & (test['distance_diff_dam_fill2_stage3'] < test['distance_diff_fill1_fill2_stage3']),
    (test['distance_diff_dam_fill1_stage3'] < test['distance_diff_fill1_fill2_stage3']) & (test['distance_diff_fill1_fill2_stage3'] < test['distance_diff_dam_fill2_stage3']),
    (test['distance_diff_dam_fill2_stage3'] < test['distance_diff_dam_fill1_stage3']) & (test['distance_diff_dam_fill1_stage3'] < test['distance_diff_fill1_fill2_stage3']),
    (test['distance_diff_fill1_fill2_stage3'] < test['distance_diff_dam_fill2_stage3']) & (test['distance_diff_dam_fill2_stage3'] < test['distance_diff_dam_fill1_stage3']),
    (test['distance_diff_dam_fill2_stage3'] < test['distance_diff_fill1_fill2_stage3']) & (test['distance_diff_dam_fill1_stage3'] > test['distance_diff_fill1_fill2_stage3']),
    (test['distance_diff_fill1_fill2_stage3'] < test['distance_diff_dam_fill1_stage3']) & (test['distance_diff_dam_fill2_stage3'] > test['distance_diff_dam_fill1_stage3'])
]
con = [
    1, 2, 3, 4, 5, 6
]
use_train['stage3_dist_cat'] = np.select(cri, con, default = 0)
use_test['stage3_dist_cat'] = np.select(cri2, con, default = 0)

### cure

In [22]:
# standby, start
first_dot = [
    'CURE STANDBY POSITION X Collect Result_Dam',
    'CURE STANDBY POSITION Z Collect Result_Dam'
]

second_dot = [
    'CURE START POSITION X Collect Result_Dam',
    'CURE START POSITION Z Collect Result_Dam'
]

# 유클리드 계산을 통한 위치 차이
train['dist_standby_start_cure_dam'] = euclide_distance(train, first_dot, second_dot)
test['dist_standby_start_cure_dam'] = euclide_distance(test, first_dot, second_dot)

In [23]:
# start, end
first_dot = [
    'CURE START POSITION X Collect Result_Dam',
    'CURE START POSITION Z Collect Result_Dam'
]

second_dot = [
    'CURE END POSITION X Collect Result_Dam',
    'CURE END POSITION Z Collect Result_Dam'
]

# 유클리드 계산을 통한 위치 차이
train['dist_start_end_cure_dam'] = euclide_distance(train, first_dot, second_dot)
test['dist_start_end_cure_dam'] = euclide_distance(test, first_dot, second_dot)

In [24]:
# standby, end
first_dot = [
    'CURE STANDBY POSITION X Collect Result_Dam',
    'CURE STANDBY POSITION Z Collect Result_Dam'
]

second_dot = [
    'CURE END POSITION X Collect Result_Dam',
    'CURE END POSITION Z Collect Result_Dam'
]

# 유클리드 계산을 통한 위치 차이
train['dist_standby_end_cure_dam'] = euclide_distance(train, first_dot, second_dot)
test['dist_standby_end_cure_dam'] = euclide_distance(test, first_dot, second_dot)

In [25]:
# cure_time
use_train['cure_time_dam'] = train['dist_start_end_cure_dam'] / train['CURE SPEED Collect Result_Dam']
use_test['cure_time_dam'] = test['dist_start_end_cure_dam'] / test['CURE SPEED Collect Result_Dam']

In [26]:
# standby, start
first_dot = [
    'CURE STANDBY POSITION X Collect Result_Fill2',
    'CURE STANDBY POSITION Z Collect Result_Fill2'
]

second_dot = [
    'CURE START POSITION X Collect Result_Fill2',
    'CURE START POSITION Z Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['dist_standby_start_cure_fill2'] = euclide_distance(train, first_dot, second_dot)
test['dist_standby_start_cure_fill2'] = euclide_distance(test, first_dot, second_dot)

In [27]:
# start, end
first_dot = [
    'CURE START POSITION X Collect Result_Fill2',
    'CURE START POSITION Z Collect Result_Fill2'
]

second_dot = [
    'CURE END POSITION X Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['dist_start_end_cure_fill2'] = euclide_distance(train, first_dot, second_dot)
test['dist_start_end_cure_fill2'] = euclide_distance(test, first_dot, second_dot)

In [28]:
# standby, end
first_dot = [
    'CURE STANDBY POSITION X Collect Result_Fill2',
    'CURE STANDBY POSITION Z Collect Result_Fill2'
]

second_dot = [
    'CURE END POSITION X Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2'
]

# 유클리드 계산을 통한 위치 차이
train['dist_standby_end_cure_fill2'] = euclide_distance(train, first_dot, second_dot)
test['dist_standby_end_cure_fill2'] = euclide_distance(test, first_dot, second_dot)

In [29]:
# cure_time
use_train['cure_time_fill2'] = train['dist_start_end_cure_fill2'] / train['CURE SPEED Collect Result_Fill2']
use_test['cure_time_fill2'] = test['dist_start_end_cure_fill2'] / test['CURE SPEED Collect Result_Fill2']

In [30]:
# press21
use_train['press21_autoclave'] = np.where(train['2nd Pressure Collect Result_AutoClave'] - train['1st Pressure Collect Result_AutoClave'] > 0.3, 1, 0)
use_test['press21_autoclave'] = np.where(test['2nd Pressure Collect Result_AutoClave'] - test['1st Pressure Collect Result_AutoClave'] > 0.3, 1, 0)

In [31]:
# thon time
train['thon_time'] = train['Machine Tact time Collect Result_Dam'] - train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] - train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] - train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']
test['thon_time'] = test['Machine Tact time Collect Result_Dam'] - test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] - test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] - test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

In [32]:
# 75% 이상이면 1 아니면 0
use_train['thon_time'] = np.where(train['thon_time'] >= 38.8, 1, 0)
use_test['thon_time'] = np.where(test['thon_time'] >= 38.8, 1, 0)

In [33]:
# Discharged 거리
use_train['discharged_dist_stage1_dam'] = train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
use_train['discharged_dist_stage2_dam'] = train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
use_train['discharged_dist_stage3_dam'] = train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

use_test['discharged_dist_stage1_dam'] = test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
use_test['discharged_dist_stage2_dam'] = test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
use_test['discharged_dist_stage3_dam'] = test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

In [34]:
# Discharged 거리
use_train['discharged_dist_stage1_fill1'] = train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
use_train['discharged_dist_stage2_fill1'] = train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
use_train['discharged_dist_stage3_fill1'] = train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

use_test['discharged_dist_stage1_fill1'] = test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
use_test['discharged_dist_stage2_fill1'] = test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
use_test['discharged_dist_stage3_fill1'] = test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

In [35]:
# 기존 정보
use_train[['fill2_time', 'autoclave_time', 'qty', 'thick1', 'thick2', 'thick3']] = train[[
                                                            'Machine Tact time Collect Result_Fill2',
                                                            'Chamber Temp. Unit Time_AutoClave',
                                                            'Production Qty Collect Result_Dam',
                                                            'THICKNESS 1 Collect Result_Dam',
                                                            'THICKNESS 2 Collect Result_Dam',
                                                            'THICKNESS 3 Collect Result_Dam'
]]
use_test[['fill2_time', 'autoclave_time', 'qty', 'thick1', 'thick2', 'thick3']] = test[[
                                                          'Machine Tact time Collect Result_Fill2',
                                                          'Chamber Temp. Unit Time_AutoClave',
                                                          'Production Qty Collect Result_Dam',
                                                            'THICKNESS 1 Collect Result_Dam',
                                                            'THICKNESS 2 Collect Result_Dam',
                                                            'THICKNESS 3 Collect Result_Dam']]

In [36]:
# 챔버 온도 영역 세개로 나누기
cri = [
    (train['Chamber Temp. Collect Result_AutoClave'] < 50) & (train['Chamber Temp. Judge Value_AutoClave'] == 'OK'),
    (train['Chamber Temp. Collect Result_AutoClave'] < 50) & (train['Chamber Temp. Judge Value_AutoClave'] == 'NG')
]
cri2 = [
    (test['Chamber Temp. Collect Result_AutoClave'] < 50) & (test['Chamber Temp. Judge Value_AutoClave'] == 'OK'),
    (test['Chamber Temp. Collect Result_AutoClave'] < 50) & (test['Chamber Temp. Judge Value_AutoClave'] == 'NG')
]
con = [
    1, 2
]

use_train['chamber_temp'] = np.select(cri, con, default = 0)
use_test['chamber_temp'] = np.select(cri2, con, default = 0)

In [37]:
# modelsuffix
le = LabelEncoder()

In [38]:
# 변환
use_train['model_suffix'] = le.fit_transform(train['Model.Suffix_Dam'])
use_test['model_suffix'] = le.transform(test['Model.Suffix_Dam'])

In [39]:
# 1, 0 으로 바꿔주기
use_train['target'] = np.where(train['target'] == 'Normal', 0, 1)

In [40]:
# workorder별로 불량률 계산
a = pd.crosstab([train['Workorder_Dam'], train['Receip No Collect Result_Dam']], train['target']).reset_index()
a['total'] = a['AbNormal'] + a['Normal']
a['ratio'] = a['AbNormal'] / (a['total'])

In [41]:
# 기준
cri = [
    (a['ratio'] == 0),
    (a['ratio'] > 0.1)
]

con = [
    -1, 1
]

a['select_workorder'] = np.select(cri, con, default = 0)

In [42]:
# 결합 시킬 변수만 뽑기
a2 = a[['Workorder_Dam', 'Receip No Collect Result_Dam', 'select_workorder']]

In [43]:
# receip과 workorder 혼합
train = pd.merge(train, a2, how = 'left', on = ['Workorder_Dam', 'Receip No Collect Result_Dam'])
test = pd.merge(test, a2, how = 'left', on = ['Workorder_Dam', 'Receip No Collect Result_Dam'])

In [44]:
# train에 정보가 없는 test는 기본 값인 0으로 채우기
test['select_workorder'][test['select_workorder'].isna()] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['select_workorder'][test['select_workorder'].isna()] = 0


In [45]:
# 대입 시키기
use_train['select_workorder'] = train['select_workorder']
use_test['select_workorder'] = test['select_workorder']

In [46]:
# pallet별 불량률 계산
a = pd.crosstab(train['PalletID Collect Result_Dam'], train['target']).reset_index()
a['total'] = a['AbNormal'] + a['Normal']
a['ratio'] = a['AbNormal'] / (a['total'])

In [47]:
# 기준
a['select_pallet'] = np.where(a['ratio'] >= 0.056903, 1, 0)

In [48]:
# 결합 시킬 변수만 뽑기
a2 = a[['select_pallet', 'PalletID Collect Result_Dam']]

In [49]:
# receip과 workorder 혼합
train = pd.merge(train, a2, how = 'left', on = ['PalletID Collect Result_Dam'])
test = pd.merge(test, a2, how = 'left', on = ['PalletID Collect Result_Dam'])

In [50]:
# 대입 시키기
use_train['select_pallet'] = train['select_pallet']
use_test['select_pallet'] = test['select_pallet']

In [51]:
# 경화 방향이 dam, fill2에서 같은지 다른지
train['cure_direction_dam'] = np.where(train['CURE END POSITION X Collect Result_Dam'] == 240, 1, -1)
train['cure_direction_fill2'] = np.where(train['CURE END POSITION X Collect Result_Fill2'] == 240, 1, -1)
use_train['direction'] = np.where(train['cure_direction_dam'] == train['cure_direction_fill2'], 1, -1)

test['cure_direction_dam'] = np.where(test['CURE END POSITION X Collect Result_Dam'] == 240, 1, -1)
test['cure_direction_fill2'] = np.where(test['CURE END POSITION X Collect Result_Fill2'] == 240, 1, -1)
use_test['direction'] = np.where(test['cure_direction_dam'] == test['cure_direction_fill2'], 1, -1)

In [52]:
# 레진 volume dam, fill1 의 비율
use_train['volume_ratio_stage1'] = train['Dispense Volume(Stage1) Collect Result_Fill1'] / (train['Dispense Volume(Stage1) Collect Result_Fill1'] + train['Dispense Volume(Stage1) Collect Result_Dam'])
use_train['volume_ratio_stage2'] = train['Dispense Volume(Stage2) Collect Result_Fill1'] / (train['Dispense Volume(Stage2) Collect Result_Fill1'] + train['Dispense Volume(Stage2) Collect Result_Dam'])
use_train['volume_ratio_stage3'] = train['Dispense Volume(Stage3) Collect Result_Fill1'] / (train['Dispense Volume(Stage3) Collect Result_Fill1'] + train['Dispense Volume(Stage3) Collect Result_Dam'])

use_test['volume_ratio_stage1'] = test['Dispense Volume(Stage1) Collect Result_Fill1'] / (test['Dispense Volume(Stage1) Collect Result_Fill1'] + test['Dispense Volume(Stage1) Collect Result_Dam'])
use_test['volume_ratio_stage2'] = test['Dispense Volume(Stage2) Collect Result_Fill1'] / (test['Dispense Volume(Stage2) Collect Result_Fill1'] + test['Dispense Volume(Stage2) Collect Result_Dam'])
use_test['volume_ratio_stage3'] = test['Dispense Volume(Stage3) Collect Result_Fill1'] / (test['Dispense Volume(Stage3) Collect Result_Fill1'] + test['Dispense Volume(Stage3) Collect Result_Dam'])

In [53]:
# set id 추가
use_test['Set ID'] = test['Set ID']

In [54]:
# 저장
use_train.to_csv('./data/train_0816_variable.csv', index = False)
use_test.to_csv('./data/test_0816_variable.csv', index = False)