In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import shap
from sklearn.model_selection import train_test_split
from tqdm import tqdm



def split_preprocess(df,test_df):
    scaler = StandardScaler()
    train_df,val_df = train_test_split(df,test_size=0.3,stratify=df["target"],random_state=RANDOM_STATE)
    num_cols = [col for col in  train_df.select_dtypes(exclude=['object']).columns if train_df[col].nunique() > 4]

    train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
    val_df[num_cols] = scaler.transform(val_df[num_cols])
    test_df[num_cols] = scaler.transform(test_df[num_cols])
    return train_df,val_df,test_df    

#데이터 읽어오기
ROOT_DIR = "data"
RANDOM_STATE = 2024
# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
# training데이터기준 missing value가 50%이상인 컬럼 제거
all_data = len(train_data) 
missing = train_data.isnull().sum()
missing_cols = missing[missing/all_data>0.5].sort_values(ascending=True).index
train_data = train_data.drop(columns = missing_cols)
test_data = test_data.drop(columns = missing_cols)
# 정보가 없는 데이터 제거
useless = [col for col in  train_data.drop(columns=['target']).select_dtypes(exclude=['object']).columns if (train_data[col].nunique() <= 1)]
train_data = train_data.drop(columns = useless)
test_data = test_data.drop(columns = useless)
#training, test 데이터 프레임으로 분리 => 'target'을 제외한 object객체 제거
train_df = train_data.select_dtypes(exclude=['object'])
test_df = test_data.select_dtypes(exclude=['object'])
train_df['target'] = train_data['target']
test_df['target'] =test_data['target']
dam_columns = [column for column in train_df.columns if 'Dam' in column] + ['target']
fill1_columns = [column for column in train_df.columns if 'Fill1' in column] + ['target']
fill2_columns = [column for column in train_df.columns if 'Fill2' in column] + ['target']
ac_columns = [column for column in train_df.columns if 'AutoClave' in column] + ['target']

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,1000.0,12.5,90,70,280,90,10,17.0,4.9,17.0,...,91.8,270.0,50,85,19.8,13.0,195,1,0,
1,1000.0,12.5,90,70,280,90,16,14.2,8.3,14.2,...,50.0,91.8,270,50,85.0,19.8,14,256,1,
2,240.0,2.5,-90,70,1030,-90,10,9.7,4.9,9.7,...,91.8,270.0,50,85,19.7,1.0,98,1,0,
3,1000.0,12.5,90,70,280,90,10,21.3,10.6,21.3,...,50.0,91.8,270,50,85.0,20.0,14,0,1,
4,240.0,2.5,-90,70,1030,-90,16,13.2,7.5,13.2,...,50.0,91.8,270,50,85.0,19.8,1,215,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1000.0,12.5,90,70,280,90,10,21.3,10.6,21.3,...,50.0,91.8,270,50,85.0,19.5,14,131,1,
17357,1000.0,12.5,90,70,280,90,16,13.2,7.6,13.2,...,50.0,91.8,270,50,85.0,19.8,12,279,1,
17358,240.0,2.5,-90,70,1030,-90,16,13.2,6.6,13.2,...,50.0,91.8,270,50,85.0,20.5,4,66,1,
17359,240.0,2.5,-90,70,1030,-90,10,9.7,3.9,9.7,...,91.8,270.0,50,85,18.9,1.0,117,1,0,


In [108]:
train_dam_df,val_dam_df,test_dam_df = split_preprocess(train_df[dam_columns],test_df[dam_columns])
train_fill1_df,val_fill1_df,test_fill1_df = split_preprocess(train_df[fill1_columns],test_df[fill1_columns])
train_fill2_df,val_fill2_df,test_fill2_df = split_preprocess(train_df[fill2_columns],test_df[fill2_columns])
train_ac_df,val_ac_df,test_ac_df = split_preprocess(train_df[ac_columns],test_df[ac_columns])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/

In [109]:
def pretrain(train_df,val_df,name):
    from catboost import CatBoostClassifier
    from sklearn.utils.class_weight import compute_class_weight
    X_train = train_df.drop(columns =['target'])
    X_train[X_train.select_dtypes(include=['int']).columns] = X_train.select_dtypes(include=['int']).astype(object)
    y_train = train_df['target']
    X_val = val_df.drop(columns =['target'])
    X_val[X_val.select_dtypes(include=['int']).columns] = X_val.select_dtypes(include=['int']).astype(object)
    y_val = val_df['target']
    
    class_name =['AbNormal', 'Normal']
    class_weights = compute_class_weight(class_weight='balanced', classes=class_name, y=y_val)
    class_weights_dict = {class_name[i]: class_weights[i] for i in range(len(class_weights))}

    best_model_param ={"border_count":32,"bagging_temperature":1.0,"random_strength":10,"depth":7,
                        "learning_rate":0.03,"l2_leaf_reg":3,"iterations":8000}

    
    # model = CatBoostClassifier(**best_model_param, 
    #                         loss_function='Logloss', verbose=100, class_weights=class_weights_dict,
    #                         random_state=RANDOM_STATE,eval_metric='F1', task_type='GPU', devices='0', use_best_model=True)
    model = CatBoostClassifier(**best_model_param,
                               early_stopping_rounds = 2000, 
                            loss_function='Logloss', verbose=100, scale_pos_weight=0.125,
                            random_state=RANDOM_STATE,eval_metric='F1', task_type='GPU', devices='0', use_best_model=True)
    model.fit(X_train, y_train,eval_set=(X_val,y_val))
    explainer = shap.TreeExplainer(model)
    shap_values = explainer(X_train)
    feature_names = train_df.columns
    drop_names = []
    for i in range(X_train.shape[1]):
        feature_imp = np.mean(np.abs(shap_values.values[:, i]))
        th = np.percentile(np.mean(np.abs(shap_values.values),axis=0),10)
        print(f'{feature_names[i]}의 중요도 :', feature_imp)
        if feature_imp<0.01:
            drop_names.append(feature_names[i])
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)
    train_f1score = f1_score(y_train,pred_train,pos_label='AbNormal')
    val_f1score = f1_score(y_val,pred_val,pos_label='AbNormal')
    print(train_f1score)
    print(val_f1score)
    print(classification_report(y_train, pred_train))
    print(classification_report(y_val, pred_val))
    return drop_names,model

def train(train_df,val_df,name):
    from catboost import CatBoostClassifier
    from sklearn.utils.class_weight import compute_class_weight
    X_train = train_df.drop(columns =['target'])
    X_train[X_train.select_dtypes(include=['int']).columns] = X_train.select_dtypes(include=['int']).astype(object)
    y_train = train_df['target']
    X_val = val_df.drop(columns =['target'])
    X_val[X_val.select_dtypes(include=['int']).columns] = X_val.select_dtypes(include=['int']).astype(object)
    y_val = val_df['target']
    
    class_name =['AbNormal', 'Normal']
    class_weights = compute_class_weight(class_weight='balanced', classes=class_name, y=y_val)
    class_weights_dict = {class_name[i]: class_weights[i] for i in range(len(class_weights))}

    best_model_param ={"border_count":32,"bagging_temperature":1.0,"random_strength":10,"depth":7,
                        "learning_rate":0.03,"l2_leaf_reg":3,"iterations":8000}

        

    # model = CatBoostClassifier(**best_model_param, 
    #                         loss_function='Logloss', verbose=100, class_weights=class_weights_dict,
    #                         random_state=RANDOM_STATE,eval_metric='F1', task_type='GPU', devices='0', use_best_model=True)
    model = CatBoostClassifier(**best_model_param, 
                               early_stopping_rounds = 2000,
                            loss_function='Logloss', verbose=100, scale_pos_weight=0.125,
                            random_state=RANDOM_STATE,eval_metric='F1', task_type='GPU', devices='0', use_best_model=True)
    model.fit(X_train, y_train,eval_set=(X_val,y_val))
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)
    train_f1score = f1_score(y_train,pred_train,pos_label='AbNormal')
    val_f1score = f1_score(y_val,pred_val,pos_label='AbNormal')
    print(f"-------Result of {name}-------")
    print(train_f1score)
    print(val_f1score)
    print(classification_report(y_train, pred_train))
    print(classification_report(y_val, pred_val))
    return model

In [38]:
train_dam_df = AdaSyn_OverSampling(train_dam_df)
train_fill1_df = AdaSyn_OverSampling(train_fill1_df)
train_fill2_df = AdaSyn_OverSampling(train_fill2_df)
train_ac_df = AdaSyn_OverSampling(train_ac_df)

dam_drop = pretrain(train_dam_df,val_dam_df,'Dam')
fill1_drop = pretrain(train_fill1_df,val_fill1_df,'Fill1')
fill2_drop = pretrain(train_fill2_df,val_fill2_df,'Fill2')
ac_drop = pretrain(train_ac_df,val_ac_df,'AutoClave')

0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 7.44ms	remaining: 11.2s
100:	learn: 0.0851590	test: 0.0798255	best: 0.0798255 (100)	total: 905ms	remaining: 12.5s
200:	learn: 0.2346256	test: 0.2114223	best: 0.2114223 (200)	total: 1.84s	remaining: 11.9s
300:	learn: 0.3610953	test: 0.3162936	best: 0.3162936 (300)	total: 2.79s	remaining: 11.1s
400:	learn: 0.4465781	test: 0.3810460	best: 0.3810460 (400)	total: 3.7s	remaining: 10.1s
500:	learn: 0.5102629	test: 0.4325866	best: 0.4325866 (500)	total: 4.61s	remaining: 9.2s
600:	learn: 0.5558607	test: 0.4623067	best: 0.4627061 (599)	total: 5.54s	remaining: 8.28s
700:	learn: 0.5896036	test: 0.4844152	best: 0.4844152 (700)	total: 6.43s	remaining: 7.33s
800:	learn: 0.6215749	test: 0.5038992	best: 0.5038992 (800)	total: 7.34s	remaining: 6.41s
900:	learn: 0.6456808	test: 0.5175907	best: 0.5179582 (897)	total: 8.34s	remaining: 5.55s
1000:	learn: 0.6660824	test: 0.5323735	best: 0.5325077 (999)	total: 9.25s	remaining: 4.61s
1100:	learn: 0

In [39]:
print(dam_drop)
print(fill1_drop)
print(fill2_drop)
print(ac_drop)

['Stage1 Circle2 Distance Speed Collect Result_Dam']
[]
[]
[]


In [40]:
dam = train(train_dam_df.drop(columns=dam_drop),val_dam_df.drop(columns=dam_drop),'Dam')
fill1 = train(train_fill1_df.drop(columns=fill1_drop),val_fill1_df.drop(columns=fill1_drop),'Fill1')
fill2 = train(train_fill2_df.drop(columns=fill2_drop),val_fill2_df.drop(columns=fill2_drop),'Fill2')
ac = train(train_ac_df.drop(columns=ac_drop),val_ac_df.drop(columns=ac_drop),'AutoClave')

0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 9.25ms	remaining: 13.9s
100:	learn: 0.0892094	test: 0.0825348	best: 0.0825348 (100)	total: 968ms	remaining: 13.4s
200:	learn: 0.2395415	test: 0.2179196	best: 0.2179196 (200)	total: 1.92s	remaining: 12.4s
300:	learn: 0.3616323	test: 0.3155085	best: 0.3155085 (300)	total: 2.94s	remaining: 11.7s
400:	learn: 0.4432165	test: 0.3811786	best: 0.3811786 (400)	total: 3.88s	remaining: 10.6s
500:	learn: 0.5022706	test: 0.4223390	best: 0.4223390 (500)	total: 4.83s	remaining: 9.63s
600:	learn: 0.5498887	test: 0.4586719	best: 0.4586719 (600)	total: 5.76s	remaining: 8.62s
700:	learn: 0.5887407	test: 0.4836719	best: 0.4837512 (699)	total: 6.7s	remaining: 7.64s
800:	learn: 0.6214569	test: 0.4987986	best: 0.4992085 (796)	total: 7.63s	remaining: 6.66s
900:	learn: 0.6459513	test: 0.5180532	best: 0.5180532 (900)	total: 8.54s	remaining: 5.68s
1000:	learn: 0.6699299	test: 0.5345206	best: 0.5345206 (1000)	total: 9.56s	remaining: 4.76s
1100:	learn:

In [118]:
def AdaSyn_OverSampling(df):
    from sklearn.preprocessing import LabelEncoder
    from imblearn.over_sampling import ADASYN

    label_encoders = {}
    X = df.drop(columns='target')
    y = df['target']
    cat_cols = [col for col in  X.select_dtypes(exclude=['object']).columns if X[col].nunique() <= 4]
    
    for col in cat_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le
    
    adasyn = ADASYN(sampling_strategy=0.3, random_state=RANDOM_STATE)
    X_resampled, y_resampled = adasyn.fit_resample(X, y)
    resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
    for col in cat_cols:
        X_resampled[col] = X_resampled[col].round().astype(int)
        X_resampled[col] = label_encoders[col].inverse_transform(X_resampled[col])        
    resampled_df['target'] = y_resampled
    return resampled_df

In [20]:
train_df.head()

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,240.0,2.5,-90,100,1030,-90,16,14.9,8.4,14.7,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,240.0,2.5,-90,70,1030,-90,10,21.3,4.9,21.3,...,91.8,270.0,50,85,19.6,7.0,185,1,0,Normal
2,1000.0,12.5,90,85,280,90,16,14.7,8.5,14.7,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,1000.0,12.5,90,70,280,90,10,21.3,8.4,21.3,...,91.8,270.0,50,85,19.9,12.0,268,1,0,Normal
4,240.0,2.5,-90,70,1030,-90,10,9.7,4.9,9.6,...,91.8,270.0,50,85,19.7,8.0,121,1,0,Normal


In [119]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import ADASYN
label_encoders = {}

def cat_encode(df):
    global label_encoders
    X = df.drop(columns='target')
    y = df['target']
    cat_cols = [col for col in  X.select_dtypes(exclude=['object']).columns if X[col].nunique() <= 4]

    for col in cat_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le

all_train_df,all_val_df,all_test_df = split_preprocess(train_df,test_df)
cat_encode(all_train_df)
all_train_df = AdaSyn_OverSampling(all_train_df)
for c in label_encoders.keys():
    all_val_df[c] = all_val_df[c].astype(str)
    all_test_df[c] = all_test_df[c].astype(str)

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


In [132]:
all_train_df.drop(columns=['target']).info()
all_val_df.drop(columns=['target']).info()
all_test_df.drop(columns=['target']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34502 entries, 0 to 34501
Columns: 130 entries, CURE END POSITION X Collect Result_Dam to WorkMode Collect Result_Fill2
dtypes: float64(84), object(46)
memory usage: 34.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12152 entries, 14737 to 37121
Columns: 130 entries, CURE END POSITION X Collect Result_Dam to WorkMode Collect Result_Fill2
dtypes: float64(84), object(46)
memory usage: 12.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 130 entries, CURE END POSITION X Collect Result_Dam to WorkMode Collect Result_Fill2
dtypes: float64(84), object(46)
memory usage: 17.2+ MB


In [121]:
all_drop,all_model =  pretrain(all_train_df,all_val_df,'all')
all_var = train(all_train_df.drop(columns=all_drop),all_val_df.drop(columns=all_drop),'all')

0:	learn: 0.2259344	test: 0.2322751	best: 0.2322751 (0)	total: 69.1ms	remaining: 9m 12s
100:	learn: 0.5634041	test: 0.5543393	best: 0.5543393 (100)	total: 7.11s	remaining: 9m 16s
200:	learn: 0.7182591	test: 0.7082124	best: 0.7082124 (200)	total: 14.2s	remaining: 9m 9s
300:	learn: 0.7628385	test: 0.7461416	best: 0.7461416 (300)	total: 21.2s	remaining: 9m 2s
400:	learn: 0.7946020	test: 0.7747740	best: 0.7747740 (400)	total: 28.2s	remaining: 8m 54s
500:	learn: 0.8126155	test: 0.7892511	best: 0.7892511 (500)	total: 35.6s	remaining: 8m 52s
600:	learn: 0.8219937	test: 0.7959239	best: 0.7962321 (598)	total: 42.8s	remaining: 8m 46s
700:	learn: 0.8313665	test: 0.7993199	best: 0.7997526 (699)	total: 49.8s	remaining: 8m 38s
800:	learn: 0.8365807	test: 0.8012475	best: 0.8012533 (713)	total: 56.9s	remaining: 8m 31s
900:	learn: 0.8429081	test: 0.8023046	best: 0.8023046 (900)	total: 1m 4s	remaining: 8m 25s
1000:	learn: 0.8496901	test: 0.8038667	best: 0.8040675 (982)	total: 1m 11s	remaining: 8m 18s
11

In [136]:
from scipy.stats import mode
def predict(model,df):
    X = df.drop(columns='target')
    # X[X.select_dtypes(include=['int']).columns] = X.select_dtypes(include=['int']).astype(object)
    return model.predict(X)



all_val_pred = predict(all_var,all_val_df.drop(columns=all_drop))
all_val_origin_pred = predict(all_model,all_val_df)

all_test_pred = predict(all_var,all_test_df.drop(columns=all_drop))
all_test_origin_pred = predict(all_model,all_test_df)

f1score_all = f1_score(all_val_df['target'],all_val_pred,pos_label='AbNormal')
f1score_origin_all = f1_score(all_val_df['target'],all_val_origin_pred,pos_label='AbNormal')

print(f"Original: {f1score_origin_all}")
print(f"Dropped: {f1score_all}")


Original: 0.20326678765880218
Dropped: 0.19667590027700835


In [137]:
all_test_df

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,1000.0,12.5,90,-8.457038,280,90,10,-5.184700,-6.529416,-5.201055,...,91.8,270.0,50,85,-1.747835,-2.296651,-0.707813,-0.834999,-1.949416,
1,1000.0,12.5,90,-8.457038,280,90,16,-5.188544,-6.421326,-5.204954,...,50.0,91.8,270,50,-1.747834,-2.296314,-0.707813,-0.834999,-1.656943,
2,240.0,2.5,-90,-8.457038,1030,-90,10,-5.194722,-6.529416,-5.211220,...,91.8,270.0,50,85,-1.747835,-2.297246,-0.707813,-0.834999,-1.949416,
3,1000.0,12.5,90,-8.457038,280,90,10,-5.178797,-6.348205,-5.195068,...,50.0,91.8,270,50,-1.747834,-2.296304,-0.707813,-0.834999,-1.656943,
4,240.0,2.5,-90,-8.457038,1030,-90,16,-5.189917,-6.446759,-5.206346,...,50.0,91.8,270,50,-1.747834,-2.296314,-0.707813,-0.834999,-1.656943,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1000.0,12.5,90,-8.457038,280,90,10,-5.178797,-6.348205,-5.195068,...,50.0,91.8,270,50,-1.747834,-2.296329,-0.707813,-0.834999,-1.656943,
17357,1000.0,12.5,90,-8.457038,280,90,16,-5.189917,-6.443580,-5.206346,...,50.0,91.8,270,50,-1.747834,-2.296314,-0.707813,-0.834999,-1.656943,
17358,240.0,2.5,-90,-8.457038,1030,-90,16,-5.189917,-6.475371,-5.206346,...,50.0,91.8,270,50,-1.747834,-2.296280,-0.707813,-0.834999,-1.656943,
17359,240.0,2.5,-90,-8.457038,1030,-90,10,-5.194722,-6.561208,-5.211220,...,91.8,270.0,50,85,-1.747835,-2.297246,-0.707813,-0.834999,-1.949416,


In [138]:
all_val_df

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
14737,240.0,2.5,-90,3.201378,1030,-90,16,0.168065,0.773981,0.117365,...,50.0,91.8,270,50,1.439859,0.855983,-0.672649,0.989914,0.181425,Normal
9101,240.0,2.5,-90,1.146112,1030,-90,16,0.168065,0.824153,0.117365,...,50.0,91.8,270,50,1.439859,0.993701,-0.653681,-0.775132,0.181425,Normal
29779,1000.0,12.5,90,1.146112,280,90,16,0.114541,0.824153,0.117365,...,50.0,91.8,270,50,1.439859,0.663177,-0.577810,-0.706364,0.181425,Normal
5251,1000.0,12.5,90,1.146112,280,90,16,0.168065,0.874325,0.117365,...,50.0,91.8,270,50,1.439859,0.952385,-0.606262,1.410163,0.181425,Normal
36101,1000.0,12.5,90,-0.395338,280,90,16,-0.286891,0.723809,-0.285207,...,50.0,91.8,270,50,0.628634,0.718264,-0.549358,0.424488,0.181425,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26319,240.0,2.5,-90,-0.395338,1030,-90,10,-1.250329,-0.982041,-1.224541,...,91.8,270.0,50,85,-1.187664,-1.843296,-0.388132,-0.820978,-0.600594,Normal
39462,1000.0,12.5,90,-0.395338,280,90,10,1.880844,-0.982041,1.888680,...,91.8,270.0,50,85,-1.160268,-0.466113,1.793165,-0.820978,-0.600594,AbNormal
26852,240.0,2.5,-90,-0.395338,1030,-90,10,-1.250329,-0.982041,-1.224541,...,91.8,270.0,50,85,-1.165747,-0.879268,-0.198454,-0.820978,-0.600594,Normal
25109,240.0,2.5,-90,-0.395338,1030,-90,10,-1.223567,-1.483762,-1.224541,...,91.8,270.0,50,85,-1.182185,-1.430141,0.816324,-0.820978,-0.600594,Normal


In [135]:
print(np.unique(all_test_origin_pred))
print(np.unique(all_test_pred))

['Normal']
['Normal']


In [52]:
submission = pd.read_csv("submission.csv")
submission['target'] = all_test_pred
submission.to_csv("16_66.csv",index=False)

In [58]:
df = pd.read_csv("16_66.csv")
df.value_counts('target')

target
Normal      14697
AbNormal     2664
dtype: int64