In [126]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import ADASYN
import numpy as np
import joblib
from sklearn import metrics
from scikeras.wrappers import KerasClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow import keras
from tensorflow.keras import layers as Layer
from tensorflow.keras.metrics import Recall, Precision, BinaryAccuracy, TruePositives, TrueNegatives, FalsePositives, FalseNegatives
from tensorflow.keras.regularizers import L1L2
from timeit import default_timer as timer
import json

In [21]:
def load_data(csv_path='https://raw.githubusercontent.com/urmd1f/Project1/main/data/steel.csv', scaler=None, upsampling=None, TEST_SIZE=0.2, VAL_SIZE=0.2, RANDOM_STATE=42):

    # 글로벌로 만들어서 저장하기
    global df, X, y, X_train, X_val, X_test, y_train, y_val, y_test
    
    df = pd.read_csv(csv_path)

    # 이상치 전처리 
    # 일단 카피 
    df_clear = df.copy()
    # 타겟특성 빼고 변수에 저장
    df_clear = df_clear.drop(df_clear.iloc[:,-7:].columns, axis=1)
    # Area라는 컬럼 생성
    df_clear['Area'] = (df['X_Maximum'] - df['X_Minimum']) * (df['Y_Maximum'] - df['Y_Minimum'])
    # Area컬럼을 생성하는데 사용된 피쳐 제거(X_Maximum, X_Minimum, Y_Maximum, Y_Minimum)
    df_clear = df_clear.drop(df.iloc[:, :4].columns, axis=1)
    # A300 제거(300 or 400이라 하나 제거 후 컬럼명 어떻게 할지 고민)
    df_clear = df_clear.drop('TypeOfSteel_A300', axis=1)
    # 일단 TypeOfSteel로만 변경해서 설명란에 0은 300 1은 400으로 해보기로
    df_clear.rename(columns={'TypeOfSteel_A400':'TypeOfSteel'}, inplace=True)
    # Log_X_index, Log_Y_index 제거 LogOFAreas가 합친결과값으로 판단되어 제거하기로 함
    df_clear = df_clear.drop(['Log_X_Index', 'Log_Y_Index'], axis=1)
    df_clear = df_clear.drop('Outside_Global_Index', axis=1)
    # 정규분포를 위한 log 변환

    log_list = [
        'X_Perimeter',
        'Y_Perimeter',
        'Steel_Plate_Thickness',
        # 'Edges_Index', # <- 0이 들어있어서 로그변환이 안됨 zerodivision_error
        'Outside_X_Index',
        'Area',
        'Edges_Y_Index',
        'Pixels_Areas',
        'Sum_of_Luminosity'
        ]

    for i in log_list:
        df_clear[i] = np.log(df_clear[i])

    y_list = list(df.iloc[:,-7:].columns)
    df_target = df.copy()
    df_target["Type"] = df_target.loc[:,y_list][y_list].idxmax(axis=1)
    df_target = df_target.drop(columns=y_list)

    # 이진분류를 위한 'Type' 컬럼 값 수정
    df_target1 = df_target.copy()
    df_target1['Type'] = df_target1['Type'].apply(lambda x: 1 if x != 'Other_Faults' else 0)
    df_target1['Type'].value_counts(normalize=True)

    ## 다중분류를 위한 'Type' 컬럼 값 수정
    #df_target2 = df_target.copy()
    ## 'Other_Faults'를 제외한 인덱스 가져오기
    #indices_to_remove = df_target2[df_target2['Type'] == 'Other_Faults'].index
    ## 해당 인덱스들을 제거하여 새로운 데이터프레임 생성
    #df_clear2 = df_clear.copy()
    #df_clear2 = df_clear.drop(indices_to_remove)
    ## 'Other_Faults' 행을 제외하고 결함이 있는 것만 남기기
    #df_target2 = df_target2[df_target2['Type'] != 'Other_Faults']
    #encoder = LabelEncoder()
    #df_target2['Type'] = encoder.fit_transform(df_target2['Type'])

    X = df_clear
    y = df_target1['Type']

    # 업스케일링
    # 스모트(SMOTE) 대신에 아다신(ADASYN) 사용된 이유는 좀 더 랜덤하게 업스케일링이 되게 하게 위해 사용
    if upsampling:
        adasyn = ADASYN(random_state=RANDOM_STATE)
        X, y = adasyn.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VAL_SIZE, random_state=RANDOM_STATE, stratify=y_train)
    

    # MinMaxScaler - 0, StandardScaler - 1, , MaxAbsScaler - 2, RobustScaler - 3, Normalizer - 4
    if scaler == 0:
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
    elif scaler == 1:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
    elif scaler == 2:
        scaler = MaxAbsScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
    elif scaler == 3:
        scaler = RobustScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
    elif scaler == 4:
        scaler = Normalizer()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
    else:
        pass
    
    X_train = pd.DataFrame(X_train, columns=X.columns)
    X_val = pd.DataFrame(X_val, columns=X.columns)
    X_test = pd.DataFrame(X_test, columns=X.columns)

    return X_train, X_val, X_test, y_train, y_val, y_test, scaler

In [127]:
X_train, X_val, X_test, y_train, y_val, y_test, scaler = load_data(scaler=0)
joblib.dump(scaler, 'MinMaxScaler.joblib')
X_train, X_val, X_test, y_train, y_val, y_test, scaler = load_data(scaler=1)
joblib.dump(scaler, 'StandardScaler.joblib')
X_train, X_val, X_test, y_train, y_val, y_test, scaler = load_data(scaler=2)
joblib.dump(scaler, 'MaxAbsScaler.joblib')
X_train, X_val, X_test, y_train, y_val, y_test, scaler = load_data(scaler=3)
joblib.dump(scaler, 'RobustScaler.joblib')
X_train, X_val, X_test, y_train, y_val, y_test, scaler = load_data(scaler=4)
joblib.dump(scaler, 'Normalizer.joblib')


['Normalizer.joblib']

In [102]:
X_train, X_val, X_test, y_train, y_val, y_test, scaler = load_data(scaler=1, upsampling=1)
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape, scaler

((1580, 20), (396, 20), (494, 20), (1580,), (396,), (494,), StandardScaler())

In [67]:
X_train

Unnamed: 0,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,LogOfAreas,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Area
0,0.428385,0.249975,0.310458,0.422466,0.220513,0.425926,0.299180,1.0,0.000000,0.715736,0.228732,0.711909,0.311360,0.880682,1.000000,0.428390,0.643920,0.470011,0.456527,0.358230
1,0.335572,0.235355,0.276147,0.342344,0.543590,0.416667,0.280738,1.0,0.454757,0.901728,0.444962,0.663910,0.248424,0.661830,1.000000,0.335573,0.667928,0.565196,0.166856,0.305697
2,0.301028,0.199127,0.234801,0.309553,0.548718,0.449074,0.264344,1.0,0.000000,0.084506,0.166437,0.697489,0.195170,0.631088,1.000000,0.301022,0.651132,0.586382,0.063905,0.251734
3,0.487295,0.399215,0.428764,0.478764,0.420513,0.300926,0.952869,1.0,1.000000,0.155647,0.747113,0.379449,0.359378,0.401177,0.989976,0.487292,0.810208,0.443229,0.999773,0.474398
4,0.480345,0.389226,0.417512,0.535682,0.394872,0.851852,0.327869,1.0,0.000000,0.788183,0.562242,0.265100,0.319772,0.275264,0.994439,0.480349,0.867403,0.883126,0.984449,0.435941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1236,0.369916,0.312360,0.282729,0.369076,0.492308,0.365741,0.272541,1.0,0.000000,0.500904,0.464668,0.628517,0.350654,0.650162,0.905001,0.369918,0.314016,0.511631,0.294779,0.335344
1237,0.344327,0.175687,0.261555,0.375921,0.656410,0.587963,0.327869,1.0,0.000000,0.143388,0.190168,0.689725,0.230202,1.000000,1.000000,0.344338,0.655016,0.734440,0.120658,0.288105
1238,0.318739,0.337616,0.253393,0.334518,0.615385,0.476852,0.256148,1.0,0.000000,0.328477,0.706325,0.369769,0.388354,0.661830,0.905001,0.318738,0.184597,0.628065,0.273666,0.331515
1239,0.313655,0.199127,0.234801,0.316273,0.482051,0.439815,0.944672,0.0,0.201233,0.521503,0.258926,0.899163,0.198161,0.815544,1.000000,0.313658,0.550260,0.546712,0.086720,0.269528


In [28]:
def get_metrics(model, x_train, y_train):

    # 교차 검증 수행 및 평가 지표 계산
    scoring = {
    'accuracy': 'accuracy',
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score),
    'f1_score': make_scorer(f1_score)
    }
    # 훈련 평가지표 출력
    model.fit(x_train, y_train)
    y_pred = model.predict(x_train)
    print("훈련 정확도: ", model.score(x_train, y_train))
    print("훈련 recall: ", recall_score(y_train, y_pred))
    print("훈련 Precision: ", precision_score(y_train, y_pred))
    print("훈련 F1-score: ", f1_score(y_train, y_pred))

    # 교차검증 5개로 분할
    cv_results = cross_validate(model, x_train, y_train, cv=5, scoring=scoring)
    
    # 교차 검증 결과를 데이터프레임으로 변환
    cv_results_df = pd.DataFrame(cv_results)
    
    # 평균값을 추가한 새로운 행을 생성하여 데이터프레임에 추가
    mean_scores = cv_results_df.mean().to_frame().T
    mean_scores.rename(index={0: 'mean'}, inplace=True)
    cv_results_df = pd.concat([cv_results_df, mean_scores], axis=0)

    return cv_results_df

In [49]:

def get_model1(nodes1 = 8, nodes2= 6, nodes3 = 4, drop_rate= 0.2, activation= 'relu'):
  # 모델 만들기
  # 뉴런의 개수는 input과 output 사이의 숫자를 넣으라고 한다. --> 사용, 왜냐하면 시간이 적게 걸려서
  # 뉴런의 개수는 input의 2/3 정도 넣라고 한다.
  # 뉴런의 개수는 input의 두배보다는 적게 넣라고 한다.
  regularizer = L1L2(l1=0.001, l2=0.001)

  model = Sequential([Layer.Dense(12, input_shape=(20,))])
  model.add(Layer.Dense(nodes1, activation = activation, kernel_regularizer=regularizer))
  model.add(Layer.BatchNormalization())
  model.add(Layer.Dropout(drop_rate))

  model.add(Layer.Dense(nodes2, activation = activation, kernel_regularizer=regularizer))
  model.add(Layer.BatchNormalization())
  model.add(Layer.Dropout(drop_rate))

  model.add(Layer.Dense(nodes3, activation = activation, kernel_regularizer=regularizer))
  model.add(Layer.BatchNormalization())
  model.add(Layer.Dropout(drop_rate))

  model.add(Layer.Dense(1, activation = 'sigmoid'))

  # metrics에는 1이 나오는 recall 이랑 acc만 중요하다.
  metrics = [
      Recall(name = 'recall'),
      Precision(name = 'precision'),
      BinaryAccuracy(name = 'binary accuracy') # Accuracy를 사용 안 하는 이유는 Accuracy가 이상하세 나왔기 때문.
  ]

  model.compile(optimizer = 'adam',
                loss='binary_crossentropy',
                metrics = metrics)

  return model

In [56]:
def get_model2():
    model = Sequential()
    model.add(Layer.Dense(256, activation='relu'))
    model.add(Layer.Dense(256, activation='relu'))
    model.add(Layer.Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

    return model

In [31]:
class TimingCallback(keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        print("Starting training")
        self.starttime = timer()

    def on_train_end(self, logs=None):
        print("End of training, took {} seconds".format(timer()-self.starttime))

    def on_epoch_end(self, epoch, logs=None):
        if epoch % 10 == 0:
          print("Epoch is {} and {} seconds passed".format(epoch, timer()-self.starttime))

In [117]:

check_path = 'model_weights.h5'

callback = [
    #tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=50),
    tf.keras.callbacks.ModelCheckpoint(filepath=check_path, save_weights_only=True, monitor='val_loss', mode='min', save_best_only=True),
    #tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=200, min_lr=0.001),
    TimingCallback()
]

In [52]:
model1 = get_model1()

In [57]:
model2 = get_model2()

In [118]:
history1 = model1.fit(X_train, y_train, batch_size= 1024, epochs= 1000,  validation_data=(X_val, y_val),callbacks=callback, verbose = 0)

Starting training
Epoch is 0 and 0.04754120809957385 seconds passed
Epoch is 10 and 0.213104666210711 seconds passed
Epoch is 20 and 0.37147129117511213 seconds passed
Epoch is 30 and 0.5222185000311583 seconds passed
Epoch is 40 and 0.6805434999987483 seconds passed
Epoch is 50 and 0.8306591662112623 seconds passed
Epoch is 60 and 0.9791686660610139 seconds passed
Epoch is 70 and 1.1553187081590295 seconds passed
Epoch is 80 and 1.3133613751269877 seconds passed
Epoch is 90 and 1.4753460411448032 seconds passed
Epoch is 100 and 1.6494560830760747 seconds passed
Epoch is 110 and 1.805825041141361 seconds passed
Epoch is 120 and 1.9517773331608623 seconds passed
Epoch is 130 and 2.112878500018269 seconds passed
Epoch is 140 and 2.26979725016281 seconds passed
Epoch is 150 and 2.424461083021015 seconds passed
Epoch is 160 and 2.578341916203499 seconds passed
Epoch is 170 and 2.732729708077386 seconds passed
Epoch is 180 and 2.8851344999857247 seconds passed
Epoch is 190 and 3.03176133311

In [120]:
results = model1.evaluate(X_test, y_test)



In [121]:
    # results = model.fit(x_train1, y_train1, batch_size = 64, epochs=500)
history2 = model2.fit(X_train, y_train, batch_size= 64, epochs= 500, verbose = 1, callbacks=callback)

Starting training
Epoch 1/500
Epoch is 0 and 0.04241558420471847 seconds passed
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch is 10 and 0.33867841702885926 seconds passed
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch is 20 and 0.6793590001761913 seconds passed
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch is 30 and 1.047366000013426 seconds passed
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch is 40 and 1.3344181252177805 seconds passed
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch is 50 and 1.6137714171782136 seconds passed
Epoch 52/500
Epoch 53/500
Ep

In [125]:
model2.save('model_layers')

INFO:tensorflow:Assets written to: model_layers/assets


INFO:tensorflow:Assets written to: model_layers/assets


In [124]:
results = model2.evaluate(X_test, y_test)

