# 데이터 로드

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def load_dataset(csv_path, TRAIN_RATIO=0.8):
    
    global X, y, X_train, X_test, y_train, y_test, df
    
    # 데이터셋 로드
    df = pd.read_csv(csv_path)
    
    # 성별 인코딩(원핫인코딩보다 차원수가 줄어드는 효과)
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    df['Sex'] = encoder.fit_transform(df['Sex'])
    
    """
    # 성별 원핫 인코딩
    df=pd.get_dummies(df,columns=['Sex'])
    """
    
    # 학습 데이터 분리
    X = df.drop('Rings', axis=1)
    y = df['Rings']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_RATIO, random_state = 83)
    
    return X_train, X_test, y_train, y_test

csv_path = '../colabo/Data/Regression_data.csv'
X_train, X_test, y_train, y_test = load_dataset(csv_path)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3341, 8) (836, 8) (3341,) (836,)


In [2]:
# 전복 전체 무게가 살 + 내장 + 껍질보다 적게 나가는 경우는 말이 안됨
body = X_train['Whole weight'] - (X_train['Shucked weight'] + X_train['Viscera weight'] + X_train['Shell weight'])
X_train['body'] = body

index = X_train[X_train['body'] < 0].index

body = X_test['Whole weight'] - (X_test['Shucked weight'] + X_test['Viscera weight'] + X_test['Shell weight'])
X_test['body'] = body

index2 = X_test[X_test['body'] < 0].index

print(index.shape, index2.shape)

(118,) (37,)


In [3]:
X_train.drop(index, axis=0, inplace=True)
X_test.drop(index2, axis=0, inplace=True)
y_train.drop(index, axis=0, inplace=True)
y_test.drop(index2, axis=0, inplace=True)

In [4]:
import numpy as np

# train 껍질의 넓이 ( a * b * π)
area = 0.5 * X_train['Length'] * 0.5 * X_train['Diameter'] * np.pi
X_train['Area'] = area

# test 껍질의 넓이 
area2 = 0.5 * X_test['Length'] * 0.5 * X_test['Diameter'] * np.pi
X_test['Area'] = area2

In [5]:
# train 껍질의 둘레 (근사) ( 2π*(0.5 * √(a^2 + b^2)))
perimeter = np.pi * np.sqrt(0.5 * ((X_train['Length'] ** 2) + (X_train['Diameter'] ** 2)))
X_train['Perimeter'] = perimeter

# test 껍질의 둘레 (근사) ( 2π*(0.5 * √(a^2 + b^2)))
perimeter2 = np.pi * np.sqrt(0.5 * ((X_test['Length'] ** 2) + (X_test['Diameter'] ** 2)))
X_test['Perimeter'] = perimeter2

X_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,body,Area,Perimeter
461,0,0.585,0.465,0.17,0.9915,0.3865,0.224,0.265,0.116,0.213648,1.660072
2835,2,0.57,0.42,0.14,0.8745,0.416,0.165,0.25,0.0435,0.188024,1.572837
1378,0,0.62,0.5,0.15,1.293,0.596,0.3135,0.354,0.0295,0.243473,1.769361
2569,1,0.46,0.345,0.115,0.4215,0.1895,0.102,0.111,0.019,0.124643,1.277329
369,0,0.69,0.56,0.215,1.719,0.68,0.299,0.47,0.27,0.303478,1.974085


In [6]:
# 타겟값과 각 변수들 간의 상관관계
co = X_train.corrwith(y_train)

# 상관계수를 내림차순으로 정리
print(co.sort_values(ascending=False))

# 절대값
co_abs = abs(co)

Shell weight      0.619796
Diameter          0.559512
Perimeter         0.548978
Length            0.539317
Height            0.534439
body              0.533203
Area              0.532949
Whole weight      0.525487
Viscera weight    0.489945
Shucked weight    0.403014
Sex              -0.032245
dtype: float64


In [7]:
columns = ['Shell weight', 'Diameter', 'Perimeter', 'Length', 'Height', 'Area', 'Viscera weight', 'Shucked weight', 'Whole weight', 'Sex']
X_train_2 = X_train[columns]
X_test_2 = X_test[columns]
X_train_2.head()

Unnamed: 0,Shell weight,Diameter,Perimeter,Length,Height,Area,Viscera weight,Shucked weight,Whole weight,Sex
461,0.265,0.465,1.660072,0.585,0.17,0.213648,0.224,0.3865,0.9915,0
2835,0.25,0.42,1.572837,0.57,0.14,0.188024,0.165,0.416,0.8745,2
1378,0.354,0.5,1.769361,0.62,0.15,0.243473,0.3135,0.596,1.293,0
2569,0.111,0.345,1.277329,0.46,0.115,0.124643,0.102,0.1895,0.4215,1
369,0.47,0.56,1.974085,0.69,0.215,0.303478,0.299,0.68,1.719,0


In [8]:
from sklearn.preprocessing import MinMaxScaler

# 스케일링할 피처 선택
scaling_features = columns[:-1]  # 원핫인코딩되지 않은 연속형 또는 순서형 변수들

# 스케일링
scaler = MinMaxScaler()
X_train_scaled = X_train_2.copy()  # 원본 데이터 복사
X_test_scaled = X_test_2.copy()    # 원본 데이터 복사
X_train_scaled[scaling_features] = scaler.fit_transform(X_train_2[scaling_features])
X_test_scaled[scaling_features] = scaler.transform(X_test_2[scaling_features])

In [9]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import *

2023-05-22 12:38:44.502146: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# 사용자 정의 평가 지표 클래스
import tensorflow as tf

class EvalAccuracy(tf.keras.metrics.Metric): # TensorFlow의 Metric 클래스를 상속 받음

    def __init__(self, name="eval_accuracy", **kwargs): # 부모 클래스의 __init__() 메소드를 호출하여 필요한 초기화를 수행
        super(EvalAccuracy, self).__init__(name=name, **kwargs)
        self.correct = self.add_weight(name="ctp", initializer="zeros")
        # add_weight() 메소드를 사용하여 평가 지표를 계산하는데 필요한 변수를 생성(각 배치에서의 평가 결과를 누적하기 위해)
        # add_weight() 는 텐서플로우 Layer 클래스의 메서드(새로운 가중치를 추가하는 기능, 여기서는 평가 지표를 계산하는 데 사용되는 일종의 내부 변수를 의미)
        # 이 구문이 실행되면, EvalAccuracy 인스턴스는 새로운 가중치를 추가하고 그 가중치를 self.correct에 저장한다.
        # 이 self.correct는 update_state() 메서드에서 업데이트되며, '현재까지 처리한 모든 배치에 대한 평가 지표의 평균을 저장'한다.

    def update_state(self, y_true, y_predict, sample_weight=None):
        value = tf.abs((y_predict - y_true) / y_true)
        self.correct.assign(tf.reduce_mean(value)) # 오차율을 계산해서 correct 변수에 누적한 후, assign() 메소드를 사용하여 correct 변수의 값을 업데이트

    def result(self):
        return 1 - self.correct

    def reset_states(self):
        # 에포크마다 평가 지표 초기화
        self.correct.assign(0.)

In [11]:
# 베이스모델
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from tensorflow_estimator import estimator as tf_estimator

def Base_Model(features, labels, mode, params):
    
    np.random.seed(42)
    tf.random.set_seed(42)
    
    global X, y, X_train, X_test, y_train, y_test, df
    
    y_train = y_train.astype('float32')
    y_test = y_test.astype('float32')
    
    
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=64, activation='relu', input_shape=(len(X_train_2.keys()),)),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.Dense(units=256, activation='relu'),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dense(units=256, activation='relu'),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.Dense(units=64, activation='relu'),
        tf.keras.layers.Dense(units=1)
    ])
    """
    
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=64, input_shape=(len(X_train_2.keys()),)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        
        tf.keras.layers.Dense(units=128),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        
        tf.keras.layers.Dense(units=256),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        
        tf.keras.layers.Dense(units=512),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        
        tf.keras.layers.Dense(units=256),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        
        tf.keras.layers.Dense(units=128),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        
        tf.keras.layers.Dense(units=64),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        
        tf.keras.layers.Dense(units=1)
    ])
    """
    
    # 옵티마이저와 손실 함수 설정
    optimizer = tf.keras.optimizers.Adam(
                                        learning_rate=0.01,
                                        beta_1=0.9,
                                        beta_2=0.99,
                                        epsilon=1e-08
                                        )

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[EvalAccuracy()])
    
    return model

# 모델 객체 생성
model = tf_estimator.Estimator(model_fn=Base_Model, params={'learning_rate': 0.01, 'batch_size': 100})

Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpxpggxkit', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_work

In [12]:
"""
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

# Base_Model 함수를 KerasRegressor로 래핑하여 scikit-learn 추정기로 만듦
def Base_Model(learning_rate=0.01, units=64, activation='relu'):
    np.random.seed(42)
    tf.random.set_seed(42)
    
    global X, y, X_train, X_test, y_train, y_test, df
    
    y_train = y_train.astype('float64')
    y_test = y_test.astype('float64')
    
    
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=64, activation='relu', input_shape=(len(X_train_2.keys()),)),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.Dense(units=256, activation='relu'),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dense(units=256, activation='relu'),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.Dense(units=64, activation='relu'),
        tf.keras.layers.Dense(units=1)
    ])
    
    optimizer = tf.keras.optimizers.Adam(
                                        learning_rate=0.01,
                                        beta_1=0.9,
                                        beta_2=0.99,
                                        epsilon=1e-08
                                        )

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[EvalAccuracy()])
    
    return model

# KerasRegressor 객체 생성
regressor = KerasRegressor(build_fn=Base_Model, verbose=0)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'units': [32, 64, 128, 256],
    'activation': ['relu']
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring=make_scorer(mean_squared_error), cv=5)

# Grid Search 수행
grid_search.fit(X_train_scaled, y_train)

# 최적의 모델과 하이퍼파라미터 출력
print("Best Score:", grid_search.best_score_)
print("Best Parameters:", grid_search.best_params_)
"""

'\nimport numpy as np\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.wrappers.scikit_learn import KerasRegressor\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import make_scorer, mean_squared_error\n\n# Base_Model 함수를 KerasRegressor로 래핑하여 scikit-learn 추정기로 만듦\ndef Base_Model(learning_rate=0.01, units=64, activation=\'relu\'):\n    np.random.seed(42)\n    tf.random.set_seed(42)\n    \n    global X, y, X_train, X_test, y_train, y_test, df\n    \n    y_train = y_train.astype(\'float64\')\n    y_test = y_test.astype(\'float64\')\n    \n    \n    model = tf.keras.Sequential([\n        tf.keras.layers.Dense(units=64, activation=\'relu\', input_shape=(len(X_train_2.keys()),)),\n        tf.keras.layers.Dense(units=128, activation=\'relu\'),\n        tf.keras.layers.Dense(units=256, activation=\'relu\'),\n        tf.keras.layers.Dense(units=512, activation=\'relu\'),\n        tf.keras.layers.Dense(units=256, activation=\'relu\

In [91]:
# 베이스모델
import numpy as np

def Base_Model():
    
    np.random.seed(42)
    tf.random.set_seed(42)
    
    global X, y, X_train, X_test, y_train, y_test, df
    
    y_train = y_train.astype('float32')
    y_test = y_test.astype('float32')
    
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=1024, activation='relu', input_shape=(len(X_train_2.keys()),)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=1024, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=256, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=256, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(units=1)
    ])

    # 옵티마이저와 손실 함수 설정
    optimizer = tf.keras.optimizers.Adam(
                                        learning_rate=0.01,
                                        beta_1=0.9,
                                        beta_2=0.99,
                                        epsilon=1e-7
                                        )
    
    """                         
    # RMSprop 사용 예
    optimizer = tf.keras.optimizers.RMSprop(
                                            learning_rate=0.01,
                                            rho=0.9,
                                            momentum=0.1,
                                            epsilon=1e-07,
                                            centered=False,
                                            name="RMSprop")

    # SGD 사용 예
    optimizer = tf.keras.optimizers.SGD(
                                        learning_rate=0.01,
                                        momentum=0.0,
                                        nesterov=False,
                                        name="SGD")
    """                                        

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[EvalAccuracy()])
    
    return model

model = Base_Model()
model.summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_458 (Dense)           (None, 1024)              11264     
                                                                 
 batch_normalization_109 (Ba  (None, 1024)             4096      
 tchNormalization)                                               
                                                                 
 dropout_109 (Dropout)       (None, 1024)              0         
                                                                 
 dense_459 (Dense)           (None, 1024)              1049600   
                                                                 
 batch_normalization_110 (Ba  (None, 1024)             4096      
 tchNormalization)                                               
                                                                 
 dropout_110 (Dropout)       (None, 1024)            

In [92]:
# 학습 자동 중단 설정
es = EarlyStopping(monitor='loss', patience=50, mode='min')
rlrp = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=40, mode='min')

In [93]:
# 하이퍼파라미터 세팅
LEARNING_RATE = 0.01
EPOCHS = 1024
MB_SIZE = 5000 # (5000)
REPORT = 1
TRAIN_RATIO = 0.8

np.random.seed(42)
tf.random.set_seed(42)

# 모델 학습
history = model.fit(
  X_train_scaled, y_train,
  batch_size=MB_SIZE,
  validation_split = 0.2,
  verbose=1,
  epochs=EPOCHS,
  callbacks=[es, rlrp]
  )

Epoch 1/1024

  m.reset_state()


Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 7/1024
Epoch 8/1024
Epoch 9/1024
Epoch 10/1024
Epoch 11/1024
Epoch 12/1024
Epoch 13/1024
Epoch 14/1024
Epoch 15/1024
Epoch 16/1024
Epoch 17/1024
Epoch 18/1024
Epoch 19/1024
Epoch 20/1024
Epoch 21/1024
Epoch 22/1024
Epoch 23/1024
Epoch 24/1024
Epoch 25/1024
Epoch 26/1024
Epoch 27/1024
Epoch 28/1024
Epoch 29/1024
Epoch 30/1024
Epoch 31/1024
Epoch 32/1024
Epoch 33/1024
Epoch 34/1024
Epoch 35/1024
Epoch 36/1024
Epoch 37/1024
Epoch 38/1024
Epoch 39/1024
Epoch 40/1024
Epoch 41/1024
Epoch 42/1024
Epoch 43/1024
Epoch 44/1024
Epoch 45/1024
Epoch 46/1024
Epoch 47/1024
Epoch 48/1024
Epoch 49/1024
Epoch 50/1024
Epoch 51/1024
Epoch 52/1024
Epoch 53/1024
Epoch 54/1024
Epoch 55/1024
Epoch 56/1024
Epoch 57/1024
Epoch 58/1024
Epoch 59/1024
Epoch 60/1024
Epoch 61/1024
Epoch 62/1024
Epoch 63/1024
Epoch 64/1024
Epoch 65/1024
Epoch 66/1024
Epoch 67/1024
Epoch 68/1024
Epoch 69/1024
Epoch 70/1024
Epoch 71/1024
Epoch 72/1024
Epoch 73/1024


In [94]:
import matplotlib.pyplot as plt
"""
# 손실 그래프
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# 훈련 및 검증 평가 지표 추출
train_accuracy = history.history['eval_accuracy']
val_accuracy = history.history['val_eval_accuracy']

# 평가 지표 그래프 그리기
plt.plot(train_accuracy, label='Train Accuracy')
plt.plot(val_accuracy, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
"""
# 테스트 세트 평가
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print("Test Loss:", round(loss, 3))
print("Test Accuracy:", round(accuracy, 3))




  m.reset_state()


Test Loss: 6.407
Test Accuracy: 0.855


In [19]:
# 모델 가중치 및 편향 저장 (save_weights 메서드를 사용하면 가중치와 편향 모두를 저장)
model.save_weights('model_weights.h5')

# 다음과 같이 저장한 가중치 호출 가능
new_model = Base_Model()
new_model.load_weights('model_weights.h5')

In [20]:
# 테스트 세트 평가
loss, accuracy = new_model.evaluate(X_test_scaled, y_test)
print("Test Loss:", round(loss, 3))
print("Test Accuracy:", round(accuracy, 3))

Test Loss: 4.404
Test Accuracy: 0.882
