In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def load_dataset(csv_path, TRAIN_RATIO=0.8):
    """Read a CSV file and one-hot encode its ``Sex`` column.

    Args:
        csv_path: Location of the CSV file; passed straight to ``pd.read_csv``.
        TRAIN_RATIO: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The loaded DataFrame with ``Sex`` replaced by one dummy column per
        category (``Sex_<value>``).
    """
    raw_frame = pd.read_csv(csv_path)
    return pd.get_dummies(raw_frame, columns=['Sex'])

# Raw string literal: the Windows path contains backslash sequences (\p, \T,
# \c, \D, \R) that are invalid escapes in a normal string literal and produce
# a DeprecationWarning/SyntaxWarning on recent Python versions. The resulting
# string value is exactly the same as before.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs where this file exists.
csv_path = r'D:\project\Teamproject1\colabo\Data\Regression_data.csv'
df = load_dataset(csv_path)

In [2]:
# Preview the first rows of the loaded, one-hot-encoded frame.
df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


In [3]:
# Correlation of every column with the 'Rings' column, sorted from the
# largest value to the smallest.
df.corr()['Rings'].sort_values(ascending=False)

Rings             1.000000
Shell weight      0.627574
Diameter          0.574660
Height            0.557467
Length            0.556720
Whole weight      0.540390
Viscera weight    0.503819
Shucked weight    0.420884
Sex_F             0.250279
Sex_M             0.181831
Sex_I            -0.436063
Name: Rings, dtype: float64

In [4]:
# Keep six predictor columns plus the 'Rings' column in one frame; 'Rings' is
# split off into `target` and dropped from `features` in the following cells.
# NOTE(review): these look like the columns whose correlation with 'Rings' is
# above 0.5 in the ranking printed earlier — confirm that was the criterion.
features = df[['Shell weight', 'Diameter', 'Height', 'Length', 'Whole weight', 'Viscera weight','Rings']]
features.head()

Unnamed: 0,Shell weight,Diameter,Height,Length,Whole weight,Viscera weight,Rings
0,0.15,0.365,0.095,0.455,0.514,0.101,15
1,0.07,0.265,0.09,0.35,0.2255,0.0485,7
2,0.21,0.42,0.135,0.53,0.677,0.1415,9
3,0.155,0.365,0.125,0.44,0.516,0.114,10
4,0.055,0.255,0.08,0.33,0.205,0.0395,7


In [5]:
# Pull the 'Rings' column out as the regression target. This has to run while
# 'Rings' is still a column of `features`.
target = features['Rings']
target.head()

0    15
1     7
2     9
3    10
4     7
Name: Rings, dtype: int64

In [6]:
# Remove the target column so `features` holds predictors only.
# errors='ignore' keeps this cell idempotent: re-running it after 'Rings' has
# already been dropped is a no-op instead of raising KeyError.
features = features.drop(columns=['Rings'], errors='ignore')
features.head()

Unnamed: 0,Shell weight,Diameter,Height,Length,Whole weight,Viscera weight
0,0.15,0.365,0.095,0.455,0.514,0.101
1,0.07,0.265,0.09,0.35,0.2255,0.0485
2,0.21,0.42,0.135,0.53,0.677,0.1415
3,0.155,0.365,0.125,0.44,0.516,0.114
4,0.055,0.255,0.08,0.33,0.205,0.0395


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [8]:
# Standardize the data: subtract each column's mean and divide by its sample
# standard deviation (ddof=1).
standardized_data = (features - features.mean(axis=0)) / features.std(axis=0, ddof=1)
print('\n standardized_data: \n', standardized_data)


 standardized_data: 
       Shell weight  Diameter    Height    Length  Whole weight  Viscera weight
0        -0.638140 -0.432097 -1.064297 -0.574489     -0.641821       -0.726125
1        -1.212842 -1.439757 -1.183837 -1.448812     -1.230130       -1.205077
2        -0.207114  0.122116 -0.107978  0.050027     -0.309432       -0.356647
3        -0.602222 -0.432097 -0.347058 -0.699393     -0.637743       -0.607527
4        -1.320599 -1.540523 -1.422916 -1.615350     -1.271933       -1.287183
...            ...       ...       ...       ...           ...             ...
4172      0.073053  0.424414  0.609261  0.341468      0.118799        0.532836
4173      0.155666  0.323648 -0.107978  0.549640      0.279896        0.309325
4174      0.496895  0.676328  1.565580  0.632909      0.708127        0.975296
4175      0.410690  0.777094  0.250642  0.841081      0.541933        0.733540
4176      1.840260  1.482456  1.326500  1.548867      2.283407        1.787235

[4177 rows x 6 columns]


In [9]:
# Covariance between the standardized columns. The frame is transposed before
# the call so that each feature column becomes one row of the np.cov input.
covariance_matrix = np.cov(standardized_data.T)
print('\n covariance_matrix: \n', covariance_matrix)


 covariance_matrix: 
 [[1.         0.90532978 0.81733801 0.89770557 0.95535544 0.90765632]
 [0.90532978 1.         0.83368369 0.98681158 0.9254521  0.89972443]
 [0.81733801 0.83368369 1.         0.82755361 0.81922077 0.7983193 ]
 [0.89770557 0.98681158 0.82755361 1.         0.92526117 0.9030177 ]
 [0.95535544 0.9254521  0.81922077 0.92526117 1.         0.96637508]
 [0.90765632 0.89972443 0.7983193  0.9030177  0.96637508 1.        ]]


In [10]:
# The covariance matrix is symmetric, so use eigh (the symmetric/Hermitian
# solver): it returns real eigenvalues in a defined order, whereas eig makes
# no ordering guarantee.
values, vectors = np.linalg.eigh(covariance_matrix)
# eigh sorts eigenvalues ascending; reverse both outputs so that column 0 of
# `vectors` belongs to the largest eigenvalue (PC1), column 1 to the next, etc.
# Eigenvector signs are arbitrary and may differ from those eig would return.
values, vectors = values[::-1], vectors[:, ::-1]
print('\n Eigenvalues: \n', values)
print('\n Eigenvectors: \n', vectors)


 Eigenvalues: 
 [5.46154126 0.25267625 0.16081686 0.09196261 0.02028369 0.01271932]

 Eigenvectors: 
 [[ 0.41025434 -0.16544053 -0.34610057 -0.75404912  0.3317057  -0.07691848]
 [ 0.4153573  -0.04955905  0.55945029 -0.05589457  0.08254681  0.70858973]
 [ 0.37982879  0.90351254 -0.1788962   0.08347683 -0.01823929 -0.00950114]
 [ 0.41458886 -0.07121396  0.57745471  0.03748422 -0.02249438 -0.69834064]
 [ 0.4184799  -0.25353355 -0.2889563   0.04915081 -0.81886488  0.06437445]
 [ 0.40974435 -0.29066773 -0.34391312  0.64613966  0.46019451  0.00837563]]


In [11]:
# Dot product of each of the first two eigenvector columns with itself (its
# squared Euclidean norm); a value of 1 means the vector has unit length.
print('pc1의 크기:', np.dot(vectors[:,0], vectors[:,0]))
print('pc2의 크기:', np.dot(vectors[:,1], vectors[:,1]))

pc1의 크기: 1.0
pc2의 크기: 0.9999999999999999


In [12]:
# Standardize the predictors with scikit-learn's scaler.
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Fit a two-component PCA on the standardized data, then project the same
# data onto those two components.
pca = PCA(n_components=2).fit(features_std)
B = pca.transform(features_std)
print("\n Projected Data: \n", B)


 Projected Data: 
 [[-1.65001434 -0.41997118]
 [-3.1548396  -0.03227701]
 [-0.33018517  0.10922144]
 ...
 [ 2.03802401  0.79079777]
 [ 1.46268974 -0.29054491]
 [ 4.20508229 -0.38817233]]


In [13]:
# Fraction of the total variance explained by each of the two principal components
pca.explained_variance_ratio_

array([0.91025688, 0.04211271])

In [14]:
# Combined fraction of variance explained by the two components
pca.explained_variance_ratio_.sum()

0.9523695853568099

In [15]:
# Wrap the projected array in a DataFrame and label the two component columns.
df2 = pd.DataFrame(B).rename(columns={0: '1st', 1: '2nd'})
df2.head()

Unnamed: 0,1st,2nd
0,-1.650014,-0.419971
1,-3.15484,-0.032277
2,-0.330185,0.109221
3,-1.364298,0.195583
4,-3.451927,-0.179161


In [16]:
# 메서드 정의
import tensorflow as tf
def main():
    """Seed NumPy and TensorFlow, then train and evaluate the regression model.

    The seeds are fixed so that repeated runs start from the same random state.
    Training, evaluation, and printing are done by ``Regression_Model``.
    """
    # Fix the random seeds
    np.random.seed(42)
    tf.random.set_seed(42)

    # No `global` declaration is needed here: this function assigns no
    # module-level names, and Regression_Model reads `df2` / `target` itself.
    Regression_Model()  # regression model

In [17]:
import tensorflow as tf

class EvalAccuracy(tf.keras.metrics.Metric):
    """Streaming regression "accuracy": 1 minus the mean absolute relative error.

    For every sample the relative error ``|y_predict - y_true| / |y_true|`` is
    accumulated; ``result()`` returns ``1 - (sum of errors / sample count)``.

    Note: a target value of 0 makes the relative error a division by zero, so
    the metric is only meaningful for non-zero targets.
    """

    def __init__(self, name="eval_accuracy", **kwargs):
        super().__init__(name=name, **kwargs)
        # Despite its name, `correct` holds the running SUM of relative errors.
        self.correct = self.add_weight(name="ctp", initializer="zeros")
        # Number of samples (first-axis entries) seen so far.
        self.total = self.add_weight(name="total", initializer="zeros")

    def update_state(self, y_true, y_predict, sample_weight=None):
        """Accumulate the relative errors of one batch.

        Args:
            y_true: Ground-truth targets; any numeric dtype, with the same
                number of elements as ``y_predict``.
            y_predict: Model predictions.
            sample_weight: Accepted for API compatibility; ignored.
        """
        y_predict = tf.convert_to_tensor(y_predict)
        # TensorFlow does not promote dtypes automatically, so cast the labels
        # to the prediction dtype instead of requiring callers to pre-cast.
        y_true = tf.cast(y_true, y_predict.dtype)
        # Align ranks: (batch,) labels against (batch, 1) predictions would
        # otherwise broadcast to (batch, batch) and corrupt the sum.
        y_true = tf.reshape(y_true, tf.shape(y_predict))

        value = tf.abs((y_predict - y_true) / y_true)
        self.correct.assign_add(tf.cast(tf.reduce_sum(value), self.correct.dtype))
        self.total.assign_add(tf.cast(tf.shape(y_true)[0], self.total.dtype))

    def result(self):
        """Return ``1 - mean relative error`` over everything seen so far."""
        return 1 - (self.correct / self.total)

    def reset_state(self):
        """Zero both accumulators (called by Keras between epochs/evaluations)."""
        self.correct.assign(0.)
        self.total.assign(0.)

    # Alias for the older method name, which newer Keras releases deprecate in
    # favour of `reset_state`; kept so code calling the old name still works.
    reset_states = reset_state

In [19]:
# 베이스모델
def Base_Model(LEARNING_RATE=0.01, input_dim=None):
    """Build and compile the baseline 128-64-1 dense regression network.

    Args:
        LEARNING_RATE: Learning rate for the SGD optimizer.
        input_dim: Number of input features. When omitted, falls back to the
            column count of the notebook-level ``df2`` frame, which is what
            this function always used before the parameter existed.

    Returns:
        A compiled ``tf.keras.Sequential`` model with mean-squared-error loss
        and the custom ``EvalAccuracy`` metric.
    """
    import tensorflow as tf

    if input_dim is None:
        input_dim = len(df2.keys())

    model = tf.keras.Sequential([
            # input_shape must be a tuple, hence the trailing comma
            tf.keras.layers.Dense(units=128, activation='relu', input_shape=(input_dim,)),
            tf.keras.layers.Dense(units=64, activation='relu'),
            tf.keras.layers.Dense(units=1, activation= 'linear')
        ])
    # SGD: the optimizer that applies plain (stochastic) gradient descent
    optimizer = tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)
    # The hand-written EvalAccuracy metric is used as the evaluation metric
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[EvalAccuracy()])
    return model

# Build the baseline model with its default learning rate and print its
# layer summary.
model = Base_Model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               384       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 8,705
Trainable params: 8,705
Non-trainable params: 0
_________________________________________________________________


In [20]:
def Regression_Model(LEARNING_RATE=0.001, EPOCH_COUNT=100, MB_SIZE=50, REPORT=1, TRAIN_RATIO=0.8):
    """Train and evaluate a dense regression network on the PCA features.

    Reads the notebook-level ``df2`` (inputs) and ``target`` (labels), splits
    them into train/test sets, trains with SGD on mean-squared-error loss, and
    prints the test-set ``EvalAccuracy`` value and loss.

    Args:
        LEARNING_RATE: Learning rate for the SGD optimizer.
        EPOCH_COUNT: Number of training epochs.
        MB_SIZE: Mini-batch size.
        REPORT: ``verbose`` level passed to ``model.fit``.
        TRAIN_RATIO: Fraction of the rows used for training.
    """
    from sklearn.model_selection import train_test_split

    # Split into training and test sets (fixed random_state for a repeatable split)
    X = df2
    y = target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=TRAIN_RATIO, random_state=83)

    # TensorFlow tensor ops do not convert dtypes automatically, so the labels
    # are cast to float32 to match what the custom EvalAccuracy metric's
    # update_state expects.
    y_train = y_train.astype('float32')
    y_test = y_test.astype('float32')

    # Build the model
    model = tf.keras.Sequential([
        # input_shape must be a tuple, hence the trailing comma
        tf.keras.layers.Dense(units=256, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.Dense(units=64, activation='relu'),
        tf.keras.layers.Dense(units=32, activation='relu'),
        tf.keras.layers.Dense(units=16, activation='relu'),
        tf.keras.layers.Dense(units=1)
    ])

    # Configure the optimizer and loss function
    # SGD: the optimizer that applies plain (stochastic) gradient descent
    optimizer = tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)
    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[EvalAccuracy()])

    # Train
    model.fit(X_train, y_train, epochs=EPOCH_COUNT, batch_size=MB_SIZE, verbose=REPORT)

    # Evaluate on the held-out set; evaluate() returns [loss, metric] in the
    # order given to compile().
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f'Accuracy: {accuracy}\n MSE: {loss}')

In [21]:
# Run the pipeline: seed the random generators, then train and evaluate the
# regression model.
main()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

  'consistency.' % (self.__class__.__name__,))


Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoc

  'consistency.' % (self.__class__.__name__,))
