In [1]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd 
import numpy as np 

import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import time

import lightgbm as lgbm
import sklearn
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD, PCA, FastICA, FactorAnalysis, KernelPCA, DictionaryLearning
from sklearn.decomposition import IncrementalPCA, LatentDirichletAllocation,MiniBatchSparsePCA, SparsePCA

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [4]:
df = pd.read_csv('/content/Regression_data.csv')
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
def check_missing_col(dataframe):
    missing_col = []
    counted_missing_col = 0
    for i, col in enumerate(dataframe.columns):
        missing_values = sum(dataframe[col].isna())
        is_missing = True if missing_values >= 1 else False
        if is_missing:
            counted_missing_col += 1
            print(f'결측치가 있는 컬럼은: {col}입니다')
            print(f'해당 컬럼에 총 {missing_values}개의 결측치가 존재합니다.')
            missing_col.append([col, dataframe[col].dtype])
    if counted_missing_col == 0:
        print('결측치가 존재하지 않습니다')
    return missing_col

missing_col = check_missing_col(df)

결측치가 존재하지 않습니다


In [6]:
#라벨인코딩을 하기 위함 dictionary map 생성 함수
def make_label_map(dataframe):
    label_maps = {}
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            label_map = {'unknown':0}
            for i, key in enumerate(dataframe[col].unique()):
                label_map[key] = i+1  #새로 등장하는 유니크 값들에 대해 1부터 1씩 증가시켜 키값을 부여해줍니다.
            label_maps[col] = label_map
    print(label_maps)
    return label_maps

In [7]:
# 각 범주형 변수에 인코딩 값을 부여하는 함수
def label_encoder(dataframe, label_map):
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            dataframe[col] = dataframe[col].map(label_map[col])
            dataframe[col] = dataframe[col].fillna(label_map[col]['unknown']) #혹시 모를 결측값은 unknown의 값(0)으로 채워줍니다.
    return dataframe

In [8]:
train_le = make_label_map(df[['Sex']])
Sex_df = label_encoder(df[['Sex']], train_le)
df['Sex'] = Sex_df[['Sex']]

{'Sex': {'unknown': 0, 'M': 1, 'F': 2, 'I': 3}}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[col] = dataframe[col].map(label_map[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[col] = dataframe[col].fillna(label_map[col]['unknown']) #혹시 모를 결측값은 unknown의 값(0)으로 채워줍니다.


In [9]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [10]:
X = df.drop(['Rings'], axis = 1)
y = df['Rings']

In [11]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
X_train.shape,X_test.shape

((3341, 8), (836, 8))

In [13]:
X_train

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
4038,3,0.550,0.445,0.125,0.6720,0.2880,0.1365,0.210
1272,3,0.475,0.355,0.100,0.5035,0.2535,0.0910,0.140
3384,2,0.305,0.225,0.070,0.1485,0.0585,0.0335,0.045
3160,3,0.275,0.200,0.065,0.1165,0.0565,0.0130,0.035
3894,1,0.495,0.380,0.135,0.6295,0.2630,0.1425,0.215
...,...,...,...,...,...,...,...,...
3444,2,0.490,0.400,0.115,0.5690,0.2560,0.1325,0.145
466,2,0.670,0.550,0.190,1.3905,0.5425,0.3035,0.400
3092,1,0.510,0.395,0.125,0.5805,0.2440,0.1335,0.188
3772,1,0.575,0.465,0.120,1.0535,0.5160,0.2185,0.235


In [41]:
# 정규화
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
X_train_scaled[0]

array([ 1.26075882,  0.21017452,  0.36972535, -0.34795017, -0.32316418,
       -0.32540694, -0.40512998, -0.21213236])

In [14]:
import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras import regularizers

In [33]:
from keras import models
from keras import layers

def build_model():
  model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[len(X_train.keys())]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

In [34]:
model = build_model()
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 64)                576       
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 4,801
Trainable params: 4,801
Non-trainable params: 0
_________________________________________________________________


In [37]:
example_batch = X_train_scaled[:10]
example_result = model.predict(example_batch)
example_result



array([[-0.17074896],
       [-0.08182038],
       [ 0.40308717],
       [ 0.4025413 ],
       [ 0.05247262],
       [-0.10739589],
       [ 0.21327458],
       [-0.17530006],
       [-0.08486304],
       [ 0.11653577]], dtype=float32)

In [38]:
# 파라미터 저장 경로를 설정하는 코드입니다.

checkpoint_filepath = "FMbest.hdf5" # .hdf5는 데이터를 저장하는 확장자입니다. 'FMbest.hdf5' 파일로 최고 성능의 모델을 저장하며 .h5와 동일한 확장자입니다.

# moitor : 개선을 확인할 지표
# min_delta : 개선이 있다고 판단되는 최소 변경값. min_delta보다 변경 사항이 작다면 개선이 없다고 판단(default=0)
# patience : 개선이 없을 경우 개선되기까지 기다리는 epochs의 수
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)

In [39]:
# ModelCheckpoint를 통해 모델을 저장합니다.

# filepath : 파일이 저장될 경로. 앞서 설정한 것처럼 'FMbest.hdf5'에 저장됩니다.
# save_best_only : 최고 성능을 보이는 경우만 저장합니다. False일 경우 filepath에 모든 학습 과정이 저장됩니다.
# save_weights_only : 모델의 가중치만 저장합니다. False일 경우 레이어까지 모두 저장합니다.
# mode : 검증 지표가 val_acc일 경우 정확도이기 때문에 높을 수록 좋기 때문에 'max'로 설정, val_loss일 경우 낮을 수록 좋기 때문에 'min'으로 설정, 'auto'의 경우 자동으로 탐지하여 진행함.
# save_freq : 매 에폭마다 저장을 시도합니다. integer로 설정할 경우 설정한 수만큼의 iteration마다 모델을 저장합니다.
save_best = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, monitor='val_loss', verbose=1, save_best_only=True,
    save_weights_only=True, mode='auto', save_freq='epoch', options=None)

In [42]:
model.fit(X_train_scaled, y_train, batch_size=10, epochs=100, verbose=1, 
          validation_data=(X_test_scaled,y_test), # 검증 데이터
          callbacks=[early_stop, save_best]) # 앞서 선언한 Early Stopping과 Model Checkpoint를 callbacks를 통해 설정합니다.

Epoch 1/100
Epoch 1: val_loss improved from inf to 5.88671, saving model to FMbest.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 5.88671 to 5.21254, saving model to FMbest.hdf5
Epoch 3/100
Epoch 3: val_loss did not improve from 5.21254
Epoch 4/100
Epoch 4: val_loss improved from 5.21254 to 4.72984, saving model to FMbest.hdf5
Epoch 5/100
Epoch 5: val_loss did not improve from 4.72984
Epoch 6/100
Epoch 6: val_loss did not improve from 4.72984
Epoch 7/100
Epoch 7: val_loss did not improve from 4.72984
Epoch 8/100
Epoch 8: val_loss did not improve from 4.72984
Epoch 9/100
Epoch 9: val_loss did not improve from 4.72984
Epoch 10/100
Epoch 10: val_loss did not improve from 4.72984
Epoch 11/100
Epoch 11: val_loss improved from 4.72984 to 4.51706, saving model to FMbest.hdf5
Epoch 12/100
Epoch 12: val_loss did not improve from 4.51706
Epoch 13/100
Epoch 13: val_loss did not improve from 4.51706
Epoch 14/100
Epoch 14: val_loss did not improve from 4.51706
Epoch 15/100
Epoch 15: val_loss impr

<keras.callbacks.History at 0x7fb477e16080>

In [43]:
model.predict(X_test[0:1]) # 테스트 실행



array([[14.1503315]], dtype=float32)

In [44]:
test_loss, test_acc = model.evaluate(X_test_scaled,  y_test, verbose=1) # 테스트 지표 확인



ValueError: ignored