In [3]:
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.datasets import mnist
from keras.utils import to_categorical
import numpy as np 
import pandas as pd

# CNN (딥러닝)

In [4]:
# Train and evaluate a small CNN on MNIST, then plot the training curves.

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling repeat between runs (GPU kernels may still be nondeterministic).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

NUM_CLASSES = 10          # digits 0-9
INPUT_SHAPE = (28, 28, 1)  # height, width, channels

# Load the dataset (downloaded and cached by Keras on first use).
(X_train, y_train), (X_test, y_test) = mnist.load_data()
print(X_train.shape) ## (60000, 28, 28): number of images, height, width

# Conv2D expects a trailing channel axis; MNIST is single-channel grayscale.
X_train = X_train.reshape(-1, *INPUT_SHAPE)
X_test = X_test.reshape(-1, *INPUT_SHAPE)

# Normalise the features: pixel intensities 0~255 -> 0~1 as float32.
X_train = X_train.astype('float32') / 255
X_test = X_test.astype('float32') / 255

# One-hot encode the target so it matches the softmax output / categorical loss.
Y_train = to_categorical(y_train, NUM_CLASSES)
Y_test = to_categorical(y_test, NUM_CLASSES)

# Build the model.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=INPUT_SHAPE),  # convolution
    Conv2D(32, (3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),  # 2x2 max pooling
    Dropout(0.25),
    Flatten(),  # flatten feature maps to a 1-D vector
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(NUM_CLASSES, activation='softmax'),
])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

# Compile the model.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
history = model.fit(X_train, Y_train, batch_size=32, epochs=10, verbose=1)

# Evaluate on the held-out test set.
score = model.evaluate(X_test, Y_test, verbose=0)
print(model.metrics_names)
print(score)

# Visualise the training metrics recorded per epoch.
acc = history.history['accuracy']
loss = history.history['loss']
epoch_range = range(1, len(acc)+1)

## Loss
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epoch_range, loss, 'b', label='Training loss')
ax_loss.set_title('Training loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

## Accuracy
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epoch_range, acc, 'b', label='Training acc')
ax_acc.set_title('Training accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Explicit figures + show(): no empty trailing figure is created.
plt.show()


(60000, 28, 28)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 conv2d_1 (Conv2D)           (None, 24, 24, 32)        9248      
                                                                 
 max_pooling2d (MaxPooling2  (None, 12, 12, 32)        0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 12, 12, 32)        0         
                                                                 
 flatten (Flatten)           (None, 4608)              0         
                                                                 
 dense (Dense)               (None, 128)               589952    
                                        

KeyboardInterrupt: 

# 결측치 처리

In [None]:
# Inspect and handle missing values in the sales data.
import seaborn as sns

df = pd.read_csv('salesdata.csv')

# Which columns contain nulls, and how many.
# Only the last expression of a cell is displayed automatically, so the
# intermediate results are printed explicitly.
df.info()
null_counts = df.isnull().sum()
print(null_counts)
null_ratio = (null_counts / df.shape[0]) * 100 ## null percentage per column
print(null_ratio)

# Unique values of every column.
for col in df.columns:
    print(col, df[col].unique())

# (Reference) basic visualisation: one Axes per plot so the two charts do
# not get drawn on top of each other.
fig, (ax_count, ax_hist) = plt.subplots(1, 2, figsize=(12, 4))
sns.countplot(x='BikeBuyer', data=df, ax=ax_count) ## countplot
sns.histplot(x='AvgMonthSpend', data=df, ax=ax_hist) ## histplot
plt.show()

# Drop rows that have a missing value in the given column.
# NOTE: 'column명' is a placeholder; replace it with a real column name,
# otherwise dropna raises KeyError.
df = df.dropna(subset=['column명'])

# Fill the remaining missing values.
# NOTE: '값' is a placeholder fill value; replace it with the intended one.
df = df.fillna('값')

# 레이블 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder

# 데이터 로드
items=['TV','냉장고','전자렌지','컴퓨터','선풍기','선풍기','믹서','믹서']

# 인코딩 변환
encoder = LabelEncoder() ## 인코더 초기화
encoder.fit(items) ## 학습
labels = encoder.transform(items) ## 문자열 -> 인코딩된 숫자로 변환
print('인코딩 변환값:',labels)

# 인코딩 클래스 확인
print('인코딩 클래스:',encoder.classes_)

# 디코딩
print('디코딩 원본 값:',encoder.inverse_transform([4, 5, 2, 0, 1, 1, 3, 3]))

# 원 핫 인코딩과 Min-Max 스케일링

In [None]:
# One-hot encoding followed by Min-Max scaling.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# One-hot encoding: keep the result (a bare mid-cell expression is neither
# stored nor displayed) and show it.
df = pd.DataFrame({'item':['TV','냉장고','전자렌지','컴퓨터','선풍기','선풍기','믹서','믹서'] })
df_onehot = pd.get_dummies(df)
print(df_onehot)

# Min-Max scaling: MinMaxScaler only accepts numeric input, so it is applied
# to the encoded frame; fitting it on the raw string column raises ValueError.
# The dummy columns are cast to float because get_dummies may return bool.
scaler = MinMaxScaler()
scaler_df_scaled_array = scaler.fit_transform(df_onehot.astype('float64'))
scaler_df_scaled = pd.DataFrame(
    scaler_df_scaled_array,
    columns=df_onehot.columns.tolist(),
    index=df_onehot.index,
)
print(scaler_df_scaled)

