In [None]:
import pandas as pd
import tensorflow as tf
from keras import models
from keras import layers
import numpy as np
from keras.models import Sequential
from keras.utils import to_categorical

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, concatenate, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model

In [None]:
# Directory that holds the HAM10000 CSV exports. Kept in one place so that
# pointing the notebook at a different data location is a single-line change.
DATA_DIR = '/content/drive/MyDrive/딥실 데이터셋/archive'

# One row per image: 28 * 28 * 3 flattened RGB pixel columns plus 1 'label' column.
pixel_data = pd.read_csv(f'{DATA_DIR}/hmnist_28_28_RGB.csv')
# Per-image metadata table.
meta_data = pd.read_csv(f'{DATA_DIR}/HAM10000_metadata.csv')

In [None]:
# Disease abbreviations listed in label order: the position of each name is the
# integer label it stands for (0 -> "akiec", ..., 6 -> "mel").
disease_names = ["akiec", "bcc", "bkl", "df", "nv", "vasc", "mel"]

# Mapping from integer label to disease name.
label_to_disease = dict(enumerate(disease_names))

# Translate the integer labels to names, then count the rows per disease.
named_labels = pixel_data['label'].map(label_to_disease)
label_counts = named_labels.value_counts()

# Show the number of samples for each label.
print(label_counts)

label
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [None]:
# Display the metadata table as the cell's output to inspect its columns.
meta_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear
...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face


In [None]:
# Join the pixel values and the metadata side by side (column-wise, aligned on
# the row index).
df_concatenated = pd.concat((pixel_data, meta_data), axis=1)

# Move 'label' to the very end: take the column out, then assign it back as a
# new column, which appends it after all existing columns.
label_column = df_concatenated.pop('label')
df_concatenated['label'] = label_column

type(df_concatenated)

In [None]:
# Features: every column except the trailing 'label' (pixel columns followed by
# the metadata columns).
X = df_concatenated.iloc[:, :-1].values
# Target: the integer class label stored in the last column.
y = df_concatenated.iloc[:, -1].values

# Split into train and test sets at a 7:3 ratio.
# The classes are heavily imbalanced (the largest has 6705 samples, the
# smallest 115), so stratify on y to keep the class proportions the same in
# both splits instead of leaving the rare classes to chance.
# NOTE(review): the metadata shows several rows sharing one lesion_id, so
# images of the same lesion can land in both train and test. Consider a
# group-aware split if that leakage matters for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(type(df_concatenated.iloc[:, :-1]))

X.shape

<class 'pandas.core.frame.DataFrame'>


(10015, 2359)

In [None]:
# Normalize the pixel data to the 0-1 range.
# The first 28 * 28 * 3 = 2352 columns hold pixel values; the remaining columns
# are metadata and are left untouched.
N_PIXEL_COLS = 28 * 28 * 3


def _scale_pixels_inplace(split, n_pixel_cols=N_PIXEL_COLS):
    """Scale the pixel columns of `split` from 0-255 down to 0-1, in place.

    The array is modified in place because later cells read the pixel columns
    straight from X_train / X_test. Dividing unconditionally would shrink the
    values again every time this cell is re-run, so the division is skipped
    when the pixel columns already fit in the 0-1 range.

    Args:
        split: 2-D array whose first `n_pixel_cols` columns are pixel values.
        n_pixel_cols: number of leading columns that hold pixel values.
    """
    pixels = split[:, :n_pixel_cols].astype(np.float64)
    # A maximum above 1.0 means the data is still on the raw 0-255 scale.
    # (A split whose raw pixels never exceed 1 would be left unscaled; that
    # does not happen for real 8-bit images.)
    if pixels.size and pixels.max() > 1.0:
        split[:, :n_pixel_cols] = pixels / 255.0


_scale_pixels_inplace(X_train)
_scale_pixels_inplace(X_test)

In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)


def _conv_block(x, filters):
    """Apply one convolutional stage to `x`.

    The stage is two 3x3 'same'-padded ReLU convolutions, each followed by
    batch normalization, and then a 2x2 max pooling.

    Args:
        x: input tensor of the stage.
        filters: number of filters used by both convolutions.

    Returns:
        A `(features, pooled)` tuple: the tensor before pooling and the tensor
        after pooling.
    """
    for _ in range(2):
        x = Conv2D(filters, (3, 3), activation='relu', padding='same')(x)
        x = BatchNormalization()(x)
    return x, MaxPooling2D((2, 2))(x)


# Image input layer: 28x28 RGB images.
image_input = Input(shape=(28, 28, 3), name='image_input')

# Three convolutional stages with 32, 64 and 128 filters.
conv1, pool1 = _conv_block(image_input, 32)
conv2, pool2 = _conv_block(pool1, 64)
conv3, pool3 = _conv_block(pool2, 128)

# Classifier head: two fully connected layers, each followed by dropout and
# batch normalization.
flatten = Flatten()(pool3)
dense1 = Dense(512, activation='relu')(flatten)  # the larger fully connected layer
dropout1 = Dropout(0.5)(dense1)
batchnorm1 = BatchNormalization()(dropout1)

dense2 = Dense(256, activation='relu')(batchnorm1)
dropout2 = Dropout(0.5)(dense2)
batchnorm2 = BatchNormalization()(dropout2)

# Output layer (multi-class classification): one softmax probability per
# disease class, 7 classes in total.
output = Dense(7, activation='softmax')(batchnorm2)

# Build the model.
model = Model(inputs=image_input, outputs=output)

# Compile the model; categorical cross-entropy expects one-hot encoded targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 image_input (InputLayer)    [(None, 28, 28, 3)]       0         
                                                                 
 conv2d_6 (Conv2D)           (None, 28, 28, 32)        896       
                                                                 
 batch_normalization_8 (Bat  (None, 28, 28, 32)        128       
 chNormalization)                                                
                                                                 
 conv2d_7 (Conv2D)           (None, 28, 28, 32)        9248      
                                                                 
 batch_normalization_9 (Bat  (None, 28, 28, 32)        128       
 chNormalization)                                                
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 14, 14, 32)        0   

In [None]:
# Columns 0..2351 hold the image data; turn every row back into a 28x28 RGB image.
X_train_image = np.reshape(X_train[:, :2352], (-1, 28, 28, 3))
# One-hot encode the integer training labels into 7 class columns.
y_train_encoded = to_categorical(y_train, num_classes=7)

# Sanity checks: the container type of both arrays, then the first image.
print(type(X_train_image), type(y_train_encoded), sep='\n')
print(X_train_image[0])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[[[0.788235294117647 0.7137254901960784 0.8117647058823529]
  [0.788235294117647 0.7176470588235294 0.8156862745098039]
  [0.7843137254901961 0.7058823529411765 0.807843137254902]
  ...
  [0.8117647058823529 0.7215686274509804 0.8313725490196079]
  [0.8 0.7215686274509804 0.8235294117647058]
  [0.803921568627451 0.7098039215686275 0.8196078431372549]]

 [[0.7803921568627451 0.7176470588235294 0.807843137254902]
  [0.7843137254901961 0.7176470588235294 0.8156862745098039]
  [0.7843137254901961 0.7098039215686275 0.8117647058823529]
  ...
  [0.8117647058823529 0.7098039215686275 0.8235294117647058]
  [0.807843137254902 0.7176470588235294 0.8235294117647058]
  [0.8117647058823529 0.7176470588235294 0.8274509803921568]]

 [[0.788235294117647 0.7215686274509804 0.8156862745098039]
  [0.796078431372549 0.7372549019607844 0.8352941176470589]
  [0.792156862745098 0.7294117647058823 0.8235294117647058]
  ...
  [0.807843137254902 0.7058823529411765

In [None]:
# Report the dtype of each training array before any conversion.
print("X_train_image 타입:", X_train_image.dtype)
print("y_train_encoded 타입:", y_train_encoded.dtype)

# Cast both arrays to float32 in a single step.
X_train_image, y_train_encoded = (
    arr.astype(np.float32) for arr in (X_train_image, y_train_encoded)
)

X_train_image 타입: object
y_train_encoded 타입: float32


In [None]:
# Train the model, holding out 20% of the training samples for validation.
# Keep the returned History object: its `.history` dict has the per-epoch
# loss/accuracy needed to plot learning curves. Assigning it also keeps the
# bare History repr out of the cell output.
history = model.fit(
    X_train_image,
    y_train_encoded,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7bcbe0f2dd50>

In [None]:
# Columns 0..2351 hold the image data; rebuild the 28x28 RGB test images and
# cast them to float32.
X_test_image = X_test[:, :2352].reshape(-1, 28, 28, 3).astype(np.float32)

# One-hot encode y_test (7 classes) as float32.
y_test_encoded = to_categorical(y_test, num_classes=7).astype(np.float32)

# Evaluate on the held-out test set; returns the loss and the accuracy metric.
loss, accuracy = model.evaluate([X_test_image], y_test_encoded)

print("테스트 세트 손실:", loss)
print("테스트 세트 정확도:", accuracy)

테스트 세트 손실: 0.7090426087379456
테스트 세트 정확도: 0.7334442734718323
