In [10]:
import pandas as pd
import tensorflow as tf
from keras import models
from keras import layers
import numpy as np
from keras.models import Sequential
from keras.utils import to_categorical

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, concatenate, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model

In [11]:
pixel_data = pd.read_csv('/content/drive/MyDrive/딥실 데이터셋/archive/hmnist_28_28_RGB.csv') # 28 * 28 * 3 + 1(레이블 정보)
meta_data = pd.read_csv('/content/drive/MyDrive/딥실 데이터셋/archive/HAM10000_metadata.csv')

In [12]:
# 레이블과 질병명 매핑 딕셔너리
label_to_disease = {
    0: "akiec",
    1: "bcc",
    2: "bkl",
    3: "df",
    4: "nv",
    5: "vasc",
    6: "mel"
}

label_counts = pixel_data['label'].map(label_to_disease).value_counts()

# 레이블별로 데이터 개수 확인
print(label_counts)

label
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [13]:
meta_data.isnull().sum()

lesion_id        0
image_id         0
dx               0
dx_type          0
age             57
sex              0
localization     0
dtype: int64

### ⬆︎ 나이만 결측치 있는 것을 확인

In [14]:
# 결측치 있는 행만 추출
null_row = meta_data[meta_data['age'].isnull()]

In [15]:
# 'age' 결측치인 부분 평균값으로 채우기
mean_age = meta_data['age'].mean()
meta_data['age'].fillna(mean_age, inplace=True)

In [16]:
scaler = MinMaxScaler()

# 'age' 값을 0과 1 사이의 값으로 정규화
meta_data['age_normalized'] = scaler.fit_transform(meta_data['age'].values.reshape(-1, 1))

In [17]:
one_hot_encoder_sex = OneHotEncoder()
one_hot_encoder_local = OneHotEncoder()
label_encoder = LabelEncoder()

# 'sex', 'localization' 2차원 배열로 변환
sex_data = meta_data['sex'].values.reshape(-1, 1)
localization_data = meta_data['localization'].values.reshape(-1, 1)

# 원핫 인코딩 수행
sex_encoded = one_hot_encoder_sex.fit_transform(sex_data)
localization_encoded = one_hot_encoder_local.fit_transform(localization_data)

# 인코딩 결과를 데이터프레임으로 변환
sex_columns = one_hot_encoder_sex.get_feature_names_out(['sex'])
meta_data[sex_columns] = pd.DataFrame(sex_encoded.toarray(), columns=sex_columns)

localization_columns = one_hot_encoder_local.get_feature_names_out(['localization'])
meta_data[localization_columns] = pd.DataFrame(localization_encoded.toarray(), columns=localization_columns)

meta_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,age_normalized,sex_female,sex_male,...,localization_face,localization_foot,localization_genital,localization_hand,localization_lower extremity,localization_neck,localization_scalp,localization_trunk,localization_unknown,localization_upper extremity
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,0.941176,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,0.941176,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,0.941176,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,0.941176,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,0.882353,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,0.470588,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,0.470588,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,0.470588,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,0.941176,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# 메타 데이터에서 필요한 열만 선택
selected_columns = meta_data.columns[7:]

# 픽셀 데이터랑 전처리 마친 메타 데이터 합치기
df_concatenated = pd.concat([pixel_data, meta_data[selected_columns]], axis=1)

# 'label'열 맨 뒤로 옮기기
label_column = df_concatenated.pop('label')
df_concatenated.insert(len(df_concatenated.columns), 'label', label_column)

df_concatenated

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,localization_foot,localization_genital,localization_hand,localization_lower extremity,localization_neck,localization_scalp,localization_trunk,localization_unknown,localization_upper extremity,label
0,192,153,193,195,155,192,197,154,185,202,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2
1,25,14,30,68,48,75,123,93,126,158,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2
2,192,138,153,200,145,163,201,142,160,206,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2
3,38,19,30,95,59,72,143,103,119,171,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2
4,158,113,139,194,144,174,215,162,191,225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10010,183,165,181,182,165,180,184,166,182,188,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10011,2,3,1,38,33,32,121,104,103,132,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10012,132,118,118,167,149,149,175,156,160,184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10013,160,124,146,164,131,152,167,127,146,169,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [19]:
X = df_concatenated.iloc[:, :-1].values # label 데이터 제외
y = df_concatenated.iloc[:, -1].values  # label 데이터

# train set이랑 test set 7:3 비율로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X.shape

(10015, 2371)

In [20]:
# 픽셀 데이터 정규화
X_train[:2352] = X_train[:2352].astype(np.float64) / 255.0
X_test[:2352] = X_test[:2352].astype(np.float64) / 255.0

In [23]:
# 이미지 입력 레이어
image_input = Input(shape=(28, 28, 3), name='image_input')
conv1 = Conv2D(32, (3, 3), activation='relu')(image_input)
conv1 = BatchNormalization()(conv1)
conv2 = Conv2D(64, (3, 3), activation='relu')(conv1)  # 추가된 합성곱 층
conv2 = BatchNormalization()(conv2)
pool1 = MaxPooling2D((2, 2))(conv2)
conv3 = Conv2D(128, (3, 3), activation='relu')(pool1)  # 추가된 합성곱 층
conv3 = BatchNormalization()(conv3)
pool2 = MaxPooling2D((2, 2))(conv3)
flatten1 = Flatten()(pool2)

# 나이 입력 레이어
age_input = Input(shape=(1,), name='age_input')

# 성별 입력 레이어
sex_input = Input(shape=(3,), name='sex_input')

# 발병위치 입력 레이어
localization_input = Input(shape=(15,), name='localization_input')

# 모든 입력을 결합하는 레이어
multi_input = concatenate([flatten1, age_input, sex_input, localization_input])

# Fully Connected Layer
dense1 = Dense(256, activation='relu')(multi_input)
dropout1 = Dropout(0.5)(dense1)

# 출력 레이어 (다중 클래스 분류)
output = Dense(7, activation='softmax')(dropout1)  # 7가지 질병 클래스 중 하나 예측

# 모델 생성
model = Model(inputs=[image_input, age_input, sex_input, localization_input], outputs=output)

# 모델 컴파일
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 모델 요약 출력
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 image_input (InputLayer)    [(None, 28, 28, 3)]          0         []                            
                                                                                                  
 conv2d_6 (Conv2D)           (None, 26, 26, 32)           896       ['image_input[0][0]']         
                                                                                                  
 batch_normalization_6 (Bat  (None, 26, 26, 32)           128       ['conv2d_6[0][0]']            
 chNormalization)                                                                                 
                                                                                                  
 conv2d_7 (Conv2D)           (None, 24, 24, 64)           18496     ['batch_normalization_6[

In [25]:
# 이미지 데이터의 경우 0~2351 열까지는 이미지 데이터
X_train_image = X_train[:, :2352].reshape(-1, 28, 28, 3)

# 나이 데이터는 2352번째 열
X_train_age = X_train[:, 2352].reshape(-1, 1)

# 성별 데이터는 2353~2355번째 열
X_train_sex = X_train[:, 2353:2356]

# 발병위치 데이터는 2356~2369번째 열
X_train_localization = X_train[:, 2356:]

y_train_encoded = to_categorical(y_train, num_classes=7)

In [26]:
# 모델을 훈련시킴
model.fit([X_train_image, X_train_age, X_train_sex, X_train_localization], y_train_encoded,
          epochs=20,
          batch_size=32,
          validation_split=0.1,
         )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7fc67de6af50>

In [27]:
# 이미지 데이터의 경우 0~2351 열까지는 이미지 데이터
X_test_image = X_test[:, :2352].reshape(-1, 28, 28, 3)

# 나이 데이터는 2352번째 열
X_test_age = X_test[:, 2352].reshape(-1, 1)

# 성별 데이터는 2353~2355번째 열 (One-Hot Encoding된 형태로 입력)
X_test_sex = X_test[:, 2353:2356]

# 발병위치 데이터는 2356~2369번째 열 (One-Hot Encoding된 형태로 입력)
X_test_localization = X_test[:, 2356:]

# y_train을 원-핫 인코딩
y_test_encoded = to_categorical(y_test, num_classes=7)

loss, accuracy = model.evaluate([X_test_image, X_test_age, X_test_sex, X_test_localization], y_test_encoded)

print("테스트 세트 손실:", loss)
print("테스트 세트 정확도:", accuracy)

테스트 세트 손실: 0.8398354649543762
테스트 세트 정확도: 0.6991680264472961
