# **라이브러리 로드**

In [None]:
import pandas as pd
import numpy as np
import tarfile
import pickle
import os


# 데이터 전처리 패키지
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

#모델 패키지
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Dropout

# 시각화 패키지
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import seaborn as sns

# **차원 축소 - 변수 추출법**

## **1. 주성분 분석(PCA)**

### **데이터 불러오기**

breast-cancer.csv
- 유방 종양의 30개 물리적 특성 관련 지표
- 목표 변수 : Diagnosis(악성M/양성B)

In [None]:
# Read the breast-cancer table and split it into predictors and target.
data = pd.read_csv("breast-cancer.csv")

# Everything except the id, the target and the 'Unnamed: 32' column is used as
# a feature; errors='ignore' lets the drop succeed if a listed column is absent.
X_canc = data.drop(
    columns=['id', 'diagnosis', 'Unnamed: 32'],
    errors='ignore',
)
y_canc = data['diagnosis']

### **시각화 함수**

In [None]:
def plot_labelled_scatter(X, y, class_labels,s):
    """Draw a 2-D scatter plot of ``X`` coloured by class, with a legend.

    Parameters
    ----------
    X : array-like, shape (n_samples, >= 2)
        Points to plot; only the first two columns are used.
    y : array-like, shape (n_samples,)
        Class value of every sample.
    class_labels : sequence
        Legend entries. Entry ``k`` is drawn with palette colour ``k``; the
        built-in palette has four colours.
    s : tuple
        Figure size forwarded to ``plt.figure(figsize=...)``.

    Notes
    -----
    When every value in ``y`` appears in ``class_labels``, samples are encoded
    by their position in ``class_labels`` so that point colours and legend
    entries always agree. Otherwise (e.g. ``y`` holds integer codes while
    ``class_labels`` holds display names) the values are encoded with
    ``LabelEncoder``, i.e. in sorted order of the distinct values, and the
    caller must list ``class_labels`` in that same order.
    """
    X = np.asarray(X)
    legend_labels = list(class_labels)
    num_labels = len(legend_labels)

    # Encode by legend position first: this keeps colour k == legend entry k
    # regardless of the alphabetical order LabelEncoder would impose.
    position = {label: idx for idx, label in enumerate(legend_labels)}
    try:
        y_encoded = np.array([position[value] for value in np.asarray(y).tolist()])
    except (KeyError, TypeError):
        # y is not expressed in terms of the legend entries: fall back to the
        # sorted-unique encoding.
        y_encoded = LabelEncoder().fit_transform(y)

    # Pad the axis limits by one unit around the data.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    color_array = ['#FFFF00', '#00AAFF', '#000000', '#FF00AA']
    cmap_bold = ListedColormap(color_array)
    # One colour bin per class code: [0, 1), [1, 2), ...
    bnorm = BoundaryNorm(np.arange(0, num_labels + 1, 1), ncolors=num_labels)
    plt.figure(figsize=s)

    plt.scatter(X[:, 0], X[:, 1], s=80, c=y_encoded, cmap=cmap_bold, norm=bnorm,
                alpha=0.4, edgecolor='black', lw=1)
    sp = plt.gca().spines
    sp['top'].set_visible(False)
    sp['right'].set_visible(False)

    plt.grid(which='both', color='lightslategrey', alpha=0.3)

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

    # Legend patch k uses the same palette entry as class code k.
    h = [mpatches.Patch(color=color_array[c], label=legend_labels[c])
         for c in range(num_labels)]
    plt.legend(handles=h, fontsize=15, frameon=False)

### **1-1. PCA 적용**

In [None]:
# Standardise every feature before PCA: PCA is driven by variance, so features
# on larger scales would otherwise dominate the components.
canc_norm = StandardScaler().fit_transform(X_canc)

# Fit a 2-component PCA and project the standardised data onto it.
pca = PCA(n_components=2).fit(canc_norm)
canc_pca = pca.transform(canc_norm)

print('Number of Features in Breast Cancer DataSet Before PCA : {}\n\nNumber of Features in Breast Cancer DataSet After PCA : {}'
      .format(X_canc.shape[1], canc_pca.shape[1]))

### **1-2 PCA 결과 시각화**

In [None]:
# Scatter the two principal components, coloured by diagnosis.
# The legend labels are listed in sorted order ('B' before 'M') because that is
# the order in which label encoding assigns class codes; passing ['M', 'B']
# would attach each name to the other class's colour.
plot_labelled_scatter(canc_pca, y_canc, ['B', 'M'], (15, 9))

# Axis labels and title for the figure the helper just drew.
plt.xlabel('First principal component', fontsize=15)
plt.ylabel('Second principal component', fontsize=15)
plt.title('Breast Cancer Dataset PCA (n_components = 2)', fontsize=17)

### **1-3 PCA 결과와 기존 feature의 상관관계 확인**

In [None]:
# Heat-map of the PCA components: one row per component, one column per
# original feature, so each cell shows how strongly a feature contributes.
fig = plt.figure(figsize=(20, 9))
loadings = pca.components_
plt.imshow(loadings, interpolation='none', cmap='viridis')
feature_names = X_canc.columns.tolist()

# Label the columns with feature names and the two rows with the components.
axes = plt.gca()
axes.set_xticks(np.arange(len(feature_names)));
axes.set_yticks(np.arange(2));
axes.set_xticklabels(feature_names, rotation=90, fontsize=14);
axes.set_yticklabels(['Dimension 1', 'Dimension 2'], fontsize=14);

# Colour bar marked at the smallest loading, zero and the largest loading.
colorbar_ticks = [loadings.min(), 0, loadings.max()]
plt.colorbar(orientation='horizontal', ticks=colorbar_ticks, pad=0.5);

***

## **2. 이미지 데이터에서의 PCA 적용**
#### data : CIFAR-10

In [None]:
# Define the path to the CIFAR-10 tar.gz file
cifar10_tar_path = 'cifar-10-python.tar.gz'

# Extract the archive into the current working directory.
with tarfile.open(cifar10_tar_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        # The 'data' filter refuses members that would land outside the target
        # directory (absolute paths, '..', escaping links) and avoids the
        # DeprecationWarning that an unfiltered extractall() emits on 3.12+.
        tar.extractall(filter='data')
    else:
        # Interpreters without extraction filters: plain extraction.
        tar.extractall()
    print("Files extracted.")

# Load batch data from a file
def load_batch(file_path):
    """Read one pickled batch file and return ``(images, labels)``.

    The file must unpickle to a dict with byte-string keys ``b'data'`` and
    ``b'labels'``. ``b'data'`` is reshaped to ``(N, 3, 32, 32)`` and its
    channel axis is then moved last, giving images of shape ``(N, 32, 32, 3)``.
    ``b'labels'`` is returned as a NumPy array.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on batch
    files from a trusted source.
    """
    with open(file_path, 'rb') as f:
        batch = pickle.load(f, encoding='bytes')

    raw_pixels = batch[b'data']
    raw_labels = batch[b'labels']

    # (N, 3072) -> (N, C, H, W) -> (N, H, W, C)
    channel_first = raw_pixels.reshape(-1, 3, 32, 32)
    images = np.transpose(channel_first, (0, 2, 3, 1))
    return images, np.array(raw_labels)

# Directory containing the extracted CIFAR-10 files
cifar10_dir = 'cifar-10-batches-py'

# Load the five training batches (data_batch_1 ... data_batch_5).
# The loop variables are deliberately not named `data`: that name holds the
# breast-cancer DataFrame from the first section and must not be overwritten,
# otherwise re-running the earlier cells would fail.
x_train = []
y_train = []
for batch_no in range(1, 6):
    batch_file = os.path.join(cifar10_dir, f'data_batch_{batch_no}')
    batch_images, batch_labels = load_batch(batch_file)
    x_train.append(batch_images)
    y_train.append(batch_labels)

# Concatenate all training batches along the sample axis
x_train = np.concatenate(x_train)
y_train = np.concatenate(y_train)

# Load test data
test_batch_file = os.path.join(cifar10_dir, 'test_batch')
x_test, y_test = load_batch(test_batch_file)

### **2-1 데이터 확인**

In [None]:
# Report the array shapes of the training and test image sets.
print('Training data shape:', x_train.shape)
print('Testing data shape:', x_test.shape)

In [None]:
# Shapes of the training and test label arrays (shown as the cell output).
(y_train.shape, y_test.shape)

In [None]:
# Distinct label values present in the training set, and how many there are.
classes = np.unique(y_train)
nClasses = len(classes)

print('Total number of outputs : ', nClasses)
print('Output classes : ', classes)

In [None]:
# Labels: class id (0-9) -> class name, in id order.
label_dict = dict(enumerate([
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]))

In [None]:
# Inspect the data: show the first training image and the first test image
# side by side, each titled with its class name.
plt.figure(figsize=[5, 5])

for position, (images, targets) in zip((121, 122),
                                       ((x_train, y_train), (x_test, y_test))):
    plt.subplot(position)
    curr_img = np.reshape(images[0], (32, 32, 3))
    plt.imshow(curr_img)
    # Printing the returned title object echoes the label in the cell output.
    print(plt.title("(Label: " + str(label_dict[targets[0]]) + ")"))



### **2-2 데이터 Scaling**

In [None]:
# Check the raw pixel value range before scaling the data for PCA.
(x_train.min(), x_train.max())

In [None]:
# Scale the training pixels by 1/255, then display the new value range.
# NOTE: this cell is not idempotent - running it again divides a second time.
x_train = np.divide(x_train, 255.0)
(x_train.min(), x_train.max())

In [None]:
# Flatten every image into a single row of 3072 pixel values (32 * 32 * 3).
x_train_flat = np.reshape(x_train, (-1, 3072))

In [None]:
# One column per flattened pixel ('pixel0', 'pixel1', ...), with the class id
# appended as the last column.
n_pixels = x_train_flat.shape[1]
feat_cols = ['pixel' + str(idx) for idx in range(n_pixels)]

df_cifar = pd.DataFrame(x_train_flat, columns=feat_cols)
df_cifar['label'] = y_train

print('Size of the dataframe: {}'.format(df_cifar.shape))

In [None]:
# Preview the first five rows of the pixel table.
df_cifar.head(5)

### **2-3. PCA 적용**

In [None]:
# Project the pixel columns (every column except the trailing 'label') onto
# two principal components.
pixel_features = df_cifar.iloc[:, :-1]
pca_cifar = PCA(n_components=2)
principalComponents_cifar = pca_cifar.fit_transform(pixel_features)

In [None]:
# Wrap the 2-D projection in a DataFrame and attach the class id of each image.
component_names = ['principal component 1', 'principal component 2']
principal_cifar_Df = pd.DataFrame(data=principalComponents_cifar,
                                  columns=component_names)
principal_cifar_Df['y'] = y_train

principal_cifar_Df.head()

In [None]:
# Share of the total variance captured by each of the two components.
explained = pca_cifar.explained_variance_ratio_
print('Explained variation per principal component: {}'.format(explained))

In [None]:
# Scatter the 2-D PCA projection with one hue per class id (10 HLS colours).
plt.figure(figsize=(16, 10))
class_palette = sns.color_palette("hls", 10)
sns.scatterplot(
    data=principal_cifar_Df,
    x="principal component 1",
    y="principal component 2",
    hue="y",
    palette=class_palette,
    alpha=0.3,
    legend="full",
)


### **2-4. PCA 효과성 검증**

In [None]:
# Apply the same 1/255 scaling to the test images, keep them in
# (N, 32, 32, 3) layout, and build the flattened (N, 3072) view for the models.
# NOTE: not idempotent - running this cell again divides a second time.
x_test = np.reshape(x_test / 255, (-1, 32, 32, 3))
x_test_flat = np.reshape(x_test, (-1, 3072))

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 90 %) of the variance.
# NOTE: this rebinds `pca`, replacing the 2-component breast-cancer model that
# the loadings heat-map cell reads.
pca = PCA(n_components=0.9)

In [None]:
# Fit the PCA on the flattened training images only; the test images are
# projected with this fitted model in a later cell.
pca.fit(x_train_flat)

In [None]:
# Number of components PCA kept to reach the requested explained variance.
# (A pasted estimator repr used to sit here; it only built and discarded an
# unused PCA object, so it has been removed.)
pca.n_components_

In [None]:
# x_train_flat / x_test_flat : original flattened pixel data
# train_img_pca / test_img_pca : the same samples after the PCA projection
train_img_pca, test_img_pca = map(pca.transform, (x_train_flat, x_test_flat))

In [None]:
# One-hot encode the integer class ids for the categorical cross-entropy loss.
# The ndim guard makes the cell safe to re-run: encoding an already one-hot
# matrix again would add a third axis and break training. The class count is
# passed explicitly so both splits always get the same number of columns.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes=len(label_dict))
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, num_classes=len(label_dict))

In [None]:
# Training hyper-parameters shared by both models.
num_classes = 10   # width of the softmax output layer
epochs = 40        # passes over the training data
batch_size = 128   # samples per gradient update

In [None]:
# Baseline: dense classifier trained on the original 3072-value pixel vectors.
model = Sequential([
    Dense(1024, activation='relu', input_shape=(3072,)),
    Dropout(0.5),  # dropout after each of the two widest layers
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(num_classes, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)

# The test split is used as validation data, so `val_accuracy` in the history
# is the per-epoch test accuracy.
history = model.fit(
    x_train_flat, y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(x_test_flat, y_test),
)

In [None]:
# Same architecture as the baseline model, trained on the PCA-reduced features.
# The input width is read from the projected data instead of being hard-coded:
# with a variance threshold, PCA chooses the number of components from the
# data, so a fixed value breaks as soon as the data or the scaling changes.
n_pca_features = train_img_pca.shape[1]

model_pca = Sequential()
model_pca.add(Dense(1024, activation='relu', input_shape=(n_pca_features,)))
model_pca.add(Dropout(0.5))  # dropout after each of the two widest layers
model_pca.add(Dense(1024, activation='relu'))
model_pca.add(Dropout(0.5))
model_pca.add(Dense(512, activation='relu'))
model_pca.add(Dense(256, activation='relu'))
model_pca.add(Dense(num_classes, activation='softmax'))

model_pca.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

# The test split is used as validation data, as for the baseline model, so the
# two histories are directly comparable.
history_pca = model_pca.fit(train_img_pca, y_train,batch_size=batch_size,epochs=epochs,verbose=1,
                    validation_data=(test_img_pca, y_test))

In [None]:
# Compare per-epoch training and validation accuracy of the two models.
plt.figure(figsize=(14, 6))

# All four curves share a single full-width axes. The previous version selected
# cell 1 of a 1x2 subplot grid and never drew cell 2, which left the right
# half of the figure empty.
accuracy_curves = [
    (history.history['accuracy'], 'Original Model - Training Accuracy'),
    (history.history['val_accuracy'], 'Original Model - Validation Accuracy'),
    (history_pca.history['accuracy'], 'PCA Model - Training Accuracy'),
    (history_pca.history['val_accuracy'], 'PCA Model - Validation Accuracy'),
]
for values, curve_label in accuracy_curves:
    plt.plot(values, label=curve_label)

plt.title('Model Accuracy Comparison')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()