# **밀도기반 군집화 (Density-based clustering)**
### **DBSCAN (Density-Based Spatial Clustering of Applications with Noise)**
- **데이터 포인트의 밀도를 기준으로 클러스터를 형성하며, 밀도가 낮은 영역에 위치한 데이터 포인트는 noise로 간주**
- **복잡한 형태의 클러스터와 noise가 많은 데이터셋에서 효과적이며 이상탐지에 활용**

# **2.실습**

## **2-1.필요한 라이브러리 임포트**

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 데이터 정규화 패키지
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 군집분석 관련 패키지
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# 차원축소를 위한 패키지
from sklearn.decomposition import PCA

## **2-2. 데이터셋 설명 (Credit Card Fraud Detection)**
- **Time: 첫 거래와의 경과 시간 (초 단위)**
- **Amount: 거래 금액**
- **X 변수 (V1~V28): 민감 정보 보호를 위해 PCA로 변환(익명화)된 거래 특성**
- **Y 변수: 거래유형 (2개 클래스 :  0 = 정상 거래, 1 = 사기 거래)**

## **2-3.데이터 로드 및 전처리**

In [None]:
# Load the credit card dataset from a CSV path relative to the notebook's
# working directory.
credit_df = pd.read_csv("../Data/creditcard.csv")
# Bare expression as the last line of the cell so the notebook renders the DataFrame.
credit_df

In [None]:
# 결측치 확인


0

In [None]:
# 데이터 내 사기거래 수 확인 (사기거래의 수가 매우 적음)


#### **DBSCAN은 계산 비용이 높으므로, 전체 데이터셋의 일부만 샘플링하여 사용**

In [None]:
# Report the class distribution of the full dataset.
print("클래스 비율 확인:")
print(credit_df["Class"].value_counts())

# Row counts: overall, fraud (Class == 1) and normal (Class == 0).
# int(...) keeps the counters plain Python ints.
total_samples = credit_df.shape[0]
fraud_samples = int(credit_df["Class"].eq(1).sum())
normal_samples = int(credit_df["Class"].eq(0).sum())

# Emit the three summary lines in one call, one per line.
print(
    f"전체 데이터 수: {total_samples}",
    f"사기 거래 수 (Class 1): {fraud_samples}",
    f"정상 거래 수 (Class 0): {normal_samples}",
    sep="\n",
)

In [None]:
# 클래스 비율을 유지하면서 10% 샘플링


In [None]:
# 샘플링 결과 확인


In [36]:
# Display the sampled DataFrame.
# NOTE(review): presumably created in the sampling cell above, which is not
# filled in here - confirm the variable exists before running this cell.
credit_df_balanced

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
177282,123113.0,-4.168525,-4.164323,1.911850,1.130443,4.152041,-2.125948,-1.803619,0.675859,0.308972,...,-0.058678,-1.673241,0.937707,-0.616568,0.780497,-1.055841,-0.154194,0.146745,157.37,0
99433,67116.0,-0.241374,-0.043836,1.545847,-0.950404,-0.819948,0.847419,-0.786322,-1.420254,1.645278,...,1.222000,-1.007936,-0.415337,-0.336823,1.033332,0.848539,0.117121,0.092623,96.35,0
182653,125495.0,-2.134432,-2.219310,0.969065,-2.858480,0.693123,-1.315593,0.284006,0.149392,1.182680,...,0.579502,0.743960,0.519019,-0.354719,0.373946,-0.319379,-0.056289,0.155978,276.73,0
101035,67705.0,-0.862259,-0.224703,2.308340,-1.941343,-0.321210,1.954794,-0.942382,0.729052,0.090916,...,0.121589,0.683341,-0.590164,-1.645139,0.665159,-0.005705,0.219394,0.098477,2.00,0
94212,64782.0,1.241610,-0.051895,0.579918,-0.115431,-0.579488,-0.548451,-0.269573,-0.041116,0.353210,...,-0.103847,-0.237586,0.124342,0.143650,0.053582,0.933286,-0.052477,0.006476,1.54,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271650,164675.0,-0.179026,0.943564,0.159355,0.713100,1.087442,0.390270,0.802591,0.088052,-0.341001,...,0.212203,0.978882,-0.493097,-1.063402,0.173113,-0.200831,0.453460,0.223779,8.00,0
113266,72999.0,-3.826287,-2.288348,0.843972,-0.288038,2.748319,-0.529788,0.505155,-0.843771,0.950461,...,-0.992128,-0.485620,0.487403,-1.108481,0.308781,-0.033623,-0.958645,0.286987,11.83,0
188681,128081.0,2.071255,-0.433516,-3.071114,-1.575368,0.736294,-1.725837,1.051549,-0.685897,0.997689,...,0.317018,0.971460,-0.317930,0.587448,0.871183,0.199648,-0.103865,-0.073293,90.25,0
199115,132815.0,2.123341,-0.257764,-1.466731,-0.714225,0.026685,-1.049025,0.096077,-0.281468,0.837886,...,-0.306540,-0.838143,0.384608,0.675921,-0.370399,0.512908,-0.109321,-0.061377,4.00,0


In [None]:
# 데이터 스케일링


## **2-4.주성분 분석(PCA)**

In [None]:
# PCA를 사용하여 2차원으로 축소


## **2-5.DBSCAN**

In [None]:
# DBSCAN 모델 생성




# 모델 학습 및 군집화 수행


# 클러스터 레이블 출력


In [None]:
# 클러스터 레이블별 데이터 개수 확인


## **2-6.시각화**

In [None]:
# Visualize the DBSCAN result on the two PCA components.
plt.figure(figsize=(10, 6))

# Points labelled -1 are the noise points; build the mask once and reuse it.
noise_mask = clusters == -1

# Colour every point by its cluster label. The returned artist is kept so
# the colorbar below can be bound to it explicitly.
cluster_scatter = plt.scatter(
    X_pca[:, 0], X_pca[:, 1], c=clusters, cmap="viridis", s=10, alpha=0.6
)

# Overlay the noise points in black so they stand out from the clusters.
plt.scatter(
    X_pca[noise_mask, 0], X_pca[noise_mask, 1],
    color="black", s=10, label="Noise",
)

# Figure decoration.
plt.title("DBSCAN Clustering Result")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
# Pass the label-coloured scatter explicitly: without a mappable argument,
# plt.colorbar() uses the most recently created one, which here is the black
# noise overlay and carries no cluster-label data.
plt.colorbar(cluster_scatter, label="Cluster Label")
plt.show()

## **2-7.결과 평가**

In [None]:
# 노이즈(-1)를 제외한 데이터로 실루엣 점수 계산

## **2-8.하이퍼파라미터 튜닝**

In [None]:
# Candidate values for the two DBSCAN hyperparameters.
eps_values = [0.3, 0.5, 0.7, 1.0]
min_samples_values = [5, 10, 15]

# Best combination found so far; stays None if no combination can be scored.
best_eps = None
best_min_samples = None
best_score = -1

# Exhaustive grid search over every (eps, min_samples) pair.
# NOTE: each iteration reassigns `dbscan` and `clusters`, so after this cell
# they hold the result of the LAST combination tried, not the best one.
for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        clusters = dbscan.fit_predict(X_pca)

        # Score only points assigned to a real cluster. Noise (label -1) is
        # not a cluster, so counting it would both distort the silhouette
        # score and let a "one cluster + noise" result pass as two clusters.
        non_noise = clusters != -1
        n_clusters = len(set(clusters[non_noise]))

        # The silhouette score needs at least two clusters to be defined.
        if n_clusters < 2:
            continue

        score = silhouette_score(X_pca[non_noise], clusters[non_noise])
        print(f"eps: {eps}, min_samples: {min_samples}, Silhouette Score: {score:.4f}")

        if score > best_score:
            best_eps = eps
            best_min_samples = min_samples
            best_score = score

print("\n최적 파라미터:")
if best_eps is None:
    # No combination produced two or more clusters, so nothing was scored.
    print("유효한 군집(2개 이상)을 만든 파라미터 조합이 없습니다.")
else:
    print(f"Best eps: {best_eps}, Best min_samples: {best_min_samples}, Best Silhouette Score: {best_score:.4f}")