In [None]:
#
# # 빅데이터 분석기사: 군집화(Clustering) 실습 데이터셋 컬럼 설명
#
# # ---
# # 데이터셋은 주로 신용카드 이용 패턴 및 고객의 금융 행동을 나타내는 변수들로 구성되어 있으며,
# # 이를 바탕으로 고객을 세분화(Segmentation)하는 군집 분석에 활용될 수 있다.
# # ---
#
# # | 컬럼명                      | 데이터 타입 | 설명                  | 주요 분석 포인트                                |
# # | -------------------------- | ----------- | -------------------- | -------------------------------------------- |
#
# # 1. **CUST_ID** (문자열)
# # - 설명: 고객 고유 식별자
# # - 분석 포인트: 클러스터링 시 제외 (ID 컬럼).
#
# # 2. **BALANCE** (float)
# # - 설명: 고객의 평균 월별 잔액
# # - 분석 포인트: 소비 후 남는 평균 금액 (낮을수록 소비 많음, 재정적 여유 지표).
#
# # 3. **BALANCE_FREQUENCY** (float, 0~1)
# # - 설명: 잔액 갱신 빈도 (1에 가까울수록 잦음)
# # - 분석 포인트: 잔액 변동성, 활발한 금융 활동 정도.
#
# # 4. **PURCHASES** (float)
# # - 설명: 월평균 구매 금액
# # - 분석 포인트: 고객의 전체적인 평균 소비 수준.
#
# # 5. **ONEOFF_PURCHASES** (float)
# # - 설명: 일시불 결제 금액 합계
# # - 분석 포인트: 충동적/고액 구매 성향, 일회성 지출 성향.
#
# # 6. **INSTALLMENTS_PURCHASES** (float)
# # - 설명: 할부 결제 금액 합계
# # - 분석 포인트: 장기 지출 성향, 계획적/분할 결제 선호도.
#
# # 7. **CASH_ADVANCE** (float)
# # - 설명: 현금서비스 사용 금액
# # - 분석 포인트: 현금 유동성 부족 여부, 긴급 자금 필요 정도.
#
# # 8. **PURCHASES_FREQUENCY** (float, 0~1)
# # - 설명: 구매 발생 비율 (1에 가까울수록 빈도가 높음)
# # - 분석 포인트: 소비 활동의 전체 빈도.
#
# # 9. **ONEOFF_PURCHASES_FREQUENCY** (float)
# # - 설명: 일시불 구매 비율
# # - 분석 포인트: 단기적 소비/고액 구매 성향 빈도.
#
# # 10. **PURCHASES_INSTALLMENTS_FREQUENCY** (float)
# # - 설명: 할부 구매 비율
# # - 분석 포인트: 할부 이용 빈도, 금융 상품 활용 성향.
#
# # 11. **CASH_ADVANCE_FREQUENCY** (float)
# # - 설명: 현금서비스 사용 빈도
# # - 분석 포인트: 현금 유동성 위험도, 신용 리스크 지표.
#
# # 12. **CASH_ADVANCE_TRX** (int)
# # - 설명: 현금서비스 거래 횟수
# # - 분석 포인트: 현금서비스 사용 패턴의 횟수 지표.
#
# # 13. **PURCHASES_TRX** (int)
# # - 설명: 구매 거래 횟수
# # - 분석 포인트: 전체적인 소비 활동의 빈도수.
#
# # 14. **CREDIT_LIMIT** (float)
# # - 설명: 신용 한도
# # - 분석 포인트: 소득 수준 또는 신용도/리스크 등급 지표.
#
# # 15. **PAYMENTS** (float)
# # - 설명: 월별 평균 결제 금액
# # - 분석 포인트: 지불 능력 및 상환 규모.
#
# # 16. **MINIMUM_PAYMENTS** (float)
# # - 설명: 최소 납부 금액
# # - 분석 포인트: 미납 위험성, 신용 관리 성향 (최소 금액만 납부하는 성향 파악).
#
# # 17. **PRC_FULL_PAYMENT** (float, 0~1)
# # - 설명: 전체 결제 비율 (1이면 매번 완납하는 고객)
# # - 분석 포인트: 신용카드 사용 건전성, 이자 회피 성향.
#
# # 18. **TENURE** (int)
# # - 설명: 고객 유지 개월 수
# # - 분석 포인트: 거래 기간, 고객 충성도/장기 고객 여부 판단.
#

In [None]:
# 1. 문제 정의 (Problem Definition)
#  
# 목표: 신용카드 사용자의 소비 패턴을 분석하여 유사한 그룹으로 군집화한다.

# 활용:
# 고객 세그먼테이션(Customer Segmentation)
# 마케팅 전략 수립 (고소득/고소비/저소비 그룹별 맞춤형 서비스 제공)
# 데이터 기반 의사결정:
# 각 클러스터별 평균 소비 금액, 결제 성향 등을 분석해 마케팅 타겟팅 개선

In [1]:
import pandas as pd

# 데이터셋 불러오기 (Kaggle 데이터셋)
df = pd.read_csv("CC GENERAL.csv")

# 상위 5개 행 확인
df.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


In [5]:
# ----------
# 데이터전처리
# ------------

df.isnull().sum()
df.shape
# 결측치 처리하기
df =df.dropna()
df.isnull().sum()

CUST_ID                             0
BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        0
PAYMENTS                            0
MINIMUM_PAYMENTS                    0
PRC_FULL_PAYMENT                    0
TENURE                              0
dtype: int64

In [9]:
df = df.drop('CUST_ID',axis=1)
df.head()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12
5,1809.828751,1.0,1333.28,0.0,1333.28,0.0,0.666667,0.0,0.583333,0.0,0,8,1800.0,1400.05777,2407.246035,0.0,12


In [11]:
# 숫자형 데이터 스케일링(정규화만)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_df =scaler.fit_transform(df)

#DataFrame 변환
scaled_df =pd.DataFrame(scaled_df, columns=df.columns)
scaled_df.head()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,-0.744625,-0.370047,-0.429184,-0.35916,-0.354826,-0.468655,-0.820769,-0.68628,-0.717179,-0.681953,-0.479437,-0.517623,-0.962575,-0.543942,-0.305508,-0.537727,0.355181
1,0.764152,0.067679,-0.473208,-0.35916,-0.458839,2.568556,-1.236139,-0.68628,-0.926522,0.557022,0.099258,-0.597054,0.677204,0.796852,0.087689,0.21238,0.355181
2,0.426602,0.505405,-0.116413,0.099909,-0.458839,-0.468655,1.256077,2.646651,-0.926522,-0.681953,-0.479437,-0.120467,0.813852,-0.399503,-0.099906,-0.537727,0.355181
3,-0.37391,0.505405,-0.465825,-0.34966,-0.458839,-0.468655,-1.028455,-0.408536,-0.926522,-0.681953,-0.479437,-0.557339,-0.907916,-0.380165,-0.261131,-0.537727,0.355181
4,0.099551,0.505405,0.142062,-0.35916,0.994815,-0.468655,0.425339,-0.68628,0.538882,-0.681953,-0.479437,-0.279329,-0.743938,-0.132119,0.650363,-0.537727,0.355181


In [12]:
# 데이터 분할
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(scaled_df, test_size=0.3, random_state=42)
print(train_df.shape, test_df.shape)

(6045, 17) (2591, 17)


In [13]:
#모델 학습
from sklearn.cluster import KMeans

# 적절한 k 찾기 (Elbow Method)
inertia = []
K = range(1,11)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(train_df)
    inertia.append(kmeans.inertia_)

# Elbow 그래프 시각화
plt.figure(figsize=(8,5))
plt.plot(K, inertia, 'bx-')
plt.xlabel('Cluster Count (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method\s find  k ')
plt.show()

NameError: name 'plt' is not defined