# Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
import matplotlib.pyplot as plt
#import shap



# 전처리

In [2]:
# 파일 불러오기
file_path = r"C:\Users\james\J DATA LAB\Project-DonationPrediction\data\koweps_recent_3years_cleaned.csv"
df = pd.read_csv(file_path)
df.head()

  df = pd.read_csv(file_path)


Unnamed: 0,year,wv,wv_num,first_wv,last_wv,p_wsl,p_wsc,p_wgl,p_wgc,p_wsc_all,...,wc_60,wc_61,wc_62,wc_63,wc_8aq12,wc_64,wc_65,wc_5aq4,wc_5aq5,wc_5aq6
0,2021,17,18,1,18,0.255098,0.255099,1301.779156,1301.779156,0.279237,...,,,,,,,,,,
1,2022,18,18,1,18,0.259465,0.256182,1370.40069,1353.065043,0.1289,...,,,,,,,,,,
2,2021,17,18,1,18,0.369345,0.369345,1884.782338,1884.782338,0.389495,...,,,,,,,,,,
3,2022,18,18,1,18,0.397864,0.480753,2101.380145,2539.170017,0.323196,...,,,,,,,,,,
4,2021,17,18,1,18,0.2284,0.228046,1165.535342,1163.730163,0.241415,...,,,,,,,,,,


In [3]:
# 결측치 제거
df = df.replace([' ', '.', 'NaN'], np.nan)
df = df.fillna(0)

# 숫자형으로 변환 가능하도록 dtype 변경
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

# 분석대상(Y) 이진화 (1만원 이상 기부 여부)
y = (df['p04_5'] >= 1).astype(int)

# 횔용변수(X) 설정 (p04_5를 제외한 나머지)
x = df.drop(columns=['p04_5'])

In [4]:
# 데이터 분할 (train 80%, val 10%, test 10%)
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [5]:
# 스케일링 (StandardScaler)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# 모델링 검증(ML 5종 + DL 2종 알고리즘)

## LogisticRegression

In [8]:
# 로지스틱 회귀 모델 생성 및 학습
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train_scaled, y_train)

# 검증 세트로 예측 및 평가
y_val_pred = logreg.predict(x_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)


print(f"VAL DATA ACCURACY SCORE: {val_accuracy}")
print(f"Confusion Matrix: {val_conf_matrix}")
print("\nClassification Report:")
print(val_class_report)

VAL DATA ACCURACY SCORE: 0.9910824108241082
Confusion Matrix: [[3005   15]
 [  14  218]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3020
           1       0.94      0.94      0.94       232

    accuracy                           0.99      3252
   macro avg       0.97      0.97      0.97      3252
weighted avg       0.99      0.99      0.99      3252



## RandomForestClassifier

In [9]:
# 랜덤 포레스트 모델 생성
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 모델 학습
rf.fit(x_train_scaled, y_train)

# 검증 데이터로 예측
y_val_pred = rf.predict(x_val_scaled)

# 검증 세트 성능 평가
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

print(f"VAL DATA ACCURACY SCORE: {val_accuracy}")
print(f"Confusion Matrix: {val_conf_matrix}")
print("\nClassification Report:")
print(val_class_report)

VAL DATA ACCURACY SCORE: 0.9677121771217713
Confusion Matrix: [[3017    3]
 [ 102  130]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3020
           1       0.98      0.56      0.71       232

    accuracy                           0.97      3252
   macro avg       0.97      0.78      0.85      3252
weighted avg       0.97      0.97      0.96      3252



## XGBClassifier

In [10]:
# XGBoost 모델 생성
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

# 모델 학습
xgb_model.fit(x_train_scaled, y_train)

# 검증 세트 예측
y_val_pred = xgb_model.predict(x_val_scaled)

# 검증 세트 성능 평가
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

print(f"VAL DATA ACCURACY SCORE: {val_accuracy}")
print(f"Confusion Matrix: {val_conf_matrix}")
print("\nClassification Report:")
print(val_class_report)

VAL DATA ACCURACY SCORE: 0.9966174661746617
Confusion Matrix: [[3017    3]
 [   8  224]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3020
           1       0.99      0.97      0.98       232

    accuracy                           1.00      3252
   macro avg       0.99      0.98      0.99      3252
weighted avg       1.00      1.00      1.00      3252



## LGBMClassifier

In [11]:
# LightGBM 모델 생성
lgbm_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)

# 모델 학습
lgbm_model.fit(x_train_scaled, y_train)

# 검증 세트 예측
y_val_pred = lgbm_model.predict(x_val_scaled)

# 검증 세트 성능 평가
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

print(f"VAL DATA ACCURACY SCORE: {val_accuracy}")
print(f"Confusion Matrix: {val_conf_matrix}")
print("\nClassification Report:")
print(val_class_report)

[LightGBM] [Info] Number of positive: 1658, number of negative: 24359
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35918
[LightGBM] [Info] Number of data points in the train set: 26017, number of used features: 1018
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063728 -> initscore=-2.687289
[LightGBM] [Info] Start training from score -2.687289
VAL DATA ACCURACY SCORE: 0.9966174661746617
Confusion Matrix: [[3018    2]
 [   7  225]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3020
           1       0.99      0.97      0.98       232

    accuracy                           1.00      3252
   macro avg       0.99      0.98      0.99      3252
weighted avg       1.00      1.00      1.00      3252



## CatBoostClassifier

In [12]:
# CatBoost 모델 생성
catboost_model = CatBoostClassifier(iterations=100, random_state=42, verbose=0)

# 모델 학습
catboost_model.fit(x_train_scaled, y_train)

# 검증 세트 예측
y_val_pred = catboost_model.predict(x_val_scaled)

# 검증 세트 성능 평가
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

print(f"VAL DATA ACCURACY SCORE: {val_accuracy}")
print(f"Confusion Matrix: {val_conf_matrix}")
print("\nClassification Report:")
print(val_class_report)

VAL DATA ACCURACY SCORE: 0.9966174661746617
Confusion Matrix: [[3015    5]
 [   9  223]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3020
           1       0.98      0.96      0.97       232

    accuracy                           1.00      3252
   macro avg       0.99      0.98      0.98      3252
weighted avg       1.00      1.00      1.00      3252



## MLPClassifier

In [13]:
# MLP 모델 생성
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# 모델 학습
mlp_model.fit(x_train_scaled, y_train)

# 검증 세트 예측
y_val_pred = mlp_model.predict(x_val_scaled)

# 검증 세트 성능 평가
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

print(f"VAL DATA ACCURACY SCORE: {val_accuracy}")
print(f"Confusion Matrix: {val_conf_matrix}")
print("\nClassification Report:")
print(val_class_report)

VAL DATA ACCURACY SCORE: 0.9966174661746617
Confusion Matrix: [[3008   12]
 [  20  212]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3020
           1       0.95      0.91      0.93       232

    accuracy                           0.99      3252
   macro avg       0.97      0.95      0.96      3252
weighted avg       0.99      0.99      0.99      3252



## CNN

In [12]:
# 3D 입력
x_train_scaled = np.expand_dims(x_train_scaled, axis=2) 
x_val_scaled = np.expand_dims(x_val_scaled, axis=2)
x_test_scaled = np.expand_dims(x_test_scaled, axis=2)

# CNN 모델 구성
model = Sequential()

# Conv1D 층 추가 
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(x_train_scaled.shape[1], 1)))

# MaxPooling1D 층 추가
model.add(MaxPooling1D(pool_size=2))

# Flatten 층 추가 (1D 데이터를 2D로 변환)
model.add(Flatten())

# Dense 층 추가 (출력층)
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  

# 모델 컴파일
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 모델 학습
history = model.fit(x_train_scaled, y_train, epochs=20, batch_size=32, validation_data=(x_val_scaled, y_val))

# 검증 세트 성능 평가
y_val_pred = (model.predict(x_val_scaled) > 0.5).astype("int32")
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"VAL DATA ACCURACY SCORE: {val_accuracy}")

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 19ms/step - accuracy: 0.9377 - loss: 0.2071 - val_accuracy: 0.9935 - val_loss: 0.0256
Epoch 2/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.9918 - loss: 0.0250 - val_accuracy: 0.9914 - val_loss: 0.0258
Epoch 3/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.9932 - loss: 0.0199 - val_accuracy: 0.9945 - val_loss: 0.0256
Epoch 4/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.9934 - loss: 0.0169 - val_accuracy: 0.9932 - val_loss: 0.0198
Epoch 5/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 18ms/step - accuracy: 0.9952 - loss: 0.0129 - val_accuracy: 0.9926 - val_loss: 0.0270
Epoch 6/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 18ms/step - accuracy: 0.9950 - loss: 0.0124 - val_accuracy: 0.9923 - val_loss: 0.0290
Epoch 7/20
[1m814/814[0m 

# 개인 기부확률 예측(1 Test Sample)

In [13]:
# 테스트 세트에서 첫 번째 샘플 선택
single_test_sample = x_test.iloc[0:1]

# 기부 확률 예측
single_sample_proba = logreg.predict_proba(single_test_sample)

print(f"기부하지 않을 확률: {single_sample_proba[0][0]:.4f}")
print(f"기부할 확률: {single_sample_proba[0][1]:.4f}")

기부하지 않을 확률: 0.0000
기부할 확률: 1.0000




# 전체 기부특성 분석(Total Test Samples)

In [15]:
# 전체 테스트 세트에 대해 예측
y_test_pred = logreg.predict(x_test)

# 기부 확률 예측
y_test_proba = logreg.predict_proba(x_test)

# 테스트 세트에서 기부 확률 높은 순으로 정렬
test_results = x_test.copy()
test_results['Actual'] = y_test
test_results['Predicted'] = y_test_pred
test_results['Donation_Probability'] = y_test_proba[:, 1]

# 기부 확률이 높은 사람들의 특성 분석 (상위 10명 예시)
top_donors = test_results.sort_values(by='Donation_Probability', ascending=False).head(10)
top_donors



Unnamed: 0,year,wv,wv_num,first_wv,last_wv,p_wsl,p_wsc,p_wgl,p_wgc,p_wsc_all,...,wc_63,wc_8aq12,wc_64,wc_65,wc_5aq4,wc_5aq5,wc_5aq6,Actual,Predicted,Donation_Probability
22327,2022,18,12,7,18,0.0,0.0,0.0,0.0,0.908367,...,0,0.0,0,0.0,0,0,0,0,1,1.0
25182,2021,17,2,17,18,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0,0,0,1,1.0
23039,2022,18,12,7,18,0.0,0.0,0.0,0.0,1.50258,...,0,0.0,0,0.0,0,0,0,0,1,1.0
7486,2022,18,18,1,18,0.492746,0.466374,2602.513559,2463.226107,0.381403,...,0,0.0,0,0.0,0,0,0,0,1,1.0
2342,2022,18,13,1,18,0.921968,1.320344,4869.5121,6973.593915,1.786622,...,0,0.0,0,0.0,0,0,0,0,1,1.0
18080,2021,17,18,1,18,4.05266,4.05266,20680.908325,20680.908325,4.5971,...,0,0.0,0,0.0,0,0,0,0,1,1.0
2622,2022,18,18,1,18,0.835007,0.914499,4410.213293,4830.063078,0.802868,...,0,0.0,0,0.0,0,0,0,0,1,1.0
19341,2022,18,18,1,18,0.195033,0.202132,1030.093726,1067.591709,0.175125,...,0,0.0,0,0.0,0,0,0,0,1,1.0
19612,2022,18,18,1,18,1.529711,1.54892,8079.393727,8180.850802,0.824541,...,0,0.0,0,0.0,0,0,0,0,1,1.0
18750,2022,18,18,1,18,1.18571,1.292087,6262.50278,6824.348455,1.119451,...,0,0.0,0,0.0,0,0,0,0,1,1.0
