# Library

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
import matplotlib.pyplot as plt
#import shap

# 전처리

In [2]:
# 파일 불러오기
file_path = r"C:\Users\james\J DATA LAB\Project-DonationPrediction\data\koweps_recent_3years_cleaned.csv"
df = pd.read_csv(file_path)
df.head()

  df = pd.read_csv(file_path)


Unnamed: 0,year,wv,wv_num,first_wv,last_wv,p_wsl,p_wsc,p_wgl,p_wgc,p_wsc_all,...,wc_60,wc_61,wc_62,wc_63,wc_8aq12,wc_64,wc_65,wc_5aq4,wc_5aq5,wc_5aq6
0,2021,17,18,1,18,0.255098,0.255099,1301.779156,1301.779156,0.279237,...,,,,,,,,,,
1,2022,18,18,1,18,0.259465,0.256182,1370.40069,1353.065043,0.1289,...,,,,,,,,,,
2,2021,17,18,1,18,0.369345,0.369345,1884.782338,1884.782338,0.389495,...,,,,,,,,,,
3,2022,18,18,1,18,0.397864,0.480753,2101.380145,2539.170017,0.323196,...,,,,,,,,,,
4,2021,17,18,1,18,0.2284,0.228046,1165.535342,1163.730163,0.241415,...,,,,,,,,,,


In [3]:
# 결측치 제거
df = df.replace([' ', '.', 'NaN'], np.nan)
df = df.fillna(0)

# 숫자형으로 변환 가능하도록 dtype 변경
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

# 분석대상(Y) 이진화 (1만원 이상 기부 여부)
y = (df['p04_5'] >= 1).astype(int)

# 횔용변수(X) 설정 (p04_5를 제외한 나머지)
x = df.drop(columns=['p04_5'])

In [4]:
# 데이터 분할 (train 80%, val 10%, test 10%)
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [5]:
# 스케일링 (StandardScaler)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# 개인 기부확률 예측(ML 5종 + DL 2종 알고리즘)

## LogisticRegression

In [6]:
# 로지스틱 회귀 모델 생성 및 학습
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train_scaled, y_train)

# 검증 세트로 예측 및 평가
y_val_pred = logreg.predict(x_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"검증 데이터 정확도: {val_accuracy}")

# 테스트 세트로 예측 및 평가
y_test_pred = logreg.predict(x_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\n테스트 데이터 정확도: {test_accuracy}")

검증 데이터 정확도: 0.9910824108241082

테스트 데이터 정확도: 0.9904703350753151


## RandomForestClassifier

In [7]:
# 랜덤 포레스트 모델 생성
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 모델 학습
rf.fit(x_train_scaled, y_train)

# 검증 데이터로 예측
y_val_pred = rf.predict(x_val_scaled)

# 검증 세트 성능 평가
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"검증 데이터 정확도: {val_accuracy}")

# 5. 테스트 세트 예측
y_test_pred = rf.predict(x_test_scaled)

# 테스트 세트 성능 평가
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\n테스트 데이터 정확도: {test_accuracy}")

검증 데이터 정확도: 0.9677121771217713

테스트 데이터 정확도: 0.9658776513987088


## XGBClassifier

In [8]:
# XGBoost 모델 생성
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

# 모델 학습
xgb_model.fit(x_train_scaled, y_train)

# 검증 세트 예측
y_val_pred = xgb_model.predict(x_val_scaled)

# 검증 세트 성능 평가
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"검증 데이터 정확도: {val_accuracy}")


# 테스트 세트 예측
y_test_pred = xgb_model.predict(x_test_scaled)

# 테스트 세트 성능 평가
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\n테스트 데이터 정확도: {test_accuracy}")

검증 데이터 정확도: 0.9966174661746617

테스트 데이터 정확도: 0.9935444205348909


## LGBMClassifier

In [9]:
# LightGBM 모델 생성
lgbm_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)

# 모델 학습
lgbm_model.fit(x_train_scaled, y_train)

# 검증 세트 예측
y_val_pred = lgbm_model.predict(x_val_scaled)

# 검증 세트 성능 평가
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"검증 데이터 정확도: {val_accuracy}")

# 테스트 세트 예측
y_test_pred = lgbm_model.predict(x_test_scaled)

# 테스트 세트 성능 평가
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\n테스트 데이터 정확도: {test_accuracy}")


[LightGBM] [Info] Number of positive: 1658, number of negative: 24359
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35918
[LightGBM] [Info] Number of data points in the train set: 26017, number of used features: 1018
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063728 -> initscore=-2.687289
[LightGBM] [Info] Start training from score -2.687289
검증 데이터 정확도: 0.9972324723247232

테스트 데이터 정확도: 0.994159237626806


## CatBoostClassifier

In [10]:
# CatBoost 모델 생성
catboost_model = CatBoostClassifier(iterations=100, random_state=42, verbose=0)

# 모델 학습
catboost_model.fit(x_train_scaled, y_train)

# 검증 세트 예측
y_val_pred = catboost_model.predict(x_val_scaled)

# 검증 세트 성능 평가
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"검증 데이터 정확도: {val_accuracy}")


# 테스트 세트 예측
y_test_pred = catboost_model.predict(x_test_scaled)

# 테스트 세트 성능 평가
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\n테스트 데이터 정확도: {test_accuracy}")

검증 데이터 정확도: 0.9956949569495694

테스트 데이터 정확도: 0.9932370119889333


## MLPClassifier

In [11]:
# MLP 모델 생성
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# 모델 학습
mlp_model.fit(x_train_scaled, y_train)

# 검증 세트 예측
y_val_pred = mlp_model.predict(x_val_scaled)

# 검증 세트 성능 평가
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"검증 데이터 정확도: {val_accuracy}")


# 테스트 세트 예측
y_test_pred = mlp_model.predict(x_test_scaled)

# 테스트 세트 성능 평가
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\n테스트 데이터 정확도: {test_accuracy}")

검증 데이터 정확도: 0.990159901599016

테스트 데이터 정확도: 0.9895481094374423


## CNN

In [12]:
# 3D 입력
x_train_scaled = np.expand_dims(x_train_scaled, axis=2) 
x_val_scaled = np.expand_dims(x_val_scaled, axis=2)
x_test_scaled = np.expand_dims(x_test_scaled, axis=2)

# CNN 모델 구성
model = Sequential()

# Conv1D 층 추가 
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(x_train_scaled.shape[1], 1)))

# MaxPooling1D 층 추가
model.add(MaxPooling1D(pool_size=2))

# Flatten 층 추가 (1D 데이터를 2D로 변환)
model.add(Flatten())

# Dense 층 추가 (출력층)
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  

# 모델 컴파일
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 모델 학습
history = model.fit(x_train_scaled, y_train, epochs=20, batch_size=32, validation_data=(x_val_scaled, y_val))

# 검증 세트 성능 평가
y_val_pred = (model.predict(x_val_scaled) > 0.5).astype("int32")
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"검증 데이터 정확도: {val_accuracy}")

# 테스트 세트 평가
y_test_pred = (model.predict(x_test_scaled) > 0.5).astype("int32")
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\n테스트 데이터 정확도: {test_accuracy}")


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 19ms/step - accuracy: 0.9317 - loss: 0.2138 - val_accuracy: 0.9920 - val_loss: 0.0309
Epoch 2/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.9900 - loss: 0.0263 - val_accuracy: 0.9920 - val_loss: 0.0258
Epoch 3/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 18ms/step - accuracy: 0.9932 - loss: 0.0186 - val_accuracy: 0.9938 - val_loss: 0.0221
Epoch 4/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 18ms/step - accuracy: 0.9928 - loss: 0.0192 - val_accuracy: 0.9908 - val_loss: 0.0356
Epoch 5/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 18ms/step - accuracy: 0.9952 - loss: 0.0145 - val_accuracy: 0.9948 - val_loss: 0.0199
Epoch 6/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.9948 - loss: 0.0132 - val_accuracy: 0.9942 - val_loss: 0.0250
Epoch 7/20
[1m814/814[0m 

In [14]:
# 모델 정확도 저장할 리스트
val_accuracies = []
test_accuracies = []

In [15]:
val_accuracies.append(logreg_val_accuracy)
test_accuracies.append(logreg_test_accuracy)

NameError: name 'logreg_val_accuracy' is not defined