In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (10, 10)

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
import pandas as pd

# CSV 파일 경로 지정
file_path = '/content/drive/MyDrive/ITStudy/09_MLDL/team_pj/bank.csv'

# CSV 파일 읽기
data = pd.read_csv(file_path)

# 데이터 확인
print(data)

## 총 11162개의 관측치와 17개의 변수가 있음
## job, education, contact, poutcome 변수에 결측값(여기서는 "unknown"으로 표시된 값)이 있다는 점을 알 수 있음

       age          job  marital  education default  balance housing loan  \
0       59       admin.  married  secondary      no     2343     yes   no   
1       56       admin.  married  secondary      no       45      no   no   
2       41   technician  married  secondary      no     1270     yes   no   
3       55     services  married  secondary      no     2476     yes   no   
4       54       admin.  married   tertiary      no      184      no   no   
...    ...          ...      ...        ...     ...      ...     ...  ...   
11157   33  blue-collar   single    primary      no        1     yes   no   
11158   39     services  married  secondary      no      733      no   no   
11159   32   technician   single  secondary      no       29      no   no   
11160   43   technician  married  secondary      no        0      no  yes   
11161   34   technician  married  secondary      no        0      no   no   

        contact  day month  duration  campaign  pdays  previous poutcome  \

In [11]:
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')

In [12]:
data.info()
## 10개의 범주형 특성과 7개의 수치형 특성이 있다는 것을 확인 가능
## 또한, 어떤 열에도 null 값은 없지만 앞서 언급한 대로 일부 열에는 결측값이 존재

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


# 이상치 및 결측치 수동제거에 따른 모델 변화


## 이상치 및 결측치 제거 전
- 사용 모델: RandomForestClassifier
- 선택된 변수: Index(['age', 'job', 'balance', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'poutcome'], dtype='object')
- 모델 정확도: 0.8351992834751455

> 정규화: StandardScaler
>
> test_size=0.2, random_state=42

In [13]:
# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForestClassifier 초기화
rf_classifier = RandomForestClassifier(random_state=42)

# Recursive Feature Elimination (RFE)를 이용한 feature selection
rfe_selector = RFE(estimator=rf_classifier, n_features_to_select=10, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)

# 선택된 특징으로 학습
selected_features = X_train.columns[rfe_selector.support_]
rf_classifier.fit(X_train[selected_features], y_train)

# 테스트 데이터로 예측
y_pred = rf_classifier.predict(X_test[selected_features])

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"선택된 변수: {selected_features}")
print(f"모델 정확도: {accuracy}")


선택된 특징: Index(['age', 'job', 'balance', 'contact', 'day', 'month', 'duration',
       'campaign', 'pdays', 'poutcome'],
      dtype='object')
모델 정확도: 0.8351992834751455


## 이상치 및 결측치 제거 (age, job)
- 사용 모델: RandomForestClassifier
- 선택된 변수: age', 'job', 'balance', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'poutcome'
- 모델 정확도: 0.8441325768886234

> 정규화: StandardScaler
>
> test_size=0.3, random_state=42

In [20]:
# 필요한 라이브러리 임포트
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 'job' 칼럼에서 'unknown' 값을 결측치로 처리
data['job'] = data['job'].replace('unknown', None)

# SimpleImputer로 결측치를 처리 (최빈값으로 대체)
imputer = SimpleImputer(strategy='most_frequent')
data['job'] = imputer.fit_transform(data[['job']])

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# RandomForestClassifier 초기화
rf_classifier = RandomForestClassifier(random_state=42)

# Recursive Feature Elimination (RFE)를 이용한 feature selection
rfe_selector = RFE(estimator=rf_classifier, n_features_to_select=10, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)

# 선택된 특징으로 학습
selected_features = X_train.columns[rfe_selector.support_]
rf_classifier.fit(X_train[selected_features], y_train)

# 테스트 데이터로 예측
y_pred = rf_classifier.predict(X_test[selected_features])

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"선택된 특징: {selected_features}")
print(f"모델 정확도: {accuracy}")

선택된 특징: Index(['age', 'job', 'balance', 'contact', 'day', 'month', 'duration',
       'campaign', 'pdays', 'poutcome'],
      dtype='object')
모델 정확도: 0.8441325768886234


## 이상치 및 결측치 제거 (age, job)
- 사용 모델: GradientBoostingClassifier
- 선택된 변수: age', 'job', 'balance', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'poutcome'
- 모델 정확도: 0.8348760824126605

> 정규화: StandardScaler
>
> test_size=0.3, random_state=42

In [21]:
# 필요한 라이브러리 임포트
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 'job' 칼럼에서 'unknown' 값을 결측치로 처리
data['job'] = data['job'].replace('unknown', None)

# SimpleImputer로 결측치를 처리 (최빈값으로 대체)
imputer = SimpleImputer(strategy='most_frequent')
data['job'] = imputer.fit_transform(data[['job']])

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# GradientBoostingClassifier 초기화
rf_classifier = GradientBoostingClassifier(random_state=42)

# Recursive Feature Elimination (RFE)를 이용한 feature selection
rfe_selector = RFE(estimator=rf_classifier, n_features_to_select=10, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)

# 선택된 특징으로 학습
selected_features = X_train.columns[rfe_selector.support_]
rf_classifier.fit(X_train[selected_features], y_train)

# 테스트 데이터로 예측
y_pred = rf_classifier.predict(X_test[selected_features])

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"선택된 특징: {selected_features}")
print(f"모델 정확도: {accuracy}")

선택된 특징: Index(['age', 'balance', 'housing', 'contact', 'day', 'month', 'duration',
       'pdays', 'previous', 'poutcome'],
      dtype='object')
모델 정확도: 0.8348760824126605


## 이상치 및 결측치 제거 (age, job)
- 사용 모델: lightgbm
- 선택된 변수: 'age', 'job', 'balance', 'housing', 'contact', 'day', 'month','duration', 'campaign', 'pdays'
- 모델 정확도: 0.8518960883845924

> 정규화: StandardScaler
>
> test_size=0.3, random_state=42

In [22]:
# 필요한 라이브러리 임포트
import lightgbm as lgb
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 'job' 칼럼에서 'unknown' 값을 결측치로 처리
data['job'] = data['job'].replace('unknown', None)

# SimpleImputer로 결측치를 처리 (최빈값으로 대체)
imputer = SimpleImputer(strategy='most_frequent')
data['job'] = imputer.fit_transform(data[['job']])

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# LightGBM 모델 초기화
lgb_classifier = lgb.LGBMClassifier(random_state=42)

# Recursive Feature Elimination (RFE)를 이용한 feature selection
rfe_selector = RFE(estimator=lgb_classifier, n_features_to_select=10, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)

# 선택된 특징으로 학습
selected_features = X_train.columns[rfe_selector.support_]
lgb_classifier.fit(X_train[selected_features], y_train)

# 테스트 데이터로 예측
y_pred = lgb_classifier.predict(X_test[selected_features])

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"선택된 특징: {selected_features}")
print(f"모델 정확도: {accuracy}")


[LightGBM] [Info] Number of positive: 3682, number of negative: 4131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 7813, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.471266 -> initscore=-0.115063
[LightGBM] [Info] Start training from score -0.115063
[LightGBM] [Info] Number of positive: 3682, number of negative: 4131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 962
[LightGBM] [Info] Number of data points in the train set: 7813, number of used features: 15
[LightGBM] [Info] [binary:

## 이상치 및 결측치 제거 (age, job)
- 사용 모델: xgboost
- 선택된 변수: 'education', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'pdays', 'previous', 'poutcome'
- 모델 정확도: 0.8542848611525828

> 정규화: StandardScaler
>
> test_size=0.3, random_state=42

In [28]:
# 필요한 라이브러리 임포트
import xgboost as xgb
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 'job' 칼럼에서 'unknown' 값을 결측치로 처리
data['job'] = data['job'].replace('unknown', None)

# SimpleImputer로 결측치를 처리 (최빈값으로 대체)
imputer = SimpleImputer(strategy='most_frequent')
data['job'] = imputer.fit_transform(data[['job']])

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# XGBoost 모델 초기화
xgb_classifier = xgb.XGBClassifier(random_state=42)

# Recursive Feature Elimination (RFE)를 이용한 feature selection
rfe_selector = RFE(estimator=xgb_classifier, n_features_to_select=10, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)

# 선택된 특징으로 학습
selected_features = X_train.columns[rfe_selector.support_]
xgb_classifier.fit(X_train[selected_features], y_train)

# 테스트 데이터로 예측
y_pred = xgb_classifier.predict(X_test[selected_features])

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"선택된 특징: {selected_features}")
print(f"XGBoost 모델 정확도: {accuracy}")


선택된 특징: Index(['education', 'housing', 'loan', 'contact', 'day', 'month', 'duration',
       'pdays', 'previous', 'poutcome'],
      dtype='object')
XGBoost 모델 정확도: 0.8542848611525828


# 로지스틱 회귀분석 결과

In [35]:
# 필요한 라이브러리 임포트
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 'job' 칼럼에서 'unknown' 값을 결측치로 처리
data['job'] = data['job'].replace('unknown', None)

# SimpleImputer로 결측치를 처리 (최빈값으로 대체)
imputer = SimpleImputer(strategy='most_frequent')
data['job'] = imputer.fit_transform(data[['job']])

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# 1. 로지스틱 회귀 모델을 사용하여 p-value를 기반으로 유의미한 변수 선택
# 상수항 추가
X_with_constant = sm.add_constant(X)

# 로지스틱 회귀 모델 적합
logit_model = sm.Logit(y, X_with_constant).fit()

# p-value가 0.05 이하인 변수 선택
selected_features = logit_model.pvalues[logit_model.pvalues < 0.05].index.tolist()

# 상수항 제거 (constant 제거)
if 'const' in selected_features:
    selected_features.remove('const')

print(f"선택된 유의미한 변수들: {selected_features}")

# 2. 선택된 변수로 분류 모델 학습 (XGBoost 예시)
X_selected = X[selected_features]  # 선택된 변수로 새로운 데이터셋 구성

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# 분류 모델 (XGBoost) 학습 및 평가
import xgboost as xgb
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = xgb_classifier.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost 모델 정확도: {accuracy}")


Optimization terminated successfully.
         Current function value: 0.468294
         Iterations 7
선택된 유의미한 변수들: ['age', 'marital', 'education', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
XGBoost 모델 정확도: 0.8437080161218092


In [36]:
# 필요한 라이브러리 임포트
import xgboost as xgb
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 1. "unknown" 값을 결측값으로 처리하고, 결측값을 최빈값으로 대체
data['job'] = data['job'].replace('unknown', None)
data['education'] = data['education'].replace('unknown', None)
data['contact'] = data['contact'].replace('unknown', None)
data['poutcome'] = data['poutcome'].replace('unknown', None)

# SimpleImputer로 결측치를 처리 (최빈값으로 대체)
imputer = SimpleImputer(strategy='most_frequent')
data[['job', 'education', 'contact', 'poutcome']] = imputer.fit_transform(data[['job', 'education', 'contact', 'poutcome']])

# 2. "duration" 변수를 제거 (사후 데이터 포함)
data = data.drop('duration', axis=1)

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# XGBoost 모델 초기화
xgb_classifier = xgb.XGBClassifier(random_state=42)

# Recursive Feature Elimination (RFE)를 이용한 feature selection
rfe_selector = RFE(estimator=xgb_classifier, n_features_to_select=10, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)

# 선택된 특징으로 학습
selected_features = X_train.columns[rfe_selector.support_]
xgb_classifier.fit(X_train[selected_features], y_train)

# 테스트 데이터로 예측
y_pred = xgb_classifier.predict(X_test[selected_features])

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"선택된 특징: {selected_features}")
print(f"XGBoost 모델 정확도: {accuracy}")


선택된 특징: Index(['age', 'marital', 'balance', 'housing', 'loan', 'contact', 'day',
       'month', 'pdays', 'poutcome'],
      dtype='object')
XGBoost 모델 정확도: 0.7160346372051358


In [39]:
# 필요한 라이브러리 임포트
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import xgboost as xgb

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 'job', 'education', 'contact', 'poutcome' 칼럼에서 'unknown' 값을 결측치로 처리
data['job'] = data['job'].replace('unknown', None)
data['education'] = data['education'].replace('unknown', None)
data['contact'] = data['contact'].replace('unknown', None)
data['poutcome'] = data['poutcome'].replace('unknown', None)

# SimpleImputer로 결측치를 최빈값으로 대체
imputer = SimpleImputer(strategy='most_frequent')
data[['job', 'education', 'contact', 'poutcome']] = imputer.fit_transform(data[['job', 'education', 'contact', 'poutcome']])

# 'duration' 변수 제거 (사후 데이터)
data = data.drop('duration', axis=1)

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# 1. 로지스틱 회귀 모델을 사용하여 p-value를 기반으로 유의미한 변수 선택
# 상수항 추가
X_with_constant = sm.add_constant(X)

# 로지스틱 회귀 모델 적합
logit_model = sm.Logit(y, X_with_constant).fit()

# p-value가 0.05 이하인 변수 선택
selected_features = logit_model.pvalues[logit_model.pvalues < 0.05].index.tolist()

# 상수항 제거 (constant 제거)
if 'const' in selected_features:
    selected_features.remove('const')

print(f"선택된 유의미한 변수들: {selected_features}")

# 2. 선택된 변수로 분류 모델 학습 (XGBoost 예시)
X_selected = X[selected_features]  # 선택된 변수로 새로운 데이터셋 구성

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# 분류 모델 (XGBoost) 학습 및 평가
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = xgb_classifier.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost 모델 정확도: {accuracy}")


Optimization terminated successfully.
         Current function value: 0.619286
         Iterations 6
선택된 유의미한 변수들: ['marital', 'education', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'campaign', 'pdays', 'previous', 'poutcome']
XGBoost 모델 정확도: 0.7222222222222222


In [43]:
# 필요한 라이브러리 임포트
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import xgboost as xgb

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 'job'과 'age' 칼럼에서 'unknown' 값을 결측치로 처리
data['job'] = data['job'].replace('unknown', None)

# SimpleImputer로 job 칼럼의 결측치를 최빈값으로 대체
imputer = SimpleImputer(strategy='most_frequent')
data['job'] = imputer.fit_transform(data[['job']])

# 'duration' 변수 제거 (사후 데이터)
data = data.drop('duration', axis=1)

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature와 Target 설정
X = data.drop('deposit', axis=1)
y = data['deposit']

# 1. 로지스틱 회귀 모델을 사용하여 p-value를 기반으로 유의미한 변수 선택
# 상수항 추가
X_with_constant = sm.add_constant(X)

# 로지스틱 회귀 모델 적합
logit_model = sm.Logit(y, X_with_constant).fit()

# p-value가 0.05 이하인 변수 선택
selected_features = logit_model.pvalues[logit_model.pvalues < 0.05].index.tolist()

# 상수항 제거 (constant 제거)
if 'const' in selected_features:
    selected_features.remove('const')

print(f"선택된 유의미한 변수들: {selected_features}")

# 2. 선택된 변수로 분류 모델 학습 (XGBoost 예시)
X_selected = X[selected_features]  # 선택된 변수로 새로운 데이터셋 구성

# Train/Test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# 분류 모델 (XGBoost) 학습 및 평가
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = xgb_classifier.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost 모델 정확도: {accuracy}")


ValueError: 2

In [48]:
# Feature Importance 기반 상위 10개의 피처 선택 및 모델 학습 코드:
# 필요한 라이브러리 임포트
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# 70세 이상은 이상치로 제거
data = data[data['age'] <= 70]

# 'job' 칼럼에서 'unknown' 값을 결측치로 처리
data['job'] = data['job'].replace('unknown', None)

# SimpleImputer로 job 칼럼의 결측치를 최빈값으로 대체 (문자열 형식으로 변환)
imputer = SimpleImputer(strategy='most_frequent', missing_values=None)
data['job'] = imputer.fit_transform(data[['job']].astype(str))

# 'duration' 변수 제거 (사후 데이터)
data = data.drop('duration', axis=1)

# Label Encoding for categorical columns
label_enc = LabelEncoder()
data['deposit'] = label_enc.fit_transform(data['deposit'])

# Categorical features to be label encoded
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
for col in categorical_columns:
    data[col] = label_enc.fit_transform(data[col])

# Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature


ValueError: 2