In [2]:
# !pip install numpy
# !pip install pandas
# !pip install matplotlib
# !pip install seaborn
# !pip install sklearn
# !pip install imbalanced-learn
# !pip install lightgbm
# !pip install xgboost

Collecting numpy
  Downloading numpy-1.21.6-cp37-cp37m-win_amd64.whl (14.0 MB)
     ---------------------------------------- 14.0/14.0 MB 9.2 MB/s eta 0:00:00
Installing collected packages: numpy
Successfully installed numpy-1.21.6
Collecting pandas
  Downloading pandas-1.3.5-cp37-cp37m-win_amd64.whl (10.0 MB)
     ---------------------------------------- 10.0/10.0 MB 7.5 MB/s eta 0:00:00
Collecting pytz>=2017.3
  Downloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
     -------------------------------------- 507.9/507.9 kB 4.0 MB/s eta 0:00:00
Installing collected packages: pytz, pandas
Successfully installed pandas-1.3.5 pytz-2025.1
Collecting matplotlib
  Downloading matplotlib-3.5.3-cp37-cp37m-win_amd64.whl (7.2 MB)
     ---------------------------------------- 7.2/7.2 MB 10.7 MB/s eta 0:00:00
Collecting cycler>=0.10
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting pillow>=6.2.0
  Downloading Pillow-9.5.0-cp37-cp37m-win_amd64.whl (2.5 MB)
     ---------------------

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import Counter   

In [6]:
# 데이터 불러오기
df = pd.read_csv('./data/data.csv')
# df.head(3)

X = df.drop(['id','hire_state','b022', 'b023', 'b036', 'b038', 'b039', 'b040'],axis=1)
y = df['hire_state']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

먼저 모델 돌려봄

In [4]:
# 의사결정 나무
tree = DecisionTreeClassifier()

dt_clf = DecisionTreeClassifier(random_state=0, max_depth=3)   # max_depth : 가지치기 (최대 깊이 지정)
dt_clf.fit(X_train, y_train)

print("의사결정나무 정확도:", dt_clf.score(X_train, y_train), dt_clf.score(X_test, y_test))

의사결정나무 정확도: 0.7015742642026009 0.6918142160636387


In [5]:
# 랜덤 포레스트
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_features='sqrt',  # 특성의 일부만 사용
    random_state=42
)

rf_clf.fit(X_train, y_train)
print("랜덤 포레스트 정확도:", rf_clf.score(X_train, y_train), rf_clf.score(X_test, y_test))

랜덤 포레스트 정확도: 1.0 0.7151655119322555


In [6]:
# xgboost
xgb_clf = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=0
)

xgb_clf.fit(X_train, y_train)

y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)

print("xgboost 정확도:", accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test))

xgboost 정확도: 0.7396475017111568 0.7244033872209392


In [7]:
y_pred = dt_clf.predict(X_test)
confusion_matrix(y_test, y_pred, labels=[0, 1])

array([[ 274,  925],
       [ 276, 2422]])

70%대의 성능이 나오는 이유는 전부 y=1(취직함)으로 예측하기 때문으로 y값의 비율 차이로 발생한다고 추정됨

### oversampleing

In [None]:
counter = Counter(y)
print(counter)

smt = SMOTE(random_state=42)
X_new, y_new = smt.fit_resample(X, y)
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)

counter = Counter(y_new)
print(counter)
counter = Counter(y_res)
print(counter)

Counter({1: 10732, 0: 4853})
Counter({0: 10732, 1: 10732})
Counter({0: 10732, 1: 10732})


SMOTE는 가장 가까운 값 사이에 직성을 만들고 그 안에서 새로운 값을 뽑는 방식으로 범주형 변수에 맞지 않아서 RandomOverSampler 사용

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, random_state=0)

In [10]:
dt_clf = DecisionTreeClassifier(random_state=0, max_depth=3)   # max_depth : 가지치기 (최대 깊이 지정)
dt_clf.fit(X_train, y_train)

print(dt_clf.score(X_train, y_train), dt_clf.score(X_test, y_test))

0.6087712759348988 0.5972791651136787


In [11]:
y_pred = dt_clf.predict(X_test)
confusion_matrix(y_test, y_pred, labels=[0, 1])

array([[1348, 1367],
       [ 794, 1857]])

정확도는 떨어졌지만 예측을 고르게 하는 것으로 문제가 해결됨

그리드서치

최적의 하이퍼 파라미터를 선택하여 높은 정확도의 모델을 만들기 위해 그리드 서치 사용

In [14]:
skf = StratifiedKFold(n_splits=5)
dt_clf = DecisionTreeClassifier(random_state=0)
params = {
    "max_depth": [12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
}
gscv_tree = GridSearchCV (dt_clf, params, scoring ='accuracy', cv = skf)
gscv_tree.fit(X_train, y_train)
print(gscv_tree.best_estimator_)
# pd.DataFrame(gscv_tree.cv_results_)

DecisionTreeClassifier(max_depth=26, random_state=0)


In [15]:
# 랜덤 포레스트
rf_clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)
rf_clf.fit(X_train, y_train)
print("Random Forest Accuracy:", rf_clf.score(X_test, y_test))

Random Forest Accuracy: 0.8591129332836377


In [9]:
skf = StratifiedKFold(n_splits=5)
rf_clf = RandomForestClassifier(random_state=0)
# params = {
#     "max_depth": [5,10,15,20,25,30,35],
#     "n_estimators": [100,200,300,400,500,800,1000]
# }
params = {
    "max_depth": [26,27,28,29,30,31,32,33,34],
    "n_estimators": [450,500,550,600,650,700]
}
gscv_rf = GridSearchCV (rf_clf, params, scoring ='accuracy', cv = skf)
gscv_rf.fit(X_train, y_train)
print(gscv_rf.best_estimator_)

KeyboardInterrupt: 

In [17]:
pd.DataFrame(gscv_rf.cv_results_)

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [29]:
# xgboost
xgb_clf = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=0
)

xgb_clf.fit(X_train, y_train)

y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)

print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))
# print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))
confusion_matrix(y_test, y_pred_test, labels=[0, 1])

0.6741210088209716
0.6634364517331346
              precision    recall  f1-score   support

           0       0.68      0.63      0.65      2715
           1       0.65      0.70      0.67      2651

    accuracy                           0.66      5366
   macro avg       0.66      0.66      0.66      5366
weighted avg       0.66      0.66      0.66      5366



array([[1707, 1008],
       [ 798, 1853]], dtype=int64)

In [30]:
skf = StratifiedKFold(n_splits=5)
xgb_clf = XGBClassifier(random_state=0)
params = {
    "max_depth": [3,5,7,10,15,20,25],
    "n_estimators": [100, 300, 500],
    'learning_rate' : [0.01,0.05,0.1]
}
gscv_xg = GridSearchCV (xgb_clf, params, scoring ='accuracy', cv = skf)
gscv_xg.fit(X_train, y_train)
print(gscv_xg.best_estimator_)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=20, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=500,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)


In [34]:
skf = StratifiedKFold(n_splits=5)
xgb_clf = XGBClassifier(random_state=0)
params = {
    "max_depth": [18,19,20,21,22,23],
    "n_estimators": [400,450, 500,550,600,700],
    'learning_rate' : [0.05]
}
gscv_xg = GridSearchCV (xgb_clf, params, scoring ='accuracy', cv = skf)
gscv_xg.fit(X_train, y_train)
print(gscv_xg.best_estimator_)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=23, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=700,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)


In [35]:
skf = StratifiedKFold(n_splits=5)
xgb_clf = XGBClassifier(random_state=0)
params = {
    "max_depth": [23,24,25],
    "n_estimators": [650,700,800,1000],
    'learning_rate' : [0.05]
}
gscv_xg = GridSearchCV (xgb_clf, params, scoring ='accuracy', cv = skf)
gscv_xg.fit(X_train, y_train)
print(gscv_xg.best_estimator_)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=23, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=800,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)


In [32]:
pd.DataFrame(gscv_xg.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.180436,0.006167,0.005074,0.000435,0.01,3,100,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.630435,0.631677,0.622981,0.624107,0.622554,0.626351,0.003895,63
1,0.518502,0.022714,0.005011,0.000315,0.01,3,300,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.655901,0.644099,0.641925,0.644921,0.633737,0.644117,0.007100,61
2,0.828115,0.065859,0.005504,0.000552,0.01,3,500,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.660870,0.653727,0.645963,0.652066,0.645853,0.651696,0.005575,59
3,0.283682,0.020203,0.005204,0.000675,0.01,5,100,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.655901,0.631988,0.650932,0.641504,0.637154,0.643496,0.008784,62
4,0.782597,0.035522,0.005908,0.000585,0.01,5,300,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.666149,0.659006,0.663665,0.667599,0.659211,0.663126,0.003514,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,3.673918,0.069603,0.016322,0.002087,0.1,20,300,"{'learning_rate': 0.1, 'max_depth': 20, 'n_est...",0.806522,0.806832,0.805590,0.807083,0.797142,0.804634,0.003780,4
59,5.443889,0.077368,0.024132,0.002773,0.1,20,500,"{'learning_rate': 0.1, 'max_depth': 20, 'n_est...",0.807143,0.807453,0.804348,0.804598,0.797453,0.804199,0.003604,6
60,1.964846,0.148565,0.008242,0.001166,0.1,25,100,"{'learning_rate': 0.1, 'max_depth': 25, 'n_est...",0.808696,0.804658,0.803106,0.800870,0.791550,0.801776,0.005716,14
61,4.189157,0.066287,0.018003,0.002367,0.1,25,300,"{'learning_rate': 0.1, 'max_depth': 25, 'n_est...",0.806832,0.804037,0.801863,0.805530,0.799006,0.803454,0.002770,8


In [37]:
# LightGBM
from lightgbm import LGBMClassifier
skf = StratifiedKFold(n_splits=5)
lgbm_clf = LGBMClassifier(random_state=0, early_stopping_rounds=100)
params = {
    "n_estimators": [1000,1500,2000],
    'learning_rate' : [0.1,0.15,0.2,0.5]
}
gscv_lgbm = GridSearchCV (lgbm_clf, params, scoring ='accuracy', cv = skf)
gscv_lgbm.fit(X_train, y_train, eval_set=(X_test, y_test))
print(gscv_lgbm.best_estimator_)
pd.DataFrame(gscv_lgbm.cv_results_)

[LightGBM] [Info] Number of positive: 6465, number of negative: 6413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 841
[LightGBM] [Info] Number of data points in the train set: 12878, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502019 -> initscore=0.008076
[LightGBM] [Info] Start training from score 0.008076
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[996]	valid_0's binary_logloss: 0.471651
[LightGBM] [Info] Number of positive: 6465, number of negative: 6413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.765491,0.021304,0.031073,0.001533,0.1,1000,"{'learning_rate': 0.1, 'n_estimators': 1000}",0.786646,0.786025,0.790373,0.792793,0.787512,0.78867,0.002543,6
1,0.991288,0.086752,0.03998,0.005023,0.1,1500,"{'learning_rate': 0.1, 'n_estimators': 1500}",0.787888,0.78913,0.791615,0.789376,0.792171,0.790036,0.001608,1
2,0.989509,0.081493,0.039817,0.004039,0.1,2000,"{'learning_rate': 0.1, 'n_estimators': 2000}",0.787888,0.78913,0.791615,0.789376,0.792171,0.790036,0.001608,1
3,0.630683,0.019075,0.022881,0.001632,0.15,1000,"{'learning_rate': 0.15, 'n_estimators': 1000}",0.786957,0.789441,0.792547,0.794967,0.783162,0.789415,0.004142,3
4,0.646376,0.018817,0.02372,0.0025,0.15,1500,"{'learning_rate': 0.15, 'n_estimators': 1500}",0.786957,0.789441,0.792547,0.794967,0.783162,0.789415,0.004142,3
5,0.685742,0.052163,0.024275,0.001837,0.15,2000,"{'learning_rate': 0.15, 'n_estimators': 2000}",0.786957,0.789441,0.792547,0.794967,0.783162,0.789415,0.004142,3
6,0.532945,0.027392,0.01973,0.003197,0.2,1000,"{'learning_rate': 0.2, 'n_estimators': 1000}",0.785714,0.783851,0.78882,0.790308,0.775707,0.78488,0.005116,7
7,0.529747,0.029764,0.017322,0.001141,0.2,1500,"{'learning_rate': 0.2, 'n_estimators': 1500}",0.785714,0.783851,0.78882,0.790308,0.775707,0.78488,0.005116,7
8,0.524308,0.022411,0.020594,0.005851,0.2,2000,"{'learning_rate': 0.2, 'n_estimators': 2000}",0.785714,0.783851,0.78882,0.790308,0.775707,0.78488,0.005116,7
9,0.236598,0.029743,0.007111,0.001011,0.5,1000,"{'learning_rate': 0.5, 'n_estimators': 1000}",0.779503,0.783851,0.771118,0.752097,0.76794,0.770902,0.010992,10


In [83]:
print(gscv_lgbm.best_estimator_)

LGBMClassifier(early_stopping_rounds=100, n_estimators=1500, random_state=0)


위의 4가지 모델의 최적 파라미터를 사용하여 5-foldvalidation을 이용하여 정확도, 정밀도, 재현율을 비교하여 최종 모델을 선정

In [3]:
# 최종 모델 비교
dt_clf = DecisionTreeClassifier(max_depth=25, random_state=0)
rf_clf = RandomForestClassifier(max_depth=34, n_estimators=550, random_state=0)
xgb_clf = XGBClassifier(learning_rate=0.05, max_depth=23, n_estimators=800, random_state=0)
lgbm_clf = LGBMClassifier(n_estimators=1500, learning_rate=0.1, random_state=0)

stratified_kfold = StratifiedKFold(n_splits=5)

# k번 반복하면서 평가한 정확도를 저장할 배열
dt_accuracy = []
dt_precision = []
dt_recall = []
rf_accuracy = []
rf_precision = []
rf_recall = []
xgb_accuracy = []
xgb_precision = []
xgb_recall = []
lgbm_accuracy = []
lgbm_precision = []
lgbm_recall = []

for train_index, val_index in stratified_kfold.split(X_new, y_new):
    X_train, y_train = X_new.iloc[train_index], y_new.iloc[train_index]
    X_val, y_val = X_new.iloc[val_index], y_new.iloc[val_index]

    dt_clf.fit(X_train, y_train)
    rf_clf.fit(X_train, y_train)
    xgb_clf.fit(X_train, y_train)
    lgbm_clf.fit(X_train, y_train)

    y_pred_dt = dt_clf.predict(X_val)   # 검증 데이터로 예측
    y_pred_rf = rf_clf.predict(X_val)   # 검증 데이터로 예측
    y_pred_xgb = xgb_clf.predict(X_val)   # 검증 데이터로 예측
    y_pred_lgbm = lgbm_clf.predict(X_val)   # 검증 데이터로 예측

    dt_accuracy.append(accuracy_score(y_val, y_pred_dt)) 
    rf_accuracy.append(accuracy_score(y_val, y_pred_rf)) 
    xgb_accuracy.append(accuracy_score(y_val, y_pred_xgb)) 
    lgbm_accuracy.append(accuracy_score(y_val, y_pred_lgbm)) 
    
    dt_precision.append(precision_score(y_val, y_pred_dt)) 
    rf_precision.append(precision_score(y_val, y_pred_rf)) 
    xgb_precision.append(precision_score(y_val, y_pred_xgb)) 
    lgbm_precision.append(precision_score(y_val, y_pred_lgbm)) 
    
    dt_recall.append(recall_score(y_val, y_pred_dt)) 
    rf_recall.append(recall_score(y_val, y_pred_rf)) 
    xgb_recall.append(recall_score(y_val, y_pred_xgb)) 
    lgbm_recall.append(recall_score(y_val, y_pred_lgbm)) 


print("의사결정나무 정확도:", np.mean(dt_accuracy))
print("의사결정나무 정밀도:", np.mean(dt_precision))
print("의사결정나무 재현율:", np.mean(dt_recall))

print("랜덤포레스트 정확도:", np.mean(rf_accuracy))
print("랜덤포레스트 정밀도:", np.mean(rf_precision))
print("랜덤포레스트 재현율:", np.mean(rf_recall))

print("xgboost 정확도:", np.mean(xgb_accuracy))
print("xgboost 정밀도:", np.mean(xgb_precision))
print("xgboost 재현율:", np.mean(xgb_recall))

print("lightGBM 정확도:", np.mean(lgbm_accuracy))
print("lightGBM 정밀도:", np.mean(lgbm_precision))
print("lightGBM 재현율:", np.mean(lgbm_recall))

NameError: name 'DecisionTreeClassifier' is not defined

랜덤포레스트 최종모델로 선정

----------------------
추가적인 모델 사용

- 범주형 변수를 라벨인코딩을 하면 로지스틱과 svm과 같은 수리적인 모델에서는 사용하면 안되지만 그냥 한번 해봄

In [21]:
# 로지스틱
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_scaled, y_train)

print(lr_clf.score(X_train_scaled, y_train), lr_clf.score(X_test_scaled, y_test))

0.7064231581562928 0.689340290719344


In [22]:
# SVM
from sklearn.svm import SVC

model = SVC(kernel="rbf")   # kernel 기본값 "rbf"
model.fit(X_train, y_train)

model.score(X_train, y_train), model.score(X_test, y_test)

(0.5663436451733135, 0.5566530003727171)

In [23]:
# MLP
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(
    hidden_layer_sizes=(100,100,100,100,100),
    activation='relu',
    solver='adam',
    max_iter=5000,
    random_state=42
)
mlp.fit(X_train, y_train)

pred = mlp.predict(X_test)
print(accuracy_score(y_test, pred))
confusion_matrix(y_test, pred, labels=[0, 1])

0.7083488632128214


array([[1873,  779],
       [ 786, 1928]], dtype=int64)

In [18]:
high_prob_data = pd.DataFrame({
    'branch': [1, 1],
    'found': [1, 1],
    'course': [1, 1],
    'daynight': [0, 0],
    'major': [1, 1],
    'school': [1, 1],
    'school_area': [1, 1],
    'sex': [0, 0],
    'age': [24, 24],
    'f009': [1, 1],
    'i001': [1, 1],
    'i033': [850, 900],
    'i037': [140, 140],
    'i042': [100, 100],
    'i046': [400, 400],
    'i066': [800, 0],
    'i076': [800, 800],
    'l001': [1, 1],
    'l009': [1, 1],
    'l016': [500, 500],
    'm002': [5, 5],
    'k110': [1, 1],
    'k004': [1, 1],
    'k007': [1, 1],
    'k009': [1, 1],
    'k011': [1, 1],
    'q001': [3, 3],
    'q002': [1, 1],
    'q003': [1, 1],
    'q004': [1, 1],
    'q006': [0, 0],
    'p001': [1, 1],
    'p026': [1, 1],
    'p029': [1, 1],
    'p036': [1, 1],
    'p045': [1, 1]
})
rf_clf = RandomForestClassifier(max_depth=34, n_estimators=550, random_state=0)
rf_clf.fit(X_res, y_res)

test_pred = rf_clf.predict(high_prob_data)

test_pred

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Unnamed: 0


In [None]:
high_prob_data = pd.DataFrame({
    'branch': [1, 1],     # 본분교
    'found': [1, 1],      # 국공사립
    'course': [1, 1],     # 학위과정
    'daynight': [0, 0],   # 주야간
    'major': [7, 7],      # 전공계열
    'school': [1, 1],     # 학교유형
    'school_area': [0, 0], #학교소재지역
    'sex': [0, 0],        # 성별
    'age': [23, 23],      # 연령
    'f009': [4, 4],       # 고등학교 계열
    'i001': [1, 1],       # 어학연수 경험
    'i033': [650, 650],   # 토익 점수
    'i037': [0, 0],   # 토익 스피킹 점수
    'i042': [0, 0],   # 토플 점수
    'i046': [0, 0],   # 텝스 점수
    'i066': [0, 0],     # 일본어 JLPT 점수
    'i076': [0, 0],   # 중국어 HSK 점수
    'l001': [1, 1],       # 직업 훈련 여부
    'l009': [0, 0],       # 직업 훈련 종료 여부
    'l016': [20, 20],   # 직업 훈련 총 시간
    'm002': [1, 1],       # 자격증 개수
    'k110': [0, 0],       # NCS 준비 여부
    'k004': [0, 0],       # 직무적성검사 여부
    'k007': [0, 0],       # 공모전 수상
    'k009': [0, 0],       # 대외 활동 여부
    'k011': [0, 0],       # 이력서 작성 및 면접훈련 교육 여부
    'q001': [1, 1],       # 현재 건강 상태
    'q002': [1, 1],       # 일주일 평균 운동 시간
    'q003': [10, 10],       # 하루 평균 수면시간
    'q004': [1, 1],       # 흡연 여부
    'q006': [0, 0],       # 음주 빈도
    'p001': [1, 1],       # 혼인 여부
    'p026': [1, 1],       # 아버님의 최종 학력
    'p029': [1, 1],       # 어머님의 최종 학력
    'p036': [1, 1],       # 부모님의 자산 규모
    'p045': [0, 0]        # 군 복무 경혐
})
rf_clf = RandomForestClassifier(max_depth=34, n_estimators=550, random_state=0)
rf_clf.fit(X_res, y_res)

test_pred = rf_clf.predict(high_prob_data)

test_pred