# 모듈 설치

In [None]:
!pip install -q catboost
!pip install -q xgboost
!pip install -q lightgbm
!pip install -q bayesian-optimization==1.4.2
!pip install -q missingno

# 데이터 셋 다운로드

In [None]:
!wget -P /content/dataset -q https://raw.githubusercontent.com/kangmg/Bank_Churn_classification/main/dataset/test.csv
!wget -P /content/dataset -q https://raw.githubusercontent.com/kangmg/Bank_Churn_classification/main/dataset/sample_submission.csv
!wget -P /content/dataset -q https://raw.githubusercontent.com/kangmg/Bank_Churn_classification/main/dataset/train.csv
!wget -P /content/dataset -q https://raw.githubusercontent.com/kangmg/Bank_Churn_classification/main/dataset/Churn_Modelling.csv

!echo "The dataset download is complete !"

# 모듈 호출하기

In [3]:
import pandas as pd
import numpy as np

import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import roc_auc_score, accuracy_score,roc_curve

from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

# 데이터 셋 불러오기

In [4]:
# Load the three CSV files, using their first column as the index.
train, test_df, train_add = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "/content/dataset/train.csv",
        "/content/dataset/test.csv",
        "/content/dataset/Churn_Modelling.csv",
    )
)

In [None]:
# Preview the first rows of each frame. Separate calls (instead of a tuple of
# display() calls) keep the cell from echoing a stray "(None, None, None)".
for frame in (train, test_df, train_add):
    display(frame.head())

In [None]:
# (rows, columns) of the frames loaded from train.csv, test.csv and Churn_Modelling.csv.
train.shape, test_df.shape, train_add.shape

In [None]:
# Outer-merge train with train_add. No `on=` is given, so pandas joins on
# every column the two frames have in common.
train_df = pd.merge(train, train_add, how="outer")
train_df.shape

In [None]:
%matplotlib inline
# Draw the missingno matrix for the merged frame to see where values are missing.
msno.matrix(train_df)

In [None]:
# Count the missing values per column after the merge
train_df.isna().sum()

In [None]:
# Drop every ROW that contains a missing value (dropna() with default arguments
# removes rows, not columns), then re-count to confirm nothing is missing.
train_df = train_df.dropna()
train_df.isnull().sum()

- 피처 요약표 생성

In [None]:
def summary_feature_info( df ):
    """Build a one-row-per-column overview of ``df``.

    The returned frame has the columns ``feature`` (column name), ``타입``
    (dtype), ``결측치 수`` (number of missing values), ``고유값 수`` (number
    of unique values) and ``샘플1`` .. ``샘플3`` (the values of the first
    three rows).

    Sample rows are selected by position rather than by index label, so the
    function also works when the labels 0, 1, 2 are absent (for example after
    ``dropna`` removed them). If ``df`` has fewer than three rows, the sample
    columns that cannot be filled contain ``None``.
    """
    # Start from the dtypes; the column names become the 'feature' column.
    summary_df = pd.DataFrame(df.dtypes, columns=['타입'])
    summary_df.reset_index(inplace=True)
    summary_df.rename(columns={'index': 'feature'}, inplace=True)

    summary_df['결측치 수'] = df.isnull().sum().values
    summary_df['고유값 수'] = df.nunique().values

    row_count = len(df)
    for position in range(3):
        sample_column = f'샘플{position + 1}'
        if position < row_count:
            summary_df[sample_column] = df.iloc[position].values
        else:
            summary_df[sample_column] = None
    return summary_df

# Show the per-feature overview of train_df.
summary_feature_info(train_df)

# EDA

- 피처 유형별로 분류

In [12]:
# Feature groups used by the EDA cells: numeric features plotted as
# distributions, categorical features plotted as counts, and the target column.
contis = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]
cats = [
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
]
target = ['Exited']

## 타겟 데이터 분포 확인

In [None]:
# The target classes are imbalanced -> consider a stratified split.
# Two 10-colour seaborn palettes that the count plots alternate between.
palette1 = sns.color_palette(palette='colorblind', n_colors=10)
palette2 = sns.color_palette(palette='deep', n_colors=10)

def custom_countplot( ax, all_data_cnt ):
    """Label every bar of ``ax`` with its share of ``all_data_cnt`` in percent.

    Parameters:
        ax: object exposing ``patches`` (the bars) and ``text`` (e.g. the
            Axes returned by ``sns.countplot``).
        all_data_cnt: denominator of the percentage; also scales the vertical
            gap between a bar's top and its label (1 % of this value).

    Returns:
        The same ``ax`` so the call can be chained.
    """
    for bar in ax.patches:
        height = bar.get_height()
        # Skip empty bars. A NaN height would otherwise produce a "nan %"
        # label placed at a non-finite position.
        if height == 0 or np.isnan(height):
            continue
        rate = height / all_data_cnt * 100
        ax.text(
            x=bar.get_x() + bar.get_width() / 2,
            y=bar.get_y() + height + all_data_cnt * 0.01,
            s=f'{rate:1.1f} %',
            ha='center',
        )
    return ax


# Count plot of the target, each bar labelled with its share of all rows.
target_ax = sns.countplot(data=train_df, x='Exited', palette=palette1)
chart = custom_countplot(target_ax, len(train_df))
chart;

## 범주형 데이터 분포 확인

In [None]:
# One count plot per categorical feature (first four entries of `cats`),
# alternating between the two palettes; labels are shares of all rows.
plt.figure(figsize=(18, 6))
cat_palettes = (palette1, palette2, palette1, palette2)
for slot, (cat_col, cat_palette) in enumerate(zip(cats, cat_palettes), start=1):
    plt.subplot(1, 4, slot)
    cat_ax = sns.countplot(data=train_df, x=cat_col, palette=cat_palette)
    custom_countplot(cat_ax, train_df.shape[0])

In [None]:
# Same four categorical count plots, split by the target via hue='Exited'.
# The percentage labels are still relative to the total number of rows.
plt.figure(figsize=(18, 6))
hue_palettes = (palette1, palette2, palette1, palette2)
for slot, (cat_col, hue_palette) in enumerate(zip(cats, hue_palettes), start=1):
    plt.subplot(1, 4, slot)
    hue_ax = sns.countplot(data=train_df, x=cat_col, hue='Exited', palette=hue_palette)
    custom_countplot(hue_ax, train_df.shape[0])

## 연속형 데이터 분포 확인

In [None]:
plt.figure(figsize=(18, 12))

# Readable names for the hue levels, in ascending order of the 'Exited' value.
exited_labels = ['Not Exited', 'Exited']

for i, conti in enumerate(contis, start=1):
    plt.subplot(3, 3, i)
    ax = sns.histplot(data=train_df, x=conti, hue='Exited', kde=True, bins=30, palette='muted')
    plt.title(f'Distribution of {conti} by Exited', pad=20)  # gap between the title and the subplot
    plt.xlabel(conti, labelpad=10)  # gap between the x-axis label and the plot
    plt.ylabel('Frequency')

    # Rename the entries of the legend seaborn already created instead of
    # calling plt.legend(labels=[...]): passing labels alone pairs them with
    # the plotted artists purely by order, which can attach the class names
    # to the wrong series.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Exited')
        for legend_text, label in zip(legend.get_texts(), exited_labels):
            legend_text.set_text(label)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of each of the six numeric features on a 3x3 grid.
plt.figure(figsize=(18, 15))
for slot, numeric_col in enumerate(contis[:6], start=1):
    plt.subplot(3, 3, slot)
    sns.boxplot(x=train_df[numeric_col])

In [None]:
# Pairwise correlations of the numeric columns, including the target.
nums = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'EstimatedSalary', 'HasCrCard', 'IsActiveMember', 'Exited',
]
corr_matrix = train_df[nums].corr()
# Upper triangle (diagonal included) of the matrix, used as the heatmap mask.
upp_mat = np.triu(corr_matrix)
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="Reds", mask=upp_mat)

# 피처엔지니어링

## 불필요한 피처 제거

In [None]:
# Remove CustomerId and Surname from both frames
dropped_columns = ["CustomerId", "Surname"]
train_df.drop(columns=dropped_columns, inplace=True)
test_df.drop(columns=dropped_columns, inplace=True)

train_df.head(2)

## 파생변수 추가

In [None]:
# Derived-feature definition
def getFeats(df):
    """Add the derived feature columns to ``df`` in place and return ``df``.

    Columns added:
        IsSenior: 1 where ``Age`` >= 60, else 0.
        IsActive_by_CreditCard: ``HasCrCard`` * ``IsActiveMember``.
        Products_Per_Tenure: ``Tenure`` / ``NumOfProducts``.
        AgeCat: ``Age`` floor-divided by 20, stored as a categorical.
    """
    # Vectorized comparison instead of a per-element Python lambda.
    df['IsSenior']               = (df['Age'] >= 60).astype(int)
    df['IsActive_by_CreditCard'] = df['HasCrCard'] * df['IsActiveMember']
    # NOTE: despite the column name, this is tenure divided by product count.
    df['Products_Per_Tenure']    = df['Tenure'] / df['NumOfProducts']
    df['AgeCat']                 = (df["Age"] // 20).astype('int').astype('category')
    return df

# Apply the derived-feature function to both frames (getFeats adds the columns in place)
getFeats(train_df)
getFeats(test_df)

train_df.head()

## 타입 변환

In [None]:
# Column dtypes before the integer casts below.
train_df.dtypes

In [None]:
# Cast the flag columns to int in both the train and the test frame.
flag_columns = ('HasCrCard', 'IsActiveMember', 'IsActive_by_CreditCard')
for frame in (train_df, test_df):
    for flag_column in flag_columns:
        frame[flag_column] = frame[flag_column].astype(int)

In [None]:
# Column dtypes after the integer casts.
train_df.dtypes

## 피처 인코딩 진행

In [None]:
# One-hot encode Geography and AgeCat (the dummy columns are appended in that
# order, exactly as with two consecutive get_dummies calls).
one_hot_columns = ['Geography', 'AgeCat']
train_df = pd.get_dummies(data=train_df, columns=one_hot_columns, dtype=int)
test_df = pd.get_dummies(data=test_df, columns=one_hot_columns, dtype=int)

# Encode Gender as a binary flag
gender_codes = {"Male": 0, "Female": 1}
train_df["Gender"] = train_df["Gender"].map(gender_codes)
test_df["Gender"] = test_df["Gender"].map(gender_codes)

# The two frames were encoded independently, so a category level present in
# only one of them would yield different dummy columns. Align the test frame
# to the training feature columns (everything except the target): levels that
# are missing in test become all-zero columns, levels unknown to train are
# dropped.
feature_columns = [column for column in train_df.columns if column != 'Exited']
test_df = test_df.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Preview the first rows of train_df.
train_df.head()

# 모델링

In [None]:
# Separate the target from the features.
y = train_df['Exited']
X = train_df.drop('Exited', axis=1)

# Based on the EDA: stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

## 부스팅 아닌 모델

In [None]:
# Baseline (non-boosting) classifiers, all with default settings.
models = {
    'LogisticRegression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
}

# Hold-out ROC AUC per model name.
model_scores = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Score with the predicted probability of the positive class.
    positive_proba = model.predict_proba(X_test)[:, 1]
    holdout_auc = roc_auc_score(y_test, positive_proba)
    model_scores[model_name] = holdout_auc

    print(f"Test AUC for {model_name}:", holdout_auc)
    print('----------------------------------------------------')

In [None]:
# Compare the baseline scores; the RandomForest model, which scored highest,
# is tuned next.
score_labels = list(model_scores.keys())
score_values = list(model_scores.values())

plt.figure(figsize=(10, 6))
sns.barplot(x=score_labels, y=score_values, palette=palette1)
plt.title('Model AUC ROC Scores')
plt.show()

### RandomForest

In [None]:
# RandomForestClassifier: 5-fold cross-validation of a single configuration

pipeline_rf = Pipeline(steps=[
    ('scaler', None),
    ('clf', RandomForestClassifier()),
])
param_rf = {
    'scaler': [None],
    'clf__n_estimators': [220],
    'clf__max_depth': [10],
}

cls_rf = GridSearchCV(pipeline_rf, param_rf, scoring='roc_auc', cv=5)
cls_rf.fit(X_train, y_train)

# best_score_ is the mean cross-validated ROC AUC of the selected parameters.
print("Best parameters:", cls_rf.best_params_)
print("Train Best score (AUC):", cls_rf.best_score_)

# Hold-out ROC AUC from the positive-class probability of the refit pipeline.
best_rf = cls_rf.best_estimator_
y_pred_rf = best_rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, y_pred_rf)
print("Test Score (AUC) : ", auc_rf )

# Output noted from a previous run:
# Best parameters: {'clf__max_depth': 10, 'clf__n_estimators': 220, 'scaler': None}
# Train Best score (AUC): 0.8836905775473195
# Test Score (AUC) :  0.8841045450038525

## 부스팅 모델

### XGBClassifier

In [None]:
# XGBClassifier: 5-fold cross-validation of a single configuration

pipeline_xgboost = Pipeline(steps=[
    ('scaler', None),
    ('clf', XGBClassifier()),
])

param_xgboost = {
    'scaler': [None],
    'clf__learning_rate': [0.2],
    'clf__max_depth': [4],
}

cls_xgb = GridSearchCV(pipeline_xgboost, param_xgboost, scoring='roc_auc', cv=5)
cls_xgb.fit(X_train, y_train)

# best_score_ is the mean cross-validated ROC AUC of the selected parameters.
print("Best parameters:", cls_xgb.best_params_)
print("Train Best score (AUC):", cls_xgb.best_score_)

# Hold-out ROC AUC from the positive-class probability of the refit pipeline.
best_xgb = cls_xgb.best_estimator_
y_pred_xgb = best_xgb.predict_proba(X_test)[:, 1]
auc_xgb = roc_auc_score(y_test, y_pred_xgb)
print("Test Score (AUC) : ", auc_xgb )

# Output noted from a previous run:
# Best parameters: {'clf__learning_rate': 0.2, 'clf__max_depth': 4, 'scaler': None}
# Train Best score (AUC): 0.8881784408711055
# Test Score (AUC) :  0.8892143578251673

### LGBMClassifier

In [None]:
# LGBMClassifier: 5-fold cross-validation of a single configuration

pipeline_lightgbm = Pipeline(steps=[
    ('scaler', None),
    ('clf', LGBMClassifier()),
])

param_lightgbm = {
    'scaler': [None],
    'clf__learning_rate': [0.1],
    'clf__max_depth': [4],
}

cls_LGB = GridSearchCV(pipeline_lightgbm, param_lightgbm, scoring='roc_auc', cv=5)
cls_LGB.fit(X_train, y_train)

# best_score_ is the mean cross-validated ROC AUC of the selected parameters.
print("Best parameters:", cls_LGB.best_params_)
print("Train Best score (AUC):", cls_LGB.best_score_)

# Hold-out ROC AUC from the positive-class probability of the refit pipeline.
best_lgb = cls_LGB.best_estimator_
y_pred_LGB = best_lgb.predict_proba(X_test)[:, 1]
auc_LGB = roc_auc_score(y_test, y_pred_LGB)
print("Test Score (AUC) : ", auc_LGB )

# Output noted from a previous run:
# Best parameters: {'clf__learning_rate': 0.1, 'clf__max_depth': 4, 'scaler': None}
# Train Best score (AUC): 0.8880022543889072
# Test Score (AUC) :  0.8885257258508887

### CatBoostClassifier

In [None]:
# CatBoostClassifier: 5-fold cross-validation of a single configuration
# (the grid swaps a MinMaxScaler into the 'scaler' step)

catboost_clf = CatBoostClassifier(
    logging_level="Silent",
    eval_metric="AUC",
    loss_function="Logloss",
)
pipeline_catboost = Pipeline(steps=[
    ('scaler', None),
    ('clf', catboost_clf),
])

param_catboost = {
    'scaler': [MinMaxScaler()],
    'clf__learning_rate': [0.04],
    'clf__depth': [4],
}

cls_catboost = GridSearchCV(pipeline_catboost, param_catboost, scoring='roc_auc', cv=5)
cls_catboost.fit(X_train, y_train)

# best_score_ is the mean cross-validated ROC AUC of the selected parameters.
print("Best parameters:", cls_catboost.best_params_)
print("Train Best score (AUC):", cls_catboost.best_score_)

# Hold-out ROC AUC from the positive-class probability of the refit pipeline.
best_catboost = cls_catboost.best_estimator_
y_pred_catboost = best_catboost.predict_proba(X_test)[:, 1]
auc_catboost = roc_auc_score(y_test, y_pred_catboost)
print("Test Score (AUC) : ", auc_catboost )

# Output noted from a previous run:
# Best parameters: {'clf__depth': 4, 'clf__learning_rate': 0.04, 'scaler': MinMaxScaler()}
# Train Best score (AUC): 0.8882883004371456
# Test Score (AUC) :  0.889206326181815

## 그 이외의 모델

### QuadraticDiscriminantAnalysis

In [None]:
# QuadraticDiscriminantAnalysis: 5-fold cross-validation of a single configuration

pipeline_qda = Pipeline(steps=[
    ('scaler', None),
    ('clf', QuadraticDiscriminantAnalysis()),
])

# Hyper-parameter grid (the 'scaler' step is replaced by a RobustScaler)
param_qda = {
    'scaler': [RobustScaler()],
    'clf__reg_param': [0.1],
}

# Configure and run the grid search
cls_qda = GridSearchCV(pipeline_qda, param_qda, scoring='roc_auc', cv=5)
cls_qda.fit(X_train, y_train)

# Report the selected parameters and their mean cross-validated ROC AUC
print("Best parameters:", cls_qda.best_params_)
print("Train Best score (AUC):", cls_qda.best_score_)

# Predict on the hold-out split and compute its ROC AUC
best_qda = cls_qda.best_estimator_
y_pred_qda = best_qda.predict_proba(X_test)[:, 1]
auc_qda = roc_auc_score(y_test, y_pred_qda)
print("Test Score (AUC) : ", auc_qda)

# Output noted from a previous run:
# Best parameters: {'clf__reg_param': 0.1, 'scaler': RobustScaler()}
# Train Best score (AUC): 0.8478037285002824
# Test Score (AUC) :  0.8449149630470922

### AdaBoostClassifier

In [None]:
# AdaBoostClassifier: 5-fold cross-validation of a single configuration

pipeline_adaboost = Pipeline(steps=[
    ('scaler', None),
    ('clf', AdaBoostClassifier()),
])

# Hyper-parameter grid (the 'scaler' step is replaced by a RobustScaler)
param_adaboost = {
    'scaler': [RobustScaler()],
    'clf__n_estimators': [250],
    'clf__learning_rate': [1.0],
}

# Configure and run the grid search
cls_adaboost = GridSearchCV(pipeline_adaboost, param_adaboost, scoring='roc_auc', cv=5)
cls_adaboost.fit(X_train, y_train)

# Report the selected parameters and their mean cross-validated ROC AUC
print("Best parameters:", cls_adaboost.best_params_)
print("Train Best score (AUC):", cls_adaboost.best_score_)

# Predict on the hold-out split and compute its ROC AUC
best_adaboost = cls_adaboost.best_estimator_
y_pred_adaboost = best_adaboost.predict_proba(X_test)[:, 1]
auc_adaboost = roc_auc_score(y_test, y_pred_adaboost)
print("Test Score (AUC) : ", auc_adaboost)

# Output noted from a previous run:
# Best parameters: {'clf__learning_rate': 1.0, 'clf__n_estimators': 250, 'scaler': RobustScaler()}
# Train Best score (AUC): 0.8792854483116999
# Test Score (AUC) :  0.879878212951271

### GaussianNB

In [None]:
# GaussianNB: 5-fold cross-validation behind a MinMaxScaler
pipeline_nb = Pipeline(steps=[
    ('scaler', None),
    ('clf', GaussianNB()),
])

# Hyper-parameter grid (only the 'scaler' step is set)
param_nb = {'scaler': [MinMaxScaler()]}

# Configure and run the grid search
cls_nb = GridSearchCV(pipeline_nb, param_nb, scoring='roc_auc', cv=5)
cls_nb.fit(X_train, y_train)

# Report the selected parameters and their mean cross-validated ROC AUC
print("Best parameters:", cls_nb.best_params_)
print("Train Best score (AUC):", cls_nb.best_score_)

# Predict on the hold-out split and compute its ROC AUC
best_nb = cls_nb.best_estimator_
y_pred_nb = best_nb.predict_proba(X_test)[:, 1]
auc_nb = roc_auc_score(y_test, y_pred_nb)
print("Test Score (AUC) : ", auc_nb)

# Output noted from a previous run:
# Best parameters: {'scaler': MinMaxScaler()}
# Train Best score (AUC): 0.8162984607769876
# Test Score (AUC) :  0.8121932911602249

### SGDClassifier

In [None]:
# SGDClassifier: 5-fold cross-validation of a single configuration

pipeline_sgd = Pipeline(steps=[
    ('scaler', None),
    ('clf', SGDClassifier()),
])

param_sgd = {
    'scaler': [RobustScaler()],
    'clf__loss': ['modified_huber'],
    'clf__penalty': ['l2'],
    'clf__alpha': [0.1],
    'clf__learning_rate': ['optimal'],
}

cls_sgd = GridSearchCV(pipeline_sgd, param_sgd, scoring='roc_auc', cv=5)
cls_sgd.fit(X_train, y_train)

# best_score_ is the mean cross-validated ROC AUC of the selected parameters.
print("Best parameters:", cls_sgd.best_params_)
print("Train Best score (AUC):", cls_sgd.best_score_)

# Unlike the other cells, the hold-out AUC is computed from decision_function
# scores rather than from predict_proba.
best_sgd = cls_sgd.best_estimator_
y_pred_sgd = best_sgd.decision_function(X_test)
auc_sgd = roc_auc_score(y_test, y_pred_sgd)
print("Test Score (AUC) : ", auc_sgd)

# Output noted from a previous run:
# Best parameters: {'clf__alpha': 0.1, 'clf__learning_rate': 'optimal', 'clf__loss': 'modified_huber', 'clf__penalty': 'l2', 'scaler': RobustScaler()}
# Train Best score (AUC): 0.8277247967596943
# Test Score (AUC) :  0.8262353054483689

In [None]:
# Refit pipelines compared in the following cells, with their display names
# in the same order.
models = [
    cls_LGB.best_estimator_,
    cls_xgb.best_estimator_,
    cls_rf.best_estimator_,
    cls_adaboost.best_estimator_,
    cls_qda.best_estimator_,
    cls_sgd.best_estimator_,
]
names = [
    "cls_LGB",
    "cls_xgb",
    "cls_rf",
    "cls_adaboost",
    "cls_qda",
    "cls_sgd",
]

In [None]:
def calculate_accuracy(model, X, y):
    """Return the accuracy of ``model.predict(X)`` measured against ``y``."""
    return accuracy_score(y, model.predict(X))

# Print the hold-out accuracy of every model in `models`, labelled by its name.
for name, model in zip(names, models):
  print(name, " | accuracy score : ", calculate_accuracy(model, X_test, y_test))

In [None]:
# Keep only the models whose hold-out ROC AUC can be computed from
# predict_proba; report the others together with the reason they failed.
model_names = []
base_models = []
for model_name, base_model in zip(names, models):
    try:
        y_pred = base_model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_pred)
    except Exception as err:
        # Best effort: a failing model is skipped, not fatal. Catching
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        print(f"********* Failed model : {model_name} ***********")
        print(f"          reason : {type(err).__name__}: {err}")
    else:
        print(f"{model_name} \t\t Test Score (AUC) : {auc_score}")
        model_names.append(model_name)
        base_models.append(base_model)

In [None]:
# Display name -> (fresh estimator with the parameters chosen above,
# matplotlib format string used for its ROC curve).
als = {
    'RandomForestClassifier': (
        RandomForestClassifier(n_estimators=220, max_depth=10),
        '.-',
    ),
    'XGBClassifier': (
        XGBClassifier(max_depth=4, learning_rate=0.2),
        ':',
    ),
    'LGBMClassifier': (
        LGBMClassifier(max_depth=4, learning_rate=0.1),
        '-',
    ),
    'CatBoostClassifier': (
        CatBoostClassifier(max_depth=4, learning_rate=0.04),
        '.',
    ),
}

In [None]:
plt.figure(figsize=(7, 7))

# Fit each estimator on the training split and draw its hold-out ROC curve.
for al_nm, (model, line_style) in als.items():
    model.fit(X_train, y_train)
    # Last probability column = probability of the positive class.
    positive_proba = model.predict_proba(X_test)[:, -1]
    fpr, tpr, _ = roc_curve(y_test.values, positive_proba)
    plt.plot(fpr, tpr, line_style, label=al_nm)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()

# Render the figure
plt.show()

# 피처 수정 + CatBoost

In [None]:
# Reload the raw CSV files (first column as index) for the CatBoost pipeline.
train, test_df, train_add = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "/content/dataset/train.csv",
        "/content/dataset/test.csv",
        "/content/dataset/Churn_Modelling.csv",
    )
)

# Outer-merge both training sources, then drop every row with a missing value.
train_df = pd.merge(train, train_add, how="outer").dropna()

# Remove the CustomerId feature (Surname is kept in this variant).
for frame in (train_df, test_df):
    frame.drop(columns=["CustomerId"], inplace=True)

def getFeats(df):
    """Add the derived feature columns to ``df`` in place and return ``df``.

    Columns added:
        IsSenior: 1 where ``Age`` >= 60, else 0.
        IsActive_by_CreditCard: ``HasCrCard`` * ``IsActiveMember``.
        Products_Per_Tenure: ``Tenure`` / ``NumOfProducts``.
        AgeCat: ``Age`` floor-divided by 10, stored as a categorical.
    """
    # Vectorized comparison instead of a per-element Python lambda.
    df['IsSenior']               = (df['Age'] >= 60).astype(int)

    df['IsActive_by_CreditCard'] = df['HasCrCard'] * df['IsActiveMember']

    # NOTE: despite the column name, this is tenure divided by product count.
    df['Products_Per_Tenure']    = df['Tenure'] / df['NumOfProducts']

    df['AgeCat']                 = (df["Age"] // 10).astype('int').astype('category')

    return df

# Add the derived features to both frames (getFeats assigns the new columns in place).
getFeats(train_df)
getFeats(test_df)

In [None]:
# Separate the target from the features.
y = train_df['Exited']
X = train_df.drop('Exited', axis=1)

In [None]:
# train_df = pd.get_dummies(data = train_df, columns=['Geography'], dtype=int)
# train_df = pd.get_dummies(data = train_df, columns=['AgeCat'], dtype=int)
# train_df["Gender"] = train_df["Gender"].map({"Male":0, "Female":1})

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
# def my_func(depth, learning_rate, l2_leaf_reg, random_strength):
#     model = CatBoostClassifier( depth        = int(depth)
#                           ,learning_rate    = learning_rate
#                           ,l2_leaf_reg   = int(l2_leaf_reg)
#                           ,random_strength  = random_strength
#     )

#     model.fit( X_train, y_train )

#     y_pred = model.predict_proba(X_test)[:, 1]

#     return roc_auc_score(y_test, y_pred)
# bo = BayesianOptimization(f=my_func, pbounds=param_bounds, verbose=2, random_state=42 )
# bo.maximize( init_points = 10, n_iter=100, acq='ei', xi=0.01)
# max_params = bo.max['params']
# max_params

# {'depth': 3.421383194252043,
#  'l2_leaf_reg': 1.5592158346513352,
#  'learning_rate': 0.0658686340211863,
#  'random_strength': 1.3869079551546575}

In [None]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'Surname',
    'Geography',
    'Gender',
    'AgeCat',
]

In [None]:
# Stratified K-fold training of CatBoost: one model per fold, each scored on
# its validation fold and used to predict the test set.
folds = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
# One row of test predictions per fold. The row count comes from the splitter
# so it cannot drift from n_splits.
test_preds = np.empty((folds.get_n_splits(), len(test_df)))
auc_vals=[]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train, y_train,cat_features=cat_features)
    val_pool = Pool(X_val, y_val,cat_features=cat_features)

    clf = CatBoostClassifier(
    eval_metric='AUC',
    depth = 3,
    l2_leaf_reg = 1,
    learning_rate=0.06,
    random_strength = 1.38,
    iterations = 5000)

    # NOTE(review): the validation fold is both the eval_set during fitting
    # and the data the fold AUC is reported on - confirm this is intended.
    clf.fit(train_pool, eval_set=val_pool,verbose=300)

    # Validation ROC AUC of this fold, from the positive-class probability.
    y_pred_val = clf.predict_proba(X_val)[:,1]
    auc_val = roc_auc_score(y_val, y_pred_val)
    print("AUC for fold ",n_fold,": ",auc_val)
    auc_vals.append(auc_val)

    # Store this fold's test-set probabilities; they are averaged later.
    y_pred_test = clf.predict_proba(test_df)[:,1]
    test_preds[n_fold, :] = y_pred_test
    print("----------------")

In [None]:
# Mean of the per-fold validation AUCs (the cell evaluates to a (label, value) tuple).
"Mean AUC: ",np.mean(auc_vals)

In [None]:
# ROC curve of `clf` on X_val / y_val, i.e. the model and validation split
# left over from the last fold of the loop above.
val_proba = clf.predict_proba(X_val)
fpr, tpr, _ = roc_curve(y_val.values, val_proba[:, -1])

plt.plot(fpr, tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.show()

In [None]:
# Feature importances of `clf` (the model from the last fold), largest first.
features = X.columns
importance = clf.feature_importances_

unsorted_importances = pd.Series(importance, index=features)
cat_importances = unsorted_importances.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(6, 4))
cat_importances.plot(kind='bar', ax=ax)
plt.title(f"Feature Importances - Cat Model")
plt.show()

In [None]:
# Fill the sample submission with the test probabilities averaged over the folds.
y_pred = test_preds.mean(axis=0)
sample = pd.read_csv('/content/dataset/sample_submission.csv').assign(Exited=y_pred)

In [None]:
# sample.to_csv('clf_v30_20240424.csv', index = False)