<a href="https://colab.research.google.com/github/ErdemAslans/ALL-IN-BANKER/blob/main/ALL_IN_BANKER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy scikit-learn seaborn matplotlib
!pip install xgboost lightgbm
!pip install optuna
!pip install imbalanced-learn
!pip install keras tensorflow
!pip install plotly dash
!pip install joblib


Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
######################################
# HOME CREDIT DEFAULT RISK COMPETITION XGBoost
######################################
# Reference from https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features

# Importing essential libraries
import numpy as np
import pandas as pd
import time
import gc
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
from xgboost.callback import EarlyStopping

# Console display: never truncate the column list and allow wide rows
pd.options.display.max_columns = None
pd.options.display.width = 500

# Silence these warning categories for the rest of the session
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore', RuntimeWarning)


# Defining timer to track progress
@contextmanager
def timer(title):
    """Context manager that reports how long the wrapped block took.

    When the ``with`` body finishes normally, prints
    ``"<title> - done in <N>s"`` where N is the elapsed wall-clock time
    rounded to whole seconds. If the body raises, the exception propagates
    and nothing is printed.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))


# Defining one-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every object-dtype column of ``df`` with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool
        Passed as ``dummy_na``: when True each encoded column also gets a
        dummy column marking missing values.

    Returns
    -------
    tuple
        ``(encoded_frame, new_columns)`` where ``new_columns`` lists the
        column names present in the result but not in the input.
    """
    columns_before = set(df.columns)
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns


# Defining Sin-cos transformation for cyclic features
def encode(df, col, max_val):
    """Add sine/cosine encodings of a cyclic column to ``df`` in place.

    The value in ``df[col]`` is mapped to the angle ``2*pi*value/max_val`` and
    two columns, ``<col>_SIN`` and ``<col>_COS``, are written to ``df``.
    The same (mutated) frame is returned.
    """
    angle = 2 * np.pi * df[col] / max_val
    for suffix, trig in (('_SIN', np.sin), ('_COS', np.cos)):
        df[col + suffix] = trig(angle)
    return df


# Defining dynamic rare encoding for column categories
def dyn_rare_encoder(df, columns, rare_percent):
    """Collapse infrequent categories of the given columns into ``'Other'``.

    For each column, a label whose share of all rows (in percent, measured
    against ``len(df)``) is below ``rare_percent`` is replaced by ``'Other'``.
    Missing values are never counted by ``value_counts`` and are left as-is.
    ``df`` is modified in place and returned.
    """
    total_rows = len(df)
    for name in columns:
        share_pct = df[name].value_counts() / total_rows * 100
        rare_labels = share_pct[share_pct < rare_percent].index
        is_rare = df[name].isin(rare_labels)
        df[name] = np.where(is_rare, 'Other', df[name])
    return df


# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    ``feature_importance_df_`` must have ``feature`` and ``importance``
    columns. Features are ranked by their mean importance; all rows belonging
    to the top 40 are drawn as a horizontal bar plot, which is written to
    ``model_importances.png`` and then closed.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index
    in_top = feature_importance_df_.feature.isin(top_features)
    plot_rows = feature_importance_df_.loc[in_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('Model Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('model_importances.png')
    plt.close()

#####################################
# Application Train and Test Data
#####################################
def application_train_test(num_rows=None, nan_as_category=True):
    """Load application_train.csv / application_test.csv and build features.

    Both files are read from the fixed Drive folder and concatenated into one
    frame, which is then cleaned, enriched with ``NEW_*`` features and
    one-hot encoded, so train and test rows go through identical
    preprocessing.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv`` for each file; ``None`` reads
        every row.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder`` (adds a missing-value dummy per
        encoded column when True).

    Returns
    -------
    pandas.DataFrame
        The combined, feature-engineered and one-hot encoded frame.
    """
    # Read and merge data
    df = pd.read_csv('/content/drive/MyDrive/Risk/application_train.csv', nrows=num_rows)
    test_df = pd.read_csv('/content/drive/MyDrive/Risk/application_test.csv', nrows=num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))

    df = pd.concat([df, test_df], ignore_index=True)
    del test_df  # no longer needed once concatenated

    # Remove applications whose CODE_GENDER is the placeholder 'XNA'.
    # The explicit copy makes df an independent frame, so the in-place edits
    # below never act on a view of the concatenated data.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    # Deleting FLAG_MOBIL because there is only 1 person without mobile phone;
    # FLAG_CONT_MOBILE is dropped together with it.
    df.drop(['FLAG_MOBIL', 'FLAG_CONT_MOBILE'], axis=1, inplace=True)

    # DAYS_EMPLOYED: 365243 is a placeholder -> NaN. The result is assigned
    # back because replace(inplace=True) on a column selection is not
    # guaranteed to write through to df (pandas copy-on-write).
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Fold rare NAME_INCOME_TYPE categories into similar, larger ones
    df['NAME_INCOME_TYPE'] = df['NAME_INCOME_TYPE'].replace({
        'Student': 'State servant',
        'Maternity leave': 'Pensioner',
        'Unemployed': 'Pensioner',
        'Businessman': 'Commercial associate',
    })

    # Dynamic rare encoding (threshold is the minimum share in percent)
    df = dyn_rare_encoder(df, ['ORGANIZATION_TYPE'], rare_percent=1.9)
    df = dyn_rare_encoder(df, ['NAME_TYPE_SUITE'], rare_percent=3.6)
    df = dyn_rare_encoder(df, ['OCCUPATION_TYPE'], rare_percent=1.5)
    df = dyn_rare_encoder(df, ['WALLSMATERIAL_MODE'], rare_percent=20)

    # Rare-encode NAME_HOUSING_TYPE. A row can only hold one of the two
    # values, so they must be matched with "either" (isin), not "both" (&),
    # which never matched anything.
    rare_housing = df['NAME_HOUSING_TYPE'].isin(['Office apartment', 'Co-op apartment'])
    df.loc[rare_housing, 'NAME_HOUSING_TYPE'] = 'Other'

    # Replace unknown family status with 'Married'
    df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].replace('Unknown', 'Married')

    # Collapse every non-null HOUSETYPE_MODE into a single "reported" label
    df.loc[df['HOUSETYPE_MODE'].notnull(), 'HOUSETYPE_MODE'] = 'house_type_reported'

    # Weekday names -> 1..7. map() always yields a numeric column (unmapped
    # values become NaN), which the trigonometric encoding below requires.
    weekday_dict = {'MONDAY': 1, 'TUESDAY': 2, 'WEDNESDAY': 3, 'THURSDAY': 4, 'FRIDAY': 5, 'SATURDAY': 6, 'SUNDAY': 7}
    df['WEEKDAY_APPR_PROCESS_START'] = df['WEEKDAY_APPR_PROCESS_START'].map(weekday_dict)

    # Sin-cos features for the cyclic columns. The hour cycle has 24 steps;
    # a period of 23 would encode hour 0 and hour 23 as the same point.
    df = encode(df, 'WEEKDAY_APPR_PROCESS_START', 7)
    df = encode(df, 'HOUR_APPR_PROCESS_START', 24)
    df.drop(['WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START'], axis=1, inplace=True)

    # New features (ratios)
    df['NEW_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['NEW_INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['NEW_ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['NEW_ANNUITY_CREDIT_RATIO'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    # Loan to Value Ratio (LVR)
    df['NEW_LVR'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']

    # LVR risk flag: 1 when LVR >= 0.80, 0 when below, NaN when LVR is NaN
    df.loc[df['NEW_LVR'] >= 0.80, 'NEW_LVR_RISK'] = 1
    df.loc[df['NEW_LVR'] < 0.80, 'NEW_LVR_RISK'] = 0

    # Mean and product of the external sources
    df["NEW_EXT_MEAN"] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['NEW_EXT_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']

    # Age in years (DAYS_BIRTH is divided by -365)
    df['NEW_AGE'] = df['DAYS_BIRTH'] / -365

    # Age segments: <=34, 35-54, >54
    df.loc[df['NEW_AGE'] <= 34, 'NEW_AGE_SEGMENT'] = 'AGE_GRP_1'
    df.loc[(df['NEW_AGE'] > 34) & (df['NEW_AGE'] <= 54), 'NEW_AGE_SEGMENT'] = 'AGE_GRP_2'
    df.loc[df['NEW_AGE'] > 54, 'NEW_AGE_SEGMENT'] = 'AGE_GRP_3'

    # Replace the FLAG_DOCUMENT_2..FLAG_DOCUMENT_21 column range by its row sum
    doc_columns = list(df.loc[:, 'FLAG_DOCUMENT_2':'FLAG_DOCUMENT_21'].columns)
    df['NEW_TOTAL_DOC_NUM'] = df[doc_columns].sum(axis=1)
    df.drop(doc_columns, axis=1, inplace=True)

    # Product-Credit-Salary relation
    df["NEW_PROD_CRED_SALARY"] = (df["AMT_GOODS_PRICE"] - df["AMT_CREDIT"]) / df["AMT_INCOME_TOTAL"]

    # NEW_ACCOMPANIED: 0 = unaccompanied, 1 = anything else, NaN = unknown.
    # The isnull() line must come last because NaN != 'Unaccompanied' is True.
    df.loc[df['NAME_TYPE_SUITE'] == 'Unaccompanied', 'NEW_ACCOMPANIED'] = 0
    df.loc[df['NAME_TYPE_SUITE'] != 'Unaccompanied', 'NEW_ACCOMPANIED'] = 1
    df.loc[df['NAME_TYPE_SUITE'].isnull(), 'NEW_ACCOMPANIED'] = np.nan

    # Social circle with both 30 and 60 days default (binary); rows where
    # only one of the two counters is positive stay NaN.
    df.loc[(df['DEF_30_CNT_SOCIAL_CIRCLE'] > 0) & (df['DEF_60_CNT_SOCIAL_CIRCLE'] > 0),
           'NEW_DEF_30&60_SOCIAL_CIRCLE'] = 1
    df.loc[(df['DEF_30_CNT_SOCIAL_CIRCLE'] == 0) & (df['DEF_60_CNT_SOCIAL_CIRCLE'] == 0),
           'NEW_DEF_30&60_SOCIAL_CIRCLE'] = 0

    # Label-encode the two-valued features as integer codes
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature] = pd.factorize(df[bin_feature])[0]

    # One-hot encode the remaining categorical features
    df, _ = one_hot_encoder(df, nan_as_category)
    gc.collect()
    return df


#####################################
# Bureau Data
#####################################
def bb__agg(num_rows=None, nan_as_category=True):
    """Aggregate bureau_balance.csv to one row per ``SK_ID_BUREAU``.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv``; ``None`` reads every row.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    tuple
        ``(bb_agg, bb_cat)``: the aggregated frame indexed by
        ``SK_ID_BUREAU`` with columns named ``<COLUMN>_<STAT>``, and the list
        of one-hot column names whose mean was aggregated.
    """
    bb = pd.read_csv('/content/drive/MyDrive/Risk/bureau_balance.csv', nrows=num_rows)

    # Did the record fall into DPD (days past due)?
    # 1 for statuses '1'..'5', 'X' is carried through as-is, 0 otherwise.
    dpd_statuses = ['1', '2', '3', '4', '5']
    bb['NEW_FLAG'] = bb['STATUS'].apply(lambda x: 1 if (x in dpd_statuses) else ("X" if x == "X" else 0))

    bb, bb_cat = one_hot_encoder(bb, nan_as_category)

    # The 'X' dummy of NEW_FLAG is not used as a feature. Guard both removals:
    # a sampled file (num_rows) may contain no 'X' status, in which case the
    # dummy does not exist and an unconditional drop/remove would raise.
    bb = bb.drop(columns='NEW_FLAG_X', errors='ignore')
    if 'NEW_FLAG_X' in bb_cat:
        bb_cat.remove('NEW_FLAG_X')

    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']

    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    # Flatten the (column, stat) MultiIndex into COLUMN_STAT names
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])

    del bb
    gc.collect()
    return bb_agg, bb_cat


def bureau_(num_rows=None, nan_as_category=True):
    """Load bureau.csv and add per-credit and per-client engineered columns.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv``; ``None`` reads every row.
    nan_as_category : bool
        Accepted for signature parity with the other loaders; not used here.

    Returns
    -------
    pandas.DataFrame
        One row per bureau credit, with ``CREDIT_CURRENCY`` removed and the
        new columns added. Categorical columns are NOT yet one-hot encoded.
    """
    bu = pd.read_csv('/content/drive/MyDrive/Risk/bureau.csv', nrows=num_rows)

    # Per-client number of Active and Closed credits and their shares.
    # Counting with boolean masks works even when one of the two statuses is
    # absent from a sampled file.
    client_id = bu['SK_ID_CURR']
    active_count = (bu['CREDIT_ACTIVE'] == 'Active').astype(int).groupby(client_id).transform('sum')
    closed_count = (bu['CREDIT_ACTIVE'] == 'Closed').astype(int).groupby(client_id).transform('sum')
    active_plus_closed = active_count + closed_count
    bu['CREDIT_ACTIVE_Active_Count'] = active_count
    bu['CREDIT_ACTIVE_Closed_Count'] = closed_count
    bu['CREDIT_ACTIVE_Active_ratio'] = active_count / active_plus_closed
    bu['CREDIT_ACTIVE_Closed_ratio'] = closed_count / active_plus_closed

    # NEW_DAYS_DIFF: gap between consecutive DAYS_CREDIT values of the same
    # client (a proxy for how frequently credits were taken). One global sort
    # replaces the former per-group apply; the result is aligned back to bu
    # through the row index. The first credit of each client gets 0.
    by_date = bu[['SK_ID_CURR', 'DAYS_CREDIT']].sort_values(['SK_ID_CURR', 'DAYS_CREDIT'])
    bu['NEW_DAYS_DIFF'] = by_date.groupby('SK_ID_CURR')['DAYS_CREDIT'].diff().fillna(0)

    # Early-termination flags for Active and Closed credits.
    # NOTE(review): rows that do not match stay NaN (never 0), so a later mean
    # over these columns can only be 1.0 or NaN - confirm this is intended.
    bu.loc[(bu['CREDIT_ACTIVE'] == 'Active') & (bu['DAYS_CREDIT_ENDDATE'] < 0), 'NEW_EARLY_ACTİVE'] = 1
    bu.loc[(bu['CREDIT_ACTIVE'] == 'Closed') & (
                abs(bu['DAYS_CREDIT_ENDDATE']) < abs(bu['DAYS_ENDDATE_FACT'])), 'NEW_EARLY_CLOSED'] = 1

    # Prolonged credits: collapse counts 1..9 into 1. Assigned back because
    # replace(inplace=True) on a column selection is not guaranteed to write
    # through to bu (pandas copy-on-write).
    prolong = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    bu['CNT_CREDIT_PROLONG'] = bu['CNT_CREDIT_PROLONG'].replace(prolong, 1)

    # How many distinct credit types the client has (computed before the
    # rare types are merged below)
    bu['NEW_BUREAU_LOAN_TYPES'] = bu.groupby('SK_ID_CURR')['CREDIT_TYPE'].transform('nunique')

    # Debt ratio (+1 in the denominator avoids division by zero)
    bu['NEW_DEPT_RATİO'] = bu['AMT_CREDIT_SUM_DEBT'] / (bu['AMT_CREDIT_SUM'] + 1)

    # Merge the listed credit types into 'others'
    credit_type = ['Loan for working capital replenishment',
                   'Loan for business development', 'Real estate loan',
                   'Unknown type of loan', 'Another type of loan',
                   'Cash loan (non-earmarked)', 'Loan for the purchase of equipment',
                   'Mobile operator loan', 'Interbank credit',
                   'Loan for purchase of shares (margin lending)']
    bu['CREDIT_TYPE'] = bu['CREDIT_TYPE'].replace(credit_type, 'others')

    # Annuity relative to the credit amount
    bu['NEW_AMT_ANNUITY_RATİO'] = bu['AMT_ANNUITY'] / bu['AMT_CREDIT_SUM']

    # Was the credit information updated recently? 'old' when the update is
    # more than 90 days back, otherwise (including NaN) 'new'.
    bu['NEWS_DAYS_CREDIT_UPDATE'] = np.where(bu['DAYS_CREDIT_UPDATE'] < -90, 'old', 'new')

    # Drop the CREDIT_CURRENCY column
    bu.drop('CREDIT_CURRENCY', inplace=True, axis=1)

    del by_date, active_count, closed_count, active_plus_closed
    gc.collect()
    return bu


def combine(bureau, bb_agg):
    """Attach the bureau_balance aggregates to the bureau rows.

    ``bb_agg`` is left-joined on ``SK_ID_BUREAU`` (matched against its index)
    and the key column is removed from the result. The input ``bureau`` frame
    is not modified; a new frame is returned.
    """
    joined = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    return joined.drop(columns=['SK_ID_BUREAU'])


# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows=None, nan_as_category=True):
    """Build per-client features from bureau.csv and bureau_balance.csv.

    The bureau rows are one-hot encoded, joined with the bureau_balance
    aggregates and grouped by ``SK_ID_CURR`` three times: over all credits
    (``BURO_`` prefix, numeric and categorical aggregations), over active
    credits only (``ACTIVE_`` prefix, numeric only) and over closed credits
    only (``CLOSED_`` prefix, numeric only). The three results are left-joined
    onto the all-credits frame, which is returned.
    """

    def _aggregate(frame, spec, prefix):
        # Group by client and flatten (column, stat) into PREFIX + COLUMN_STAT
        result = frame.groupby('SK_ID_CURR').agg(spec)
        result.columns = pd.Index(
            [prefix + column + "_" + stat.upper() for column, stat in result.columns.tolist()])
        return result

    bb_agg, bb_cat = bb__agg(num_rows, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau_(num_rows, nan_as_category), nan_as_category)
    bureau = combine(bureau, bb_agg)

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum'],
        "CREDIT_ACTIVE_Active_Count": ["mean"],
        "CREDIT_ACTIVE_Closed_Count": ["mean"],
        "CREDIT_ACTIVE_Active_ratio": ["mean"],
        "NEW_DAYS_DIFF": ['max', 'mean'],
        "NEW_EARLY_ACTİVE": ['mean'],
        "NEW_EARLY_CLOSED": ['mean'],
        "NEW_BUREAU_LOAN_TYPES": ['mean'],
        "NEW_DEPT_RATİO": ['max', 'mean'],
        "NEW_AMT_ANNUITY_RATİO": ['max', 'mean']
    }
    # The per-bureau means of the balance dummies are averaged again per client
    num_aggregations.update({col + "_MEAN": ['mean'] for col in bb_cat})

    # Bureau and bureau_balance categorical features
    cat_aggregations = {cat: ['mean'] for cat in bureau_cat}
    cat_aggregations.update({cat + "_MEAN": ['mean'] for cat in bb_cat})

    bureau_agg = _aggregate(bureau, {**num_aggregations, **cat_aggregations}, 'BURO_')

    # Active and closed credits - using only numerical aggregations
    for status_column, prefix in (('CREDIT_ACTIVE_Active', 'ACTIVE_'),
                                  ('CREDIT_ACTIVE_Closed', 'CLOSED_')):
        subset = bureau[bureau[status_column] == 1]
        subset_agg = _aggregate(subset, num_aggregations, prefix)
        bureau_agg = bureau_agg.join(subset_agg, how='left', on='SK_ID_CURR')
        del subset, subset_agg
        gc.collect()

    del bureau, bb_agg
    gc.collect()
    return bureau_agg


#####################################
# Previous Application Data
#####################################
def previous_app(num_rows=None, nan_as_category=True):
    """Build per-client features from previous_application.csv.

    The file is cleaned (placeholder values to NaN, uninformative columns
    dropped), enriched with ``NEW_*`` features, one-hot encoded and grouped
    by ``SK_ID_CURR`` three times: over all previous applications (``PREV_``
    prefix), over approved ones (``PREV_APPROVED_``, numeric only) and over
    refused ones (``PREV_REFUSED_``, numeric only).

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv``; ``None`` reads every row.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Aggregated features indexed by ``SK_ID_CURR``.
    """
    df_prev = pd.read_csv('/content/drive/MyDrive/Risk/previous_application.csv', nrows=num_rows)
    cat_cols = [col for col in df_prev.columns if df_prev[col].dtypes == 'O']

    # 365243 is a placeholder in the DAYS_* columns -> NaN. The result is
    # assigned back because replace(inplace=True) on a column selection is
    # not guaranteed to write through to df_prev (pandas copy-on-write).
    day_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']
    df_prev[day_cols] = df_prev[day_cols].replace(365243, np.nan)

    # 'XNA' / 'XAP' placeholders -> NaN in every categorical column
    placeholders = ['XNA', 'XAP']
    for col in cat_cols:
        df_prev.loc[df_prev[col].isin(placeholders), col] = np.nan

    # Delete columns that carry no information or are missing in over 80
    # percent of the data
    del_cols = ['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'DAYS_FIRST_DRAWING',
                'NAME_CASH_LOAN_PURPOSE', 'CODE_REJECT_REASON', 'FLAG_LAST_APPL_PER_CONTRACT',
                'NFLAG_LAST_APPL_IN_DAY', 'SELLERPLACE_AREA']
    df_prev.drop(del_cols, axis=1, inplace=True)

    # Feature engineering: 0/1 flags "application had <value> in <column> AND
    # was approved" (product type, client type, portfolio, yield group)
    is_approved = df_prev['NAME_CONTRACT_STATUS'] == 'Approved'
    approved_flags = [
        ('NAME_PRODUCT_TYPE', 'x-sell', 'NEW_X_SELL_APPROVED'),
        ('NAME_PRODUCT_TYPE', 'walk-in', 'NEW_WALK_IN_APPROVED'),
        ('NAME_CLIENT_TYPE', 'Repeater', 'NEW_REPEATER_APPROVED'),
        ('NAME_CLIENT_TYPE', 'New', 'NEW_NEWCUST_APPROVED'),
        ('NAME_CLIENT_TYPE', 'Refreshed', 'NEW_REFRESHED_APPROVED'),
        ('NAME_PORTFOLIO', 'Cards', 'NEW_CARDS_APPROVED'),
        ('NAME_PORTFOLIO', 'Cash', 'NEW_CASH_APPROVED'),
        ('NAME_PORTFOLIO', 'POS', 'NEW_POS_APPROVED'),
        ('NAME_YIELD_GROUP', 'high', 'NEW_HIGH_APPROVED'),
        ('NAME_YIELD_GROUP', 'middle', 'NEW_MIDDLE_APPROVED'),
        ('NAME_YIELD_GROUP', 'low_action', 'NEW_LOWACTION_APPROVED'),
        ('NAME_YIELD_GROUP', 'low_normal', 'NEW_LOWNORMAL_APPROVED'),
    ]
    for source_col, value, flag_col in approved_flags:
        df_prev[flag_col] = ((df_prev[source_col] == value) & is_approved).astype(int)

    # Application hour -> part of day; hours outside 0..23 (or NaN) stay NaN
    hour = df_prev['HOUR_APPR_PROCESS_START']
    day_parts = [
        ('night', (hour >= 0) & (hour <= 6)),
        ('morning', (hour > 6) & (hour <= 12)),
        ('afternoon', (hour > 12) & (hour <= 18)),
        ('evening', (hour > 18) & (hour < 24)),
    ]
    for label, in_range in day_parts:
        df_prev.loc[in_range, 'NEW_APP_DAY_TIME'] = label
    df_prev.drop('HOUR_APPR_PROCESS_START', axis=1, inplace=True)

    # Client applied with someone: 0 = unaccompanied, 1 = anything else,
    # NaN = unknown. The isnull() line must come last because
    # NaN != 'Unaccompanied' is True.
    df_prev.loc[df_prev['NAME_TYPE_SUITE'] == 'Unaccompanied', 'NEW_ACCOMPANIED'] = 0
    df_prev.loc[df_prev['NAME_TYPE_SUITE'] != 'Unaccompanied', 'NEW_ACCOMPANIED'] = 1
    df_prev.loc[df_prev['NAME_TYPE_SUITE'].isnull(), 'NEW_ACCOMPANIED'] = np.nan
    df_prev.drop('NAME_TYPE_SUITE', axis=1, inplace=True)

    # credit requested / credit given ratio (division by zero -> 0)
    df_prev['NEW_APP_CREDIT_RATIO'] = df_prev['AMT_APPLICATION'].div(df_prev['AMT_CREDIT']).replace(np.inf, 0)
    # loan installment / credit amount ratio
    df_prev['NEW_ANNUITY_CREDIT_RATIO'] = df_prev['AMT_ANNUITY'] / df_prev['AMT_CREDIT']
    # credit amount / goods price ratio (division by zero -> 0)
    df_prev['NEW_CREDIT_GOODS_RATIO'] = df_prev['AMT_CREDIT'].div(df_prev['AMT_GOODS_PRICE']).replace(np.inf, 0)
    # interest amount
    df_prev['NEW_AMT_INTEREST'] = df_prev['CNT_PAYMENT'] * df_prev['AMT_ANNUITY'] - df_prev['AMT_CREDIT']
    # interest ratio
    df_prev['NEW_INTEREST_RATIO'] = df_prev['NEW_AMT_INTEREST'] / df_prev['AMT_CREDIT']
    # needed amount / credit amount (may be removed later)
    df_prev['NEW_AMT_NEEDED_CREDIT_RATIO'] = (df_prev['AMT_GOODS_PRICE'] - df_prev['AMT_DOWN_PAYMENT']) / \
                                             df_prev['AMT_CREDIT']

    # Risk flag from NEW_CREDIT_GOODS_RATIO: 1 when >= 0.80, 0 when below,
    # NaN when the ratio is NaN
    df_prev.loc[df_prev['NEW_CREDIT_GOODS_RATIO'] >= 0.80, 'NEW_CREDIT_GOODS_RISK'] = 1
    df_prev.loc[df_prev['NEW_CREDIT_GOODS_RATIO'] < 0.80, 'NEW_CREDIT_GOODS_RISK'] = 0

    # Risky / non-risky applications that were nevertheless approved
    df_prev['NEW_RISK_APPROVED'] = ((df_prev['NEW_CREDIT_GOODS_RISK'] == 1) & is_approved).astype(int)
    df_prev['NEW_NONRISK_APPROVED'] = ((df_prev['NEW_CREDIT_GOODS_RISK'] == 0) & is_approved).astype(int)

    # Application weekday -> cyclic sin/cos encoding
    df_prev['WEEKDAY_APPR_PROCESS_START'] = df_prev['WEEKDAY_APPR_PROCESS_START'].map({
        'MONDAY': 1, 'TUESDAY': 2, 'WEDNESDAY': 3, 'THURSDAY': 4, 'FRIDAY': 5, 'SATURDAY': 6, 'SUNDAY': 7})
    df_prev['NEW_WEEKDAY_SIN'] = np.sin(2 * np.pi * df_prev['WEEKDAY_APPR_PROCESS_START'] / 7)
    df_prev['NEW_WEEKDAY_COS'] = np.cos(2 * np.pi * df_prev['WEEKDAY_APPR_PROCESS_START'] / 7)
    df_prev.drop('WEEKDAY_APPR_PROCESS_START', axis=1, inplace=True)

    # Rare encoding
    a = ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure',
         'Gardening', 'Other', 'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness',
         'Additional Service', 'Education', 'Weapon', 'Insurance', 'House Construction', 'Animals']
    df_prev["NAME_GOODS_CATEGORY"] = df_prev["NAME_GOODS_CATEGORY"].replace(a, 'others')

    b = ['Channel of corporate sales', 'Car dealer']
    df_prev["CHANNEL_TYPE"] = df_prev["CHANNEL_TYPE"].replace(b, 'Other_Channel')

    c = ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism']
    df_prev["NAME_SELLER_INDUSTRY"] = df_prev["NAME_SELLER_INDUSTRY"].replace(c, 'Others')

    # Rare payment types are merged within NAME_PAYMENT_TYPE itself; reading
    # from NAME_SELLER_INDUSTRY here would overwrite the payment type with
    # the seller industry.
    d = ['Non-cash from your account', 'Cashless from the account of the employer']
    df_prev["NAME_PAYMENT_TYPE"] = df_prev["NAME_PAYMENT_TYPE"].replace(d, 'Others')

    # One hot encoder
    new_df_prev, new_cat_cols = one_hot_encoder(df_prev, nan_as_category)

    # All "categorical" columns: the new dummies plus the original non-object
    # columns that take exactly two distinct values
    origin_bin_cols = [col for col in df_prev.columns
                       if df_prev[col].dtypes != 'O' and df_prev[col].nunique() == 2]
    all_cat_cols = new_cat_cols + origin_bin_cols

    # Previous app num features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean', 'median'],
        'AMT_APPLICATION': ['min', 'max', 'mean', 'median'],
        'AMT_CREDIT': ['min', 'max', 'mean', 'median'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean', 'median'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean', 'median'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean', 'median'],
        'DAYS_DECISION': ['min', 'max', 'mean', 'median'],
        'CNT_PAYMENT': ['min', 'max', 'mean', 'median'],
        'NEW_APP_CREDIT_RATIO': ['min', 'max', 'mean', 'median'],
        'NEW_ANNUITY_CREDIT_RATIO': ['min', 'max', 'mean', 'median'],
        'NEW_CREDIT_GOODS_RATIO': ['min', 'max', 'mean', 'median'],
        'NEW_AMT_INTEREST': ['min', 'max', 'mean', 'median'],
        'NEW_INTEREST_RATIO': ['min', 'max', 'mean', 'median'],
        'NEW_AMT_NEEDED_CREDIT_RATIO': ['min', 'max', 'mean', 'median'],
        'NEW_WEEKDAY_SIN': ['min', 'max', 'mean', 'median'],
        'NEW_WEEKDAY_COS': ['min', 'max', 'mean', 'median']}

    # Previous app cat features
    cat_aggregations = {cat: ['mean'] for cat in all_cat_cols}

    final_prev_df = new_df_prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    final_prev_df.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in final_prev_df.columns.tolist()])

    # Approved applications - only numerical features
    approved = new_df_prev[new_df_prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(
        ['PREV_APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    final_prev_df = final_prev_df.join(approved_agg, how='left', on='SK_ID_CURR')

    # Refused applications - only numerical features
    refused = new_df_prev[new_df_prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['PREV_REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    final_prev_df = final_prev_df.join(refused_agg, how='left', on='SK_ID_CURR')

    del refused, refused_agg, approved, approved_agg, new_df_prev
    gc.collect()
    return final_prev_df


#####################################
# POS_CASH_balance Data
#####################################
def pos_cash(num_rows=None, nan_as_category=True):
    """Build per-client features from POS_CASH_balance.csv.

    The file is one-hot encoded and grouped by ``SK_ID_CURR``. Result columns
    are named ``POS_<COLUMN>_<STAT>``; ``POS_COUNT`` holds the number of rows
    per client. ``num_rows`` is passed as ``nrows`` to ``pd.read_csv`` and
    ``nan_as_category`` is forwarded to ``one_hot_encoder``.
    """
    pos = pd.read_csv('/content/drive/MyDrive/Risk/POS_CASH_balance.csv', nrows=num_rows)
    pos, dummy_columns = one_hot_encoder(pos, nan_as_category)

    # Fixed numeric statistics plus the mean of every dummy column
    agg_spec = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    agg_spec.update({name: ['mean'] for name in dummy_columns})

    per_client = pos.groupby('SK_ID_CURR')
    pos_agg = per_client.agg(agg_spec)
    pos_agg.columns = pd.Index(
        ['POS_' + column + "_" + stat.upper() for column, stat in pos_agg.columns.tolist()])

    # Count pos cash accounts
    pos_agg['POS_COUNT'] = per_client.size()

    del per_client, pos
    gc.collect()
    return pos_agg


#####################################
# Installments_payments Data
#####################################
def installments_payments(num_rows=None, nan_as_category=True):
    """Build per-client features from installments_payments.csv.

    Adds payment ratio/difference and days-past-due / days-before-due columns,
    then groups by ``SK_ID_CURR``. Result columns are named
    ``INSTAL_<COLUMN>_<STAT>``; ``INSTAL_COUNT`` holds the number of rows per
    client.

    Parameters
    ----------
    num_rows : int or None
        Passed as ``nrows`` to ``pd.read_csv``; ``None`` reads every row.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Aggregated features indexed by ``SK_ID_CURR``.
    """
    ins = pd.read_csv('/content/drive/MyDrive/Risk/installments_payments.csv', nrows=num_rows)

    ins, cat_cols = one_hot_encoder(ins, nan_as_category)

    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due and days before due, floored at 0. np.where does this in
    # one vectorised pass instead of a Python call per row; as before, a NaN
    # difference fails the "> 0" test and therefore becomes 0.
    days_past_due = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_before_due = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = np.where(days_past_due > 0, days_past_due, 0)
    ins['DBD'] = np.where(days_before_due > 0, days_before_due, 0)

    # Aggregations for the numeric variables
    num_aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }

    # Aggregations for the categorical (one-hot) variables
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    # Apply all aggregations together and flatten the column names
    ins_agg = ins.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    ins_agg.columns = ['INSTAL_' + '_'.join(col).upper() for col in ins_agg.columns.values]

    # Count installment rows per client
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()

    del ins
    gc.collect()
    return ins_agg

#####################################
# Credit_card_balance Data
#####################################
def credit_card_balance(num_rows=None, nan_as_category=True):
    """Build per-client features from credit_card_balance.csv.

    The file is one-hot encoded, ``SK_ID_PREV`` is dropped and the rest is
    grouped by ``SK_ID_CURR``: min/max/mean/sum/var for every non-dummy
    column and the mean for every dummy column. Result columns are named
    ``CC_<COLUMN>_<STAT>``; ``CC_COUNT`` holds the number of rows per client.
    ``num_rows`` is passed as ``nrows`` to ``pd.read_csv`` and
    ``nan_as_category`` is forwarded to ``one_hot_encoder``.
    """
    cc = pd.read_csv('/content/drive/MyDrive/Risk/credit_card_balance.csv', nrows=num_rows)
    cc, dummy_columns = one_hot_encoder(cc, nan_as_category)
    cc = cc.drop(columns=['SK_ID_PREV'])

    # Everything that is neither the grouping key nor a dummy is numeric
    not_numeric = ['SK_ID_CURR'] + dummy_columns
    numeric_columns = [name for name in cc.columns if name not in not_numeric]

    # Full statistics for numeric columns, mean only for the dummies
    agg_spec = {name: ['min', 'max', 'mean', 'sum', 'var'] for name in numeric_columns}
    agg_spec.update({name: ['mean'] for name in dummy_columns})

    per_client = cc.groupby('SK_ID_CURR')
    cc_agg = per_client.agg(agg_spec)

    # Flatten (column, stat) pairs into CC_COLUMN_STAT names
    cc_agg.columns = ['CC_' + '_'.join(pair).upper() for pair in cc_agg.columns.values]

    # Count credit card rows per client
    cc_agg['CC_COUNT'] = per_client.size()

    del per_client, cc
    gc.collect()
    return cc_agg

import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

def optimize_xgb(trial, X_train, y_train, X_valid, y_valid):
    """Fit one XGBoost model with trial-sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.

    Returns:
        ROC AUC of the positive-class probabilities on the validation data.
    """
    # NOTE(review): 'gpu_hist' and 'use_label_encoder' may be deprecated in
    # the installed XGBoost release -- confirm before upgrading.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist',
        'use_label_encoder': False,
        'seed': 27,
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; parameter names and ranges are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5),
    }

    clf = XGBClassifier(**params)
    clf.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )
    preds = clf.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)

def hyperparameter_optimization(model_type, X, y, n_trials=5):
    """Search XGBoost hyperparameters with Optuna using 3-fold stratified CV.

    Args:
        model_type: Model family to tune; only ``'xgb'`` is supported here.
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Label Series aligned with ``X``.
        n_trials: Maximum number of Optuna trials.

    Returns:
        Parameter dict of the best trial (highest mean fold AUC).

    Raises:
        ValueError: If ``model_type`` is not ``'xgb'``.
    """
    # Reject unknown model types up front; previously they surfaced as an
    # unbound `auc` inside the objective.
    if model_type != 'xgb':
        raise ValueError(f"Unsupported model_type: {model_type!r}; expected 'xgb'")

    def objective(trial):
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=27)
        aucs = [
            optimize_xgb(
                trial,
                X.iloc[train_idx], y.iloc[train_idx],
                X.iloc[valid_idx], y.iloc[valid_idx],
            )
            for train_idx, valid_idx in skf.split(X, y)
        ]
        return np.mean(aucs)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials, timeout=600)
    print(f"Best trial for {model_type}: {study.best_trial.params}")
    return study.best_trial.params

import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

def optimize_xgb(trial, X_train, y_train, X_valid, y_valid):
    """Fit one XGBoost model with trial-sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.

    Returns:
        ROC AUC of the positive-class probabilities on the validation data.
    """
    # NOTE(review): 'gpu_hist' and 'use_label_encoder' may be deprecated in
    # the installed XGBoost release -- confirm before upgrading.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist',
        'use_label_encoder': False,
        'seed': 27,
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; parameter names and ranges are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5),
    }

    clf = XGBClassifier(**params)
    clf.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )
    preds = clf.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)

def optimize_lgbm(trial, X_train, y_train, X_valid, y_valid):
    """Fit one LightGBM model with trial-sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.

    Returns:
        ROC AUC of the positive-class probabilities on the validation data.
    """
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'seed': 27,
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; parameter names and ranges are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5),
    }

    clf = LGBMClassifier(**params)
    clf.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
    )
    preds = clf.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)

def optimize_cat(trial, X_train, y_train, X_valid, y_valid):
    """Fit one CatBoost model with trial-sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.

    Returns:
        ROC AUC of the positive-class probabilities on the validation data.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 500, 1500),
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; parameter names and ranges are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.5, 3.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_seed': 27,
        'logging_level': 'Silent',
    }

    clf = CatBoostClassifier(**params)
    clf.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
    )
    preds = clf.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)

def hyperparameter_optimization(model_type, X, y, n_trials=8):
    """Search hyperparameters with Optuna using 3-fold stratified CV.

    Args:
        model_type: One of ``'xgb'``, ``'lgbm'`` or ``'cat'``.
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Label Series aligned with ``X``.
        n_trials: Maximum number of Optuna trials.

    Returns:
        Parameter dict of the best trial (highest mean fold AUC).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Reject unknown model types up front; previously they surfaced as an
    # unbound (or stale) `auc` inside the objective.
    supported = ('xgb', 'lgbm', 'cat')
    if model_type not in supported:
        raise ValueError(
            f"Unsupported model_type: {model_type!r}; expected one of {supported}"
        )
    fold_scorer = {
        'xgb': optimize_xgb,
        'lgbm': optimize_lgbm,
        'cat': optimize_cat,
    }[model_type]

    def objective(trial):
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=27)
        aucs = [
            fold_scorer(
                trial,
                X.iloc[train_idx], y.iloc[train_idx],
                X.iloc[valid_idx], y.iloc[valid_idx],
            )
            for train_idx, valid_idx in skf.split(X, y)
        ]
        return np.mean(aucs)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials, timeout=600)
    print(f"Best trial for {model_type}: {study.best_trial.params}")
    return study.best_trial.params

def train_models(train_df, test_df, feats, n_trials=1, submission_path=None):
    """Tune, cross-validate and blend XGBoost, LightGBM and CatBoost models.

    Each model family is tuned with ``hyperparameter_optimization``, then
    refit on every fold of a 5-fold stratified split. Out-of-fold and test
    predictions of the three models are averaged with equal weight, the
    blended test predictions are written as a submission CSV, feature
    importances are plotted via ``display_importances`` and, when ``shap``
    is installed, a SHAP summary plot is saved.

    Args:
        train_df: Labelled rows; must contain ``feats`` and ``TARGET``.
        test_df: Unlabelled rows; must contain ``feats`` and ``SK_ID_CURR``.
            A ``TARGET`` column holding the blended prediction is assigned.
        feats: Names of the feature columns.
        n_trials: Optuna trials per model family.
        submission_path: Where to write the submission CSV. Defaults to the
            module-level ``submission_file_name`` when it is defined,
            otherwise ``'submission.csv'``.

    Returns:
        DataFrame with ``feature``, ``importance`` and ``fold`` columns, one
        row per feature per model per fold.
    """
    # The global only exists when the file runs as a script; fall back to a
    # default so an imported call does not fail after training has finished.
    if submission_path is None:
        submission_path = globals().get('submission_file_name', 'submission.csv')

    X_all, y_all = train_df[feats], train_df['TARGET']
    X_test = test_df[feats]

    # Hyperparameter optimisation
    print("Starting hyperparameter optimization for XGBoost...")
    best_params_xgb = hyperparameter_optimization('xgb', X_all, y_all, n_trials)

    print("Starting hyperparameter optimization for LightGBM...")
    best_params_lgbm = hyperparameter_optimization('lgbm', X_all, y_all, n_trials)

    print("Starting hyperparameter optimization for CatBoost...")
    best_params_cat = hyperparameter_optimization('cat', X_all, y_all, n_trials)

    # Model definitions
    clf_xgb = XGBClassifier(**best_params_xgb, objective='binary:logistic', eval_metric='auc', tree_method='gpu_hist', use_label_encoder=False, seed=27, n_jobs=-1)
    clf_lgbm = LGBMClassifier(**best_params_lgbm, objective='binary', metric='auc', seed=27, n_jobs=-1)
    clf_cat = CatBoostClassifier(**best_params_cat, task_type='GPU', verbose=False, random_seed=27)

    # (key, estimator, importance getter) in the order the models are fitted
    models = [
        ('xgb', clf_xgb, lambda m: m.feature_importances_),
        ('lgbm', clf_lgbm, lambda m: m.feature_importances_),
        ('cat', clf_cat, lambda m: m.get_feature_importance()),
    ]
    oof_preds = {key: np.zeros(train_df.shape[0]) for key, _, _ in models}
    sub_preds = {key: np.zeros(test_df.shape[0]) for key, _, _ in models}
    importance_frames = []

    # Cross-validation setup
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=27)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_all, y_all)):
        print(f"Training fold {n_fold + 1}")
        train_x, train_y = X_all.iloc[train_idx], y_all.iloc[train_idx]
        valid_x, valid_y = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

        for key, clf, importance_of in models:
            clf.fit(
                train_x, train_y,
                eval_set=[(valid_x, valid_y)],
            )
            oof_preds[key][valid_idx] = clf.predict_proba(valid_x)[:, 1]
            # Test predictions are averaged over the folds
            sub_preds[key] += clf.predict_proba(X_test)[:, 1] / folds.n_splits

            importance_frames.append(pd.DataFrame({
                "feature": feats,
                "importance": importance_of(clf),
                "fold": n_fold + 1,
            }))

        # Per-fold AUC scores
        auc_xgb = roc_auc_score(valid_y, oof_preds['xgb'][valid_idx])
        auc_lgbm = roc_auc_score(valid_y, oof_preds['lgbm'][valid_idx])
        auc_cat = roc_auc_score(valid_y, oof_preds['cat'][valid_idx])
        print(f'Fold {n_fold + 1} AUC XGB: {auc_xgb:.6f}, LGBM: {auc_lgbm:.6f}, CatBoost: {auc_cat:.6f}')

        # Free fold data before the next iteration
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    # One concat at the end instead of growing a frame on every fit
    feature_importance_df = pd.concat(importance_frames, axis=0)

    # Equal-weight blend of the out-of-fold predictions
    blended_oof = (oof_preds['xgb'] + oof_preds['lgbm'] + oof_preds['cat']) / 3
    full_auc = roc_auc_score(y_all, blended_oof)
    print(f'Full AUC score: {full_auc:.6f}')

    # Equal-weight blend of the test predictions
    test_df['TARGET'] = (sub_preds['xgb'] + sub_preds['lgbm'] + sub_preds['cat']) / 3

    # Save the predictions
    test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_path, index=False)
    print("Submission file saved.")

    # Show feature importances
    display_importances(feature_importance_df)

    # Optional SHAP explainability: skipped when shap is not installed so the
    # trained results are still returned.
    try:
        import shap
    except ImportError:
        print("shap is not installed; skipping SHAP summary plot.")
    else:
        import matplotlib.pyplot as plt

        # clf_xgb holds the XGBoost model fitted on the last fold
        explainer = shap.Explainer(clf_xgb)
        shap_values = explainer(X_all)
        shap.summary_plot(shap_values, X_all, show=False)
        plt.savefig('shap_summary_plot.png')
        plt.close()

    return feature_importance_df

def clean_feature_names(df):
    """Sanitise column names in place and return the same DataFrame.

    Every run of characters other than ASCII letters, digits and underscore
    is collapsed into a single underscore, removing special characters that
    LightGBM does not support in feature names.
    """
    sanitized = df.columns.str.replace(pat=r'[^A-Za-z0-9_]+', repl='_', regex=True)
    df.columns = sanitized
    return df

#####################################
# Main Function
#####################################
def main(debug=False):
    """Build the merged feature table and run the ensemble training.

    The application data is loaded first; the bureau, previous application,
    POS-CASH, installment and credit card aggregates are then left-joined on
    ``SK_ID_CURR``. Rows with a known ``TARGET`` become the training set and
    the rest the test set before ``train_models`` is called.

    Args:
        debug: When True every loader is limited to 500000 rows.
    """
    num_rows = 500000 if debug else None
    with timer("Veri Yükleme ve Ön İşleme"):
        df = application_train_test(num_rows)

    with timer("Process bureau and bureau_balance"):
        bureau = bureau_and_balance(num_rows)
        print("Bureau df shape:", bureau.shape)
        df = df.join(bureau, how='left', on='SK_ID_CURR')
        del bureau
        gc.collect()

    with timer("Process previous_applications"):
        prev = previous_app(num_rows)
        print("Previous applications df shape:", prev.shape)
        df = df.join(prev, how='left', on='SK_ID_CURR')
        del prev
        gc.collect()

    with timer("Process POS-CASH balance"):
        pos = pos_cash(num_rows)
        print("Pos-cash balance df shape:", pos.shape)
        df = df.join(pos, how='left', on='SK_ID_CURR')
        del pos
        gc.collect()

    with timer("Process installments payments"):
        ins = installments_payments(num_rows)
        print("Installments payments df shape:", ins.shape)
        df = df.join(ins, how='left', on='SK_ID_CURR')
        del ins
        gc.collect()

    with timer("Process credit card balance"):
        cc = credit_card_balance(num_rows)
        print("Credit card balance df shape:", cc.shape)
        df = df.join(cc, how='left', on='SK_ID_CURR')
        # Exporting combined_df to investigate features
        df.to_csv('combined_df.csv')
        del cc
        gc.collect()

    # Clean the feature names
    df = clean_feature_names(df)

    # Prepare for model training
    with timer("Prepare data for modeling"):
        # .copy() makes both frames independent of df, so the in-place
        # clean-up below is applied to real frames rather than to slices.
        labelled = df['TARGET'].notnull()
        train_df = df[labelled].copy()
        test_df = df[~labelled].copy()
        feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

        del df
        gc.collect()

        # Turn +/-inf into NaN so it is imputed together with missing values
        train_df.replace([np.inf, -np.inf], np.nan, inplace=True)
        test_df.replace([np.inf, -np.inf], np.nan, inplace=True)

        # Fill NaN with each frame's own column medians; numeric_only keeps
        # median() from failing if a non-numeric column is present.
        train_df.fillna(train_df.median(numeric_only=True), inplace=True)
        test_df.fillna(test_df.median(numeric_only=True), inplace=True)

    with timer("Run Ensemble Models"):
        feature_importance_df = train_models(train_df, test_df, feats, n_trials=5)

    print("Model training completed.")

if __name__ == "__main__":
    submission_file_name = "/content/drive/MyDrive/Risk/submission_DSMLBC4_Grp2.csv"
    with timer("Full model run"):
        main(debug=True)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



FileNotFoundError: [Errno 2] No such file or directory: 'give_me_some_credit.csv'

In [None]:
# dolandiricilik_tespiti.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

import kagglehub

# Download the latest version of the credit card fraud dataset when this
# module is executed. `data` holds the value returned by
# kagglehub.dataset_download, which is printed below as the dataset path.
data = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

print("Path to dataset files:", data)

def load_data():
    """Download the credit card fraud dataset and load it as a DataFrame.

    Returns:
        DataFrame read from the first CSV file (in sorted path order) found
        under the downloaded dataset directory.

    Raises:
        FileNotFoundError: If the download contains no CSV file.
    """
    import glob
    import os

    dataset_dir = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
    print("Path to dataset files:", dataset_dir)

    # The download call only yields a directory path; the CSV inside must be
    # read explicitly (the function previously returned None).
    csv_files = sorted(glob.glob(os.path.join(dataset_dir, '**', '*.csv'), recursive=True))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {dataset_dir}")
    return pd.read_csv(csv_files[0])

def preprocess_data(data):
    """Split the transactions into features and the ``Class`` label.

    Missing values are replaced with 0 in place on ``data``.

    Returns:
        Tuple ``(X, y)``: all columns except ``Class``, and the ``Class``
        column.
    """
    data.fillna(0, inplace=True)
    labels = data['Class']
    features = data.drop(columns='Class')
    return features, labels

def scale_data(X):
    """Standardise ``X`` and return ``(scaled array, fitted StandardScaler)``."""
    fitted = StandardScaler().fit(X)
    return fitted.transform(X), fitted

def build_autoencoder(input_dim):
    """Build and compile a dense autoencoder for ``input_dim`` features.

    The layer widths are 32-16-8-16-32 with ReLU activations, followed by a
    linear output layer of width ``input_dim``. The model is compiled with
    the Adam optimiser and mean squared error loss.
    """
    layers = [Dense(32, activation='relu', input_dim=input_dim)]
    layers.extend(Dense(width, activation='relu') for width in (16, 8, 16, 32))
    layers.append(Dense(input_dim, activation='linear'))

    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(optimizer='adam', loss='mse')
    return model

def main():
    """Train the fraud autoencoder, evaluate it and persist the artefacts.

    The autoencoder is fitted on non-fraud rows only. Every row is then
    scored by its mean squared reconstruction error, and a row is flagged as
    fraud when its error exceeds the 99th percentile of the non-fraud
    errors. The model and the scaler are saved under ``models/``.
    """
    import os

    data = load_data()
    X, y = preprocess_data(data)
    X_scaled, scaler = scale_data(X)
    # Train on normal transactions only
    X_train = X_scaled[y == 0]
    X_test = X_scaled
    y_test = y
    autoencoder = build_autoencoder(X_train.shape[1])
    autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True, validation_split=0.1)
    reconstructions = autoencoder.predict(X_test)
    mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)
    error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_test})
    threshold = error_df[error_df['true_class'] == 0]['reconstruction_error'].quantile(0.99)
    y_pred = (mse > threshold).astype(int)
    # ROC AUC is a ranking metric: score it on the continuous reconstruction
    # error rather than on the thresholded 0/1 labels.
    print("ROC AUC Skoru:", roc_auc_score(y_test, mse))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d')
    plt.show()
    # Make sure the output directory exists before saving into it
    os.makedirs('models', exist_ok=True)
    autoencoder.save('models/fraud_detection_autoencoder.h5')
    joblib.dump(scaler, 'models/fraud_scaler.pkl')

if __name__ == "__main__":
    main()


Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...


100%|██████████| 66.0M/66.0M [00:03<00:00, 17.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3
Path to dataset files: /root/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3


AttributeError: 'NoneType' object has no attribute 'fillna'

In [None]:
# musteri_segmentasyonu.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import plotly.express as px
import joblib

def load_data():
    """Read ``bank_marketing.csv`` from the working directory."""
    return pd.read_csv('bank_marketing.csv')

def preprocess_data(data):
    """Forward-fill missing values and one-hot encode the categorical columns.

    ``data`` is forward-filled in place; the encoded result is returned as a
    new DataFrame.

    Args:
        data: DataFrame containing the categorical columns listed below.

    Returns:
        DataFrame with the categorical columns replaced by dummy columns.
    """
    # DataFrame.ffill replaces the deprecated fillna(method='ffill')
    data.ffill(inplace=True)
    categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
    return pd.get_dummies(data, columns=categorical_cols)

def scale_data(data):
    """Standardise ``data`` and return ``(scaled array, fitted StandardScaler)``."""
    fitted = StandardScaler().fit(data)
    return fitted.transform(data), fitted

def reduce_dimensions(data_scaled):
    """Project ``data_scaled`` onto 3 principal components.

    Returns:
        Tuple ``(projected data, fitted PCA)``.
    """
    reducer = PCA(n_components=3)
    projected = reducer.fit_transform(data_scaled)
    return projected, reducer

def cluster_data(data_pca):
    """Cluster ``data_pca`` into 4 groups with agglomerative clustering.

    Returns:
        Tuple ``(cluster labels, fitted model)``.
    """
    clusterer = AgglomerativeClustering(n_clusters=4)
    clusterer.fit(data_pca)
    return clusterer.labels_, clusterer

def visualize_clusters(data_pca, clusters):
    """Show a 3D scatter of the first three columns, coloured by cluster."""
    xs, ys, zs = data_pca[:, 0], data_pca[:, 1], data_pca[:, 2]
    # Labels are cast to str before being passed as the colour argument
    labels = clusters.astype(str)
    figure = px.scatter_3d(x=xs, y=ys, z=zs, color=labels)
    figure.show()

def save_models(scaler, pca, model):
    """Persist the segmentation scaler, PCA and clustering model.

    The three objects are written with joblib under ``models/``, which is
    created first if it does not exist.
    """
    import os

    # joblib.dump does not create missing parent directories
    os.makedirs('models', exist_ok=True)
    joblib.dump(scaler, 'models/segment_scaler.pkl')
    joblib.dump(pca, 'models/segment_pca.pkl')
    joblib.dump(model, 'models/segment_model.pkl')

def main():
    """Run the customer segmentation pipeline end to end.

    Loads the data, preprocesses, scales and reduces it, clusters it, prints
    the silhouette score, shows the clusters, writes the labelled rows to
    ``customer_segments.csv`` and saves the fitted models.
    """
    raw = load_data()
    encoded = preprocess_data(raw)
    scaled, scaler = scale_data(encoded)
    projected, pca = reduce_dimensions(scaled)
    labels, model = cluster_data(projected)

    # The segment label is attached to the loaded frame, not the encoded one
    raw['segment'] = labels
    print("Silhouette Score:", silhouette_score(scaled, labels))
    visualize_clusters(projected, labels)
    raw.to_csv('customer_segments.csv', index=False)
    save_models(scaler, pca, model)

if __name__ == "__main__":
    main()


In [None]:
# gelir_tahmini.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import optuna
import joblib
import kagglehub

def load_data():
    """Download the income prediction dataset and load it as a DataFrame.

    Returns:
        DataFrame read from the first CSV file (in sorted path order) found
        under the downloaded dataset directory.

    Raises:
        FileNotFoundError: If the download contains no CSV file.
    """
    import glob
    import os

    # Download latest version
    path = kagglehub.dataset_download("kamaumunyori/income-prediction-dataset-us-20th-century-data")

    print("Path to dataset files:", path)

    # The download call only yields a directory path; the CSV inside must be
    # read explicitly (the function previously returned None).
    # NOTE(review): if the dataset ships several CSV files, confirm that the
    # first one in sorted order is the intended table.
    csv_files = sorted(glob.glob(os.path.join(path, '**', '*.csv'), recursive=True))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {path}")
    return pd.read_csv(csv_files[0])

def preprocess_data(data):
    """Forward-fill, one-hot encode and split into features and target.

    ``data`` is forward-filled in place. All object-dtype columns are one-hot
    encoded, then the ``income`` column is separated as the target.

    Args:
        data: DataFrame containing an ``income`` column.

    Returns:
        Tuple ``(X, y)``: the encoded features without ``income``, and the
        ``income`` column.
    """
    # DataFrame.ffill replaces the deprecated fillna(method='ffill')
    data.ffill(inplace=True)
    categorical_cols = data.select_dtypes(include=['object']).columns
    data = pd.get_dummies(data, columns=categorical_cols)
    X = data.drop('income', axis=1)
    y = data['income']
    return X, y

def scale_data(X):
    """Standardise ``X`` and return ``(scaled array, fitted StandardScaler)``."""
    fitted = StandardScaler().fit(X)
    return fitted.transform(X), fitted

def build_model(trial, input_dim):
    """Build and compile a dense regression network sampled from ``trial``.

    The trial chooses 1-5 hidden layers; each gets 32-256 ReLU units followed
    by dropout with a rate between 0.0 and 0.5. The output is a single linear
    unit, and the model is compiled with Adam and mean squared error.

    Args:
        trial: Optuna trial (or FixedTrial) supplying the hyperparameters.
        input_dim: Accepted for the callers' signature; not used in the body.
    """
    network = Sequential()
    depth = trial.suggest_int('n_layers', 1, 5)
    for layer_idx in range(depth):
        width = trial.suggest_int(f'n_units_l{layer_idx}', 32, 256)
        network.add(Dense(width, activation='relu'))
        drop_rate = trial.suggest_float(f'dropout_l{layer_idx}', 0.0, 0.5)
        network.add(Dropout(drop_rate))
    network.add(Dense(1, activation='linear'))
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

def optimize_model(X_train, y_train, input_dim):
    """Run a 20-trial Optuna search over the network architecture.

    Each trial trains a model from ``build_model`` with early stopping on a
    10% validation split and is scored by the validation loss of its last
    epoch.

    Returns:
        Best hyperparameter dict found by the study.
    """
    def objective(trial):
        candidate = build_model(trial, input_dim)
        stopper = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=5)
        fit_log = candidate.fit(
            X_train, y_train,
            validation_split=0.1,
            callbacks=[stopper],
            epochs=50,
            batch_size=32,
            verbose=0,
        )
        # Score by the validation loss of the final epoch that ran
        return fit_log.history['val_loss'][-1]

    search = optuna.create_study(direction='minimize')
    search.optimize(objective, n_trials=20)
    return search.best_params

def train_model(X_train, y_train, params, input_dim):
    """Train the final network using the fixed hyperparameters ``params``.

    ``params`` is replayed through an Optuna ``FixedTrial`` so that
    ``build_model`` rebuilds the architecture it describes. Training uses
    early stopping on a 10% validation split for up to 100 epochs.
    """
    replay = optuna.trial.FixedTrial(params)
    network = build_model(replay, input_dim)
    stopper = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
    network.fit(
        X_train, y_train,
        validation_split=0.1,
        callbacks=[stopper],
        epochs=100,
        batch_size=32,
        verbose=1,
    )
    return network

def evaluate_model(model, X_test, y_test):
    """Print the mean absolute error of ``model`` on the test data."""
    predictions = model.predict(X_test)
    print("Mean Absolute Error:", mean_absolute_error(y_test, predictions))

def main():
    """Run the income prediction pipeline end to end.

    Loads and preprocesses the data, scales the features, holds out 20% for
    testing, tunes and trains the network, reports the test error, and saves
    the model and the scaler under ``models/``.
    """
    import os

    data = load_data()
    X, y = preprocess_data(data)
    X_scaled, scaler = scale_data(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    input_dim = X_train.shape[1]
    best_params = optimize_model(X_train, y_train, input_dim)
    model = train_model(X_train, y_train, best_params, input_dim)
    evaluate_model(model, X_test, y_test)
    # Make sure the output directory exists before saving into it
    os.makedirs('models', exist_ok=True)
    model.save('models/income_prediction_model.h5')
    joblib.dump(scaler, 'models/income_scaler.pkl')

if __name__ == "__main__":
    main()


Downloading from https://www.kaggle.com/api/v1/datasets/download/kamaumunyori/income-prediction-dataset-us-20th-century-data?dataset_version_number=1...


100%|██████████| 9.02M/9.02M [00:01<00:00, 5.86MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/kamaumunyori/income-prediction-dataset-us-20th-century-data/versions/1


AttributeError: 'NoneType' object has no attribute 'fillna'

In [None]:
# finansal_saglik_skoru.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import kagglehub
def load_data():
    """Download the personal loan dataset and load it as a DataFrame.

    Returns:
        DataFrame read from the first CSV file (in sorted path order) found
        under the downloaded dataset directory.

    Raises:
        FileNotFoundError: If the download contains no CSV file.
    """
    import glob
    import os

    # Download latest version
    dataset_dir = kagglehub.dataset_download("teertha/personal-loan-modeling")

    print("Path to dataset files:", dataset_dir)

    # The download call only yields a directory path; the CSV inside must be
    # read explicitly (the function previously returned None).
    # NOTE(review): confirm the dataset actually ships a CSV file and that
    # the first one in sorted order is the intended table.
    csv_files = sorted(glob.glob(os.path.join(dataset_dir, '**', '*.csv'), recursive=True))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {dataset_dir}")
    return pd.read_csv(csv_files[0])

def preprocess_data(data):
    """Forward-fill missing values in place and return the same DataFrame.

    Values missing before the first valid entry of a column stay missing.
    """
    # DataFrame.ffill replaces the deprecated fillna(method='ffill')
    data.ffill(inplace=True)
    return data

def calculate_scores(data):
    """Add ratio features and a 0-100 ``FinancialHealthScore`` to ``data``.

    Four indicators are derived, min-max normalised to [0, 1] and combined
    with fixed weights. Debt-to-income ratio and credit utilisation count
    against the score (lower is healthier); payment history and length of
    credit history count towards it.

    Args:
        data: DataFrame with ``TotalDebt``, ``AnnualIncome``,
            ``CurrentBalance``, ``CreditLimit``, ``OnTimePayments``,
            ``TotalPayments`` and ``CreditHistoryYears`` columns.

    Returns:
        The same DataFrame with the indicator and score columns added.
    """
    data['DebtIncomeRatio'] = data['TotalDebt'] / data['AnnualIncome']
    data['CreditUtilization'] = data['CurrentBalance'] / data['CreditLimit']
    data['PaymentHistoryScore'] = data['OnTimePayments'] / data['TotalPayments']
    data['LengthOfCreditHistory'] = data['CreditHistoryYears']

    features = ['DebtIncomeRatio', 'CreditUtilization', 'PaymentHistoryScore', 'LengthOfCreditHistory']
    # A zero denominator yields +/-inf, which MinMaxScaler rejects; treat
    # those ratios as missing instead.
    data[features] = data[features].replace([np.inf, -np.inf], np.nan)

    # Normalise the indicators
    scaler = MinMaxScaler()
    data[features] = scaler.fit_transform(data[features])

    # Weighted score. The two debt indicators are inverted so that a higher
    # debt load or utilisation lowers the score instead of raising it.
    weights = {'DebtIncomeRatio': 0.3, 'CreditUtilization': 0.3, 'PaymentHistoryScore': 0.3, 'LengthOfCreditHistory': 0.1}
    data['FinancialHealthScore'] = (
        (1 - data['DebtIncomeRatio']) * weights['DebtIncomeRatio']
        + (1 - data['CreditUtilization']) * weights['CreditUtilization']
        + data['PaymentHistoryScore'] * weights['PaymentHistoryScore']
        + data['LengthOfCreditHistory'] * weights['LengthOfCreditHistory']
    ) * 100
    return data

def main():
    """Load, preprocess and score the data, then write the scores to CSV."""
    scored = calculate_scores(preprocess_data(load_data()))
    scored.to_csv('financial_health_scores.csv', index=False)
    print("Finansal Sağlık Skoru Hesaplandı ve Kaydedildi.")

if __name__ == "__main__":
    main()


Path to dataset files: /root/.cache/kagglehub/datasets/teertha/personal-loan-modeling/versions/1


TypeError: 'NoneType' object is not subscriptable

In [None]:
# crm_entegrasyonu.py

import pandas as pd
import numpy as np

def load_data():
    """Read the CRM export and the customer segments from the working directory.

    Returns:
        Tuple ``(crm_data, segments)`` of DataFrames.
    """
    return pd.read_csv('crm_data.csv'), pd.read_csv('customer_segments.csv')

def merge_data(crm_data, segments):
    """Attach each customer's segment to the CRM rows.

    Left join on ``customer_id``: CRM rows without a matching segment are
    kept and get a missing ``segment`` value.
    """
    segment_lookup = segments[['customer_id', 'segment']]
    return pd.merge(crm_data, segment_lookup, on='customer_id', how='left')

def analyze_behavior(data):
    """Summarise purchasing behaviour per segment and save it to CSV.

    Computes, per ``segment``, the mean and sum of ``purchase_amount`` and
    the means of ``interaction_count`` and ``customer_lifetime_value``. The
    result is written to ``segment_behavior.csv``.

    Returns:
        DataFrame with one row per segment and flat ``<column>_<statistic>``
        column names.
    """
    per_segment = data.groupby('segment')
    behavior = per_segment.agg(
        purchase_amount_mean=('purchase_amount', 'mean'),
        purchase_amount_sum=('purchase_amount', 'sum'),
        interaction_count_mean=('interaction_count', 'mean'),
        customer_lifetime_value_mean=('customer_lifetime_value', 'mean'),
    )
    behavior = behavior.reset_index()
    behavior.to_csv('segment_behavior.csv', index=False)
    return behavior

def main():
    """Merge the CRM data with the segments and save the behaviour summary."""
    crm_rows, segment_rows = load_data()
    # analyze_behavior writes the CSV itself; its return value is not needed
    analyze_behavior(merge_data(crm_rows, segment_rows))
    print("Segment Davranış Analizi Tamamlandı ve Kaydedildi.")

if __name__ == "__main__":
    main()


In [None]:
# raporlama_dashboard.py

import pandas as pd
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

def load_data():
    """Read ``financial_analytics.csv`` from the working directory."""
    return pd.read_csv('financial_analytics.csv')

def create_dashboard(financial_data):
    """Build and serve a Dash app plotting one metric of ``financial_data``.

    A dropdown selects ``income``, ``assets`` or ``liabilities``; the chosen
    column is drawn as a line chart against ``month``. The call starts the
    Dash server in debug mode.

    Args:
        financial_data: DataFrame with a ``month`` column and the three
            metric columns.
    """
    app = dash.Dash(__name__)

    app.layout = html.Div(children=[
        html.H1(children='Finansal Analitik Dashboard'),
        dcc.Dropdown(
            id='metric-dropdown',
            options=[
                {'label': 'Gelir', 'value': 'income'},
                {'label': 'Varlıklar', 'value': 'assets'},
                {'label': 'Yükümlülükler', 'value': 'liabilities'}
            ],
            value='income'
        ),
        dcc.Graph(id='metric-graph')
    ])

    @app.callback(
        Output('metric-graph', 'figure'),
        [Input('metric-dropdown', 'value')]
    )
    def update_graph(selected_metric):
        # Redraw the chart whenever the dropdown selection changes
        fig = px.line(financial_data, x='month', y=selected_metric, title=f'Aylık {selected_metric.capitalize()} Trendleri')
        return fig

    # Prefer app.run; run_server is obsolete in current Dash releases. Fall
    # back to run_server only on Dash versions that have no app.run.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)

def main():
    """Load the financial analytics data and serve the dashboard."""
    create_dashboard(load_data())

if __name__ == "__main__":
    main()


In [None]:
# all_in_banker_main.py

def main():
    """Run the ``main`` of every ALL-IN-BANKER module, in pipeline order."""
    print("ALL-IN-BANKER Platformu Başlatılıyor...\n")
    pipeline = (
        'kredi_skorlamasi',
        'dolandiricilik_tespiti',
        'musteri_segmentasyonu',
        'gelir_tahmini',
        'finansal_saglik_skoru',
        'crm_entegrasyonu',
        'raporlama_dashboard',
    )
    for module_name in pipeline:
        # Each module is imported only right before its own main() runs
        __import__(module_name).main()
    print("\nALL-IN-BANKER Platformu Başarıyla Çalıştırıldı.")

if __name__ == "__main__":
    main()
