In [2]:
# Core Libraries
import numpy as np
import pandas as pd
import time
import warnings
import re

# Data Splitting and Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler

# Models
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

# Metrics for Model Evaluation
from sklearn.metrics import classification_report

# Utilities
from rapidfuzz import process, fuzz
from sklearn.base import BaseEstimator, TransformerMixin

# Filter Warnings
warnings.filterwarnings("ignore")

In [3]:
train=pd.read_csv('../data/train.csv')
test=pd.read_csv('../data/test.csv', dtype={'start_dt': 'object'})

In [4]:
pd.set_option('display.max_columns', None) #отображение всех столбцов

In [5]:
train.head()

Unnamed: 0,monthly_income,work_experience,requested_sum,main_agreement_amount,main_agreement_term,requested_period_days,requested_amount,req_app_amount,approved_amount,period_days,days_finish_loan,ag,cnt_ext,term,price,elecs_sum,recurents_sum,tamount,issues,principal,interest,overdue_interest,overdue_fee,nbki_score,payment_frequency,status,loan_id,client_id,source,first_source,interface,type,repayment_type,client_type,settlement,client_type.1,region,gender,loan_order,have_extension,contact_cases,created_at,closed_at,start_dt,churn
0,-0.000735,0.952894,,-0.388291,-0.340659,-0.389028,-1.05353,-0.629296,-0.61762,-0.340659,-0.539615,-0.619312,,,,-0.063933,-0.143015,-0.434972,0.347703,-0.347703,-0.502878,-0.218265,-0.185707,1.837622,2,1,464867,111035,11,0,0,2,3,1,г Москва,repeated,г Москва,male,2,0,,2022-12-23 17:50:25,2022-12-31 16:09:23,,0
1,-0.000742,0.952894,,-0.388291,-1.145865,-1.294731,-0.436189,-0.02014,-0.61762,-1.145865,-0.539615,1.35433,,,,-0.063933,-0.143015,-0.434972,0.347703,-0.347703,-0.502878,-0.218265,-0.185707,2.2626,2,1,4569010,604426,11,0,2,1,0,1,г Москва,repeated,г Москва,female,14,0,,2024-06-29 22:26:08,2024-07-27 16:21:49,,0
2,-0.000742,0.952894,,-0.388291,-0.685747,-1.11359,-0.259806,0.153905,-0.61762,-0.685747,-0.502706,1.35433,,,,-0.063933,-0.143015,-0.429509,0.347703,-0.347703,-0.484111,-0.218265,-0.185707,2.2626,2,1,985489,334703,11,4,2,2,3,1,г Пермь,repeated,край Пермский,male,10,0,,2023-04-10 21:06:38,2023-05-10 07:51:18,,0
3,-0.000742,0.952894,,-0.388291,0.176974,-1.385301,1.239449,1.633284,-0.61762,0.176974,-0.465796,1.35433,,,,-0.063933,-0.143015,-0.413118,0.347703,-0.347703,-0.427809,-0.218265,-0.185707,2.2626,2,6,2247534,362893,11,0,2,2,1,1,г Самара,repeated,обл Самарская,female,2,1,,2023-10-01 00:13:16,2024-01-12 17:40:14,2024-01-12 17:29:21,0
4,-0.000742,0.952894,,-0.388291,-1.145865,-1.657011,0.357534,0.763061,-0.61762,-1.145865,-0.133607,1.35433,1.407752,-0.612324,-0.705103,-0.063933,-0.143015,-0.320236,0.347703,-0.347703,-0.108766,-0.218265,-0.185707,2.2626,2,1,4695006,537669,11,11,2,1,2,1,г Москва,repeated,г Москва,female,17,0,,2024-07-15 10:40:35,2024-07-18 12:20:08,,0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4036207 entries, 0 to 4036206
Data columns (total 45 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   monthly_income         float64
 1   work_experience        float64
 2   requested_sum          float64
 3   main_agreement_amount  float64
 4   main_agreement_term    float64
 5   requested_period_days  float64
 6   requested_amount       float64
 7   req_app_amount         float64
 8   approved_amount        float64
 9   period_days            float64
 10  days_finish_loan       float64
 11  ag                     float64
 12  cnt_ext                float64
 13  term                   float64
 14  price                  float64
 15  elecs_sum              float64
 16  recurents_sum          float64
 17  tamount                float64
 18  issues                 float64
 19  principal              float64
 20  interest               float64
 21  overdue_interest       float64
 22  overdue_fee       

In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1348743 entries, 0 to 1348742
Data columns (total 43 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   monthly_income         1348680 non-null  float64
 1   work_experience        463225 non-null   float64
 2   requested_sum          163790 non-null   float64
 3   main_agreement_amount  1348743 non-null  float64
 4   main_agreement_term    1348743 non-null  float64
 5   requested_period_days  1200882 non-null  float64
 6   requested_amount       1209772 non-null  float64
 7   req_app_amount         1209772 non-null  float64
 8   approved_amount        1348743 non-null  float64
 9   period_days            1348743 non-null  float64
 10  days_finish_loan       1348743 non-null  float64
 11  ag                     1348743 non-null  float64
 12  cnt_ext                151701 non-null   float64
 13  term                   151701 non-null   float64
 14  price             

In [8]:
optimized_types = {
    "monthly_income": np.float32,
    "work_experience": np.float32,
    "requested_sum": np.float32,
    "main_agreement_amount": np.float32,
    "main_agreement_term": np.float32,
    "requested_period_days": np.float32,
    "requested_amount": np.float32,
    "req_app_amount": np.float32,
    "approved_amount": np.float32,
    "period_days": np.float32,
    "days_finish_loan": np.float32,
    "ag": np.float32,
    "cnt_ext": np.float32,
    "term": np.float32,
    "price": np.float32,
    "elecs_sum": np.float32,
    "recurents_sum": np.float32,
    "tamount": np.float32,
    "issues": np.float32,
    "principal": np.float32,
    "interest": np.float32,
    "overdue_interest": np.float32,
    "overdue_fee": np.float32,
    "nbki_score": np.float32,
    "contact_cases": np.float32,
    "payment_frequency": np.int32,
    "status": np.int32,
    "loan_id": np.int32,
    "client_id": np.int32,
    "source": np.int32,
    "first_source": np.int32,
    "interface": np.int32,
    "type": np.int32,
    "repayment_type": np.int32,
    "client_type": np.int32,
    "loan_order": np.int32,
    "have_extension": np.int32,
    "churn": np.int32,
    "settlement": "category",
    "client_type.1": "category",
    "region": "category",
    "gender": "category",
    "created_at": "datetime64[ns]",
    "closed_at": "datetime64[ns]",
    "start_dt": "datetime64[ns]"
}
train = train.astype(optimized_types)

optimized_types_test = {col: dtype for col, dtype in optimized_types.items() if col in test.columns}
test = test.astype(optimized_types_test)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4036207 entries, 0 to 4036206
Data columns (total 45 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   monthly_income         float32       
 1   work_experience        float32       
 2   requested_sum          float32       
 3   main_agreement_amount  float32       
 4   main_agreement_term    float32       
 5   requested_period_days  float32       
 6   requested_amount       float32       
 7   req_app_amount         float32       
 8   approved_amount        float32       
 9   period_days            float32       
 10  days_finish_loan       float32       
 11  ag                     float32       
 12  cnt_ext                float32       
 13  term                   float32       
 14  price                  float32       
 15  elecs_sum              float32       
 16  recurents_sum          float32       
 17  tamount                float32       
 18  issues                

In [10]:
train.isnull().sum()

monthly_income               207
work_experience          2648360
requested_sum            3546390
main_agreement_amount          0
main_agreement_term            0
requested_period_days     441618
requested_amount          414836
req_app_amount            414836
approved_amount                0
period_days                    0
days_finish_loan               0
ag                             0
cnt_ext                  3582556
term                     3582556
price                    3582816
elecs_sum                   5794
recurents_sum               5794
tamount                     1774
issues                         0
principal                      0
interest                       0
overdue_interest               0
overdue_fee                    0
nbki_score                 84476
payment_frequency              0
status                         0
loan_id                        0
client_id                      0
source                         0
first_source                   0
interface 

In [11]:
test.isnull().sum()

monthly_income                63
work_experience           885518
requested_sum            1184953
main_agreement_amount          0
main_agreement_term            0
requested_period_days     147861
requested_amount          138971
req_app_amount            138971
approved_amount                0
period_days                    0
days_finish_loan               0
ag                             0
cnt_ext                  1197042
term                     1197042
price                    1197133
elecs_sum                   1984
recurents_sum               1984
tamount                      618
issues                         0
principal                      0
interest                       0
overdue_interest               0
overdue_fee                    0
nbki_score                 27977
payment_frequency              0
status                         0
loan_id                        0
client_id                      0
source                         0
first_source                   0
interface 

In [12]:
train['region'].head(50)

0                                        г Москва
1                                        г Москва
2                                   край Пермский
3                                   обл Самарская
4                                        г Москва
5                                 Респ Удмуртская
6                             обл Калининградская
7                               обл Ленинградская
8                                 обл Челябинская
9                                  обл Московская
10                            край Ставропольский
11    АО Ханты-Мансийский Автономный округ - Югра
12                              Респ Башкортостан
13                                обл Ульяновская
14                                обл Челябинская
15                              край Красноярский
16                              г Санкт-Петербург
17                                 Респ Татарстан
18    АО Ханты-Мансийский Автономный округ - Югра
19                              обл Новосибирская


In [13]:
train['settlement'].value_counts()

settlement
г Москва                   247270
г Санкт-Петербург          113231
г Новосибирск               58199
г Екатеринбург              53476
г Казань                    43216
                            ...  
деревня Нижние Адам-Учи         1
деревня Нижние Бурнаши          1
деревня Нижние Кропачи          1
село Пахаревка                  1
д Митяево                       1
Name: count, Length: 47425, dtype: int64

In [14]:
class DataCleanerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, region_column=None, region_reference=None):
        self.missing_cols = []
        self.region_column = region_column
        self.region_reference = region_reference
        self.reference_list = []
        self.label_encoders = {}

    def fit(self, X, y=None):
        # Сохраняем список колонок с пропущенными значениями для добавления индикаторов
        self.missing_cols = X.columns[X.isnull().any()].tolist()

        if self.region_column and self.region_reference:
            reference = pd.read_csv(self.region_reference)
            reference["region_name"] = reference["region_name"].str.lower().str.strip()
            self.reference_list = reference["region_name"].tolist()

        return self

    def transform(self, X):
        data = X.copy()
        
        # Добавляем индикаторы пропущенных значений перед заполнением пропусков
        data = self.add_missing_indicators(data)

        # Применяем функции для заполнения пропущенных значений
        data = self.fill_monthly_income(data)
        data = self.fill_work_experience(data)
        data = self.fill_requested_sum(data)
        data = self.fill_requested_amount_and_req_app_amount(data)
        data = self.fill_requested_period_days(data)
        data = self.fill_term_and_price(data)
        data = self.fill_elecs_sum_and_recurents_sum(data)
        data = self.fill_tamount(data)
        data = self.fill_nbki_score(data)
        data = self.fill_cnt_ext(data)
        data = self.fill_start_dt(data)
        data = self.fill_contact_cases(data)

        if self.region_column:
            data[self.region_column] = data[self.region_column].str.lower().str.strip()
            data[self.region_column] = data[self.region_column].apply(self.find_closest_match)

        data = self.process_settlement(data)

        # Генерация новых признаков
        data = self.generate_features(data)

        # Обработка дат
        data = self.process_dates(data)

        # Преобразование категориальных переменных
        data = self.encode_categorical_features(data)

        # Удаление ненужных признаков
        data = self.delete_bad_features(data)

        return data

    def fill_monthly_income(self, data):
        data['monthly_income'] = data.groupby('status')['monthly_income'].transform(
            lambda x: x.fillna(x.median()) if not np.isnan(x.median()) else x.fillna(data['monthly_income'].median())
        )
        return data

    def fill_work_experience(self, data):
        data['work_experience'] = data.groupby('status')['work_experience'].transform(
            lambda x: x.fillna(x.median()) if not np.isnan(x.median()) else x.fillna(data['work_experience'].median())
        )
        return data

    def fill_requested_sum(self, data):
        data['requested_sum'] = data['requested_sum'].fillna(
            data['approved_amount'] * (data['requested_sum'].median() / data['approved_amount'].median())
        )
        return data

    def fill_requested_amount_and_req_app_amount(self, data):
        data['requested_amount'] = data['requested_amount'].fillna(data['requested_sum'])
        data['req_app_amount'] = data['requested_amount'] - data['approved_amount']
        return data

    def fill_requested_period_days(self, data):
        data['requested_period_days'] = data.groupby('type')['requested_period_days'].transform(
            lambda x: x.fillna(x.median())
        )
        return data

    def fill_term_and_price(self, data):
        data['term'] = data['term'].fillna(0)
        data['price'] = data['price'].fillna(0)
        return data

    def fill_elecs_sum_and_recurents_sum(self, data):
        data['elecs_sum'] = data.groupby('repayment_type')['elecs_sum'].transform(
            lambda x: x.fillna(x.median())
        )
        data['recurents_sum'] = data.groupby('repayment_type')['recurents_sum'].transform(
            lambda x: x.fillna(x.median())
        )
        return data

    def fill_tamount(self, data):
        data['tamount'] = data['tamount'].fillna(data['principal'] + data['interest'] + data['issues'])
        return data

    def fill_nbki_score(self, data):
        data['nbki_score'] = data.groupby('status')['nbki_score'].transform(
            lambda x: x.fillna(x.median())
        )
        return data

    def fill_cnt_ext(self, data):
        data['cnt_ext'] = data['cnt_ext'].fillna(0)
        return data

    def add_missing_indicators(self, data):
        for col in self.missing_cols:
            data[f'{col}_missing'] = data[col].isnull().astype(int)
        return data

    def fill_contact_cases(self, data):
        data['contact_cases'] = data.groupby('repayment_type')['contact_cases'].transform(
            lambda x: x.fillna(x.median())
        )
        data['contact_cases'] = data['contact_cases'].fillna(data['contact_cases'].median())
        return data

    def fill_start_dt(self, data):
        data['start_dt'] = pd.to_datetime(data['start_dt'], errors='coerce')
        data['start_dt'] = data.groupby('client_id')['start_dt'].transform(
            lambda x: x.fillna(x.min())
        )
        data['start_dt'] = data['start_dt'].fillna(data['created_at'])
        return data

    def find_closest_match(self, region):
        match = process.extractOne(region, self.reference_list, scorer=fuzz.ratio)
        return match[0] if match and match[1] >= 60 else region  # Минимальный порог схожести - 60%

    def generate_features(self, data):
        # Платежное поведение
        data['payment_to_income_ratio'] = data['tamount'] / data['monthly_income']
        data['interest_to_principal_ratio'] = data['interest'] / data['principal']
        data['overdue_ratio'] = (data['overdue_interest'] + data['overdue_fee']) / data['principal']

        # Временные характеристики
        data['early_repayment'] = data['period_days'] - data['days_finish_loan']

        # Кредитное поведение
        data['approval_ratio'] = data['approved_amount'] / data['requested_amount']
        data['previous_extensions'] = data['cnt_ext'].fillna(0)
        data['had_extensions'] = data['cnt_ext'].notna().astype(int)

        # Признаки рискованности клиента
        data['risk_score'] = (
            (data['nbki_score'] * -1) +  # Инвертируем скор, чтобы высокие значения означали высокий риск
            (data['overdue_ratio'] * 2) +
            (data['had_extensions'] * 1.5)
        )

        # Поведенческие признаки
        data['digital_engagement'] = ((data['interface'] == 2) |
                                        (data['source'].isin([11, 12]))).astype(int)
        data['prefers_longer_terms'] = (data['requested_period_days'] > 
                                          data['requested_period_days'].mean()).astype(int)

        # Группируем по client_id и создаем новые признаки
        client_features = data.groupby('client_id').agg({
            'loan_id': 'count',  # Общее количество займов
            'days_finish_loan': 'mean',  # Средний срок закрытия займа
            'approved_amount': 'mean',  # Средняя сумма займа
            'cnt_ext': 'sum',  # Общее количество продлений
            'elecs_sum': 'sum',  # Сумма штрафов
            'created_at': 'max',  # Дата последнего займа
            'monthly_income': 'std',  # Вариация дохода
            'contact_cases': 'sum'  # Количество обращений в поддержку
        }).reset_index()

        client_features.rename(columns={
            'loan_id': 'total_loans',
            'days_finish_loan': 'avg_days_finish_loan',
            'approved_amount': 'avg_approved_amount',
            'cnt_ext': 'total_extensions',
            'elecs_sum': 'total_fees',
            'created_at': 'last_loan_date',
            'monthly_income': 'income_variation',
            'contact_cases': 'total_contacts'
        }, inplace=True)
        data = data.merge(client_features, on='client_id', how='left')
        
        # Финансовая нагрузка
        data['total_debt_burden'] = (data['elecs_sum'] + data['recurents_sum']) / data['monthly_income']
        data['disposable_income_ratio'] = 1 - data['total_debt_burden']

        return data

    def process_dates(self, data):
        date_columns = ['created_at', 'closed_at', 'start_dt']
        for col in date_columns:
            if col in data.columns:
                data[col] = pd.to_datetime(data[col], errors='coerce')  # Преобразуем в datetime
                data[col] = data[col].fillna(data['start_dt'])  # Заполняем отсутствующие значения датой из 'start_dt'
                # Преобразуем datetime в формат timestamp (количество секунд с начала эпохи)
                data[col] = data[col].astype('int64') // 10**9
        return data

    def encode_categorical_features(self, data):
        categorical_features = ['status', 'payment_frequency', 'source', 'first_source', 
                                'interface', 'type', 'repayment_type', 'settlement', 'client_type.1', 'region', 'gender']

        for feature in categorical_features:
            data[feature] = data[feature].astype(str).fillna("missing")  # Используем строку, чтобы избежать ошибки
            le = LabelEncoder()
            data[feature] = le.fit_transform(data[feature])
            self.label_encoders[feature] = le

        return data

    def clean_city_name(self, name):
        # Извлечение последнего слова
        name = name.split()[-1].lower().strip()
        # Удаление нежелательных символов (всего, что не буквы и не цифры)
        name = re.sub(r'[^а-яa-z0-9]', '', name)
        
        return name

    def process_settlement(self, data):
        data['settlement'] = data['settlement'].apply(self.clean_city_name)
        threshold = 400
        data_region_counts = data['settlement'].value_counts()
        data['settlement'] = data['settlement'].apply(lambda x: x if data_region_counts[x] > threshold else 'другие')

        return data

    def delete_bad_features(self, data):
        columns = ['loan_id', 'client_id']
        data.drop(columns=columns, inplace=True)
        
        return data


In [15]:
loan_id_test = test['loan_id']

In [16]:
# Пример использования
start_time = time.time()
transformer = DataCleanerTransformer(
    region_column="region", region_reference="region_reference.csv"
)
# Файл region_reference содержит правильно написанные названия регионов, исправление работает, так что, находятся 
# ближайше возможные записи из region_reference и по ним заменяются записи из train и test

train = transformer.fit_transform(train)
test = transformer.fit_transform(test)

end_time = time.time()
elapsed_time = round(end_time - start_time, 2)
print(f"\nElapsed Time: {elapsed_time} seconds")


Elapsed Time: 572.3 seconds


In [17]:
X = train.drop(['churn'], axis=1)
y = train['churn']

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [18]:
%%time
# Настройка модели
model = CatBoostClassifier(verbose=False, random_seed=42)

# Параметры для GridSearch
param_grid = {
    'iterations': [500],
    'depth': [6, 8],
}

# Инициализация GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',  # Оптимизируем F1-метрику
    cv=3,          # Стратифицированная кросс-валидация
    verbose=2,
    n_jobs=-1      # Используем все доступные процессоры
)

# Обучение GridSearch
grid_search.fit(X_train, y_train)

# Вывод лучших параметров и метрик
print("\nЛучшие параметры: ", grid_search.best_params_)
print("Лучший F1-скор на кросс-валидации: ", grid_search.best_score_)

# Оценка на валидационных данных
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nОтчет о классификации на валидационных данных:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 2 candidates, totalling 6 fits

Лучшие параметры:  {'depth': 8, 'iterations': 500}
Лучший F1-скор на кросс-валидации:  0.9358515554583898

Отчет о классификации на валидационных данных:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94    236805
           1       0.97      0.90      0.94    237647

    accuracy                           0.94    474452
   macro avg       0.94      0.94      0.94    474452
weighted avg       0.94      0.94      0.94    474452

CPU times: user 13min 7s, sys: 12.8 s, total: 13min 20s
Wall time: 17min 12s


In [19]:
def submission(model, train_df, y_train, test_df, loan_id_submission, target_column='churn', output_prefix='submission_baseline'):
    
    X_train = train_df
    y_train = y_train
    X_test = test_df

    model.fit(X_train, y_train)

    y_pred_test = model.predict(X_test)

    # Подготовка DataFrame для сохранения
    submission = pd.DataFrame({
        'loan_id': loan_id_submission,
        'churn': y_pred_test
    })

    # Формирование имени файла
    model_name = type(model).__name__
    output_file = f"{output_prefix}_{model_name}.csv"

    # Сохранение файла
    submission.to_csv(output_file, index=False)
    print(f"Предсказания сохранены в файл: {output_file}")

In [20]:
model= CatBoostClassifier(depth=8, iterations=500)
submission(model, train.drop(columns=['churn', 'closed_at']), train['churn'], test, loan_id_test)

Learning rate set to 0.5
0:	learn: 0.3868208	total: 991ms	remaining: 8m 14s
1:	learn: 0.2969361	total: 2.01s	remaining: 8m 21s
2:	learn: 0.2539313	total: 2.87s	remaining: 7m 55s
3:	learn: 0.2341843	total: 3.79s	remaining: 7m 49s
4:	learn: 0.2146058	total: 4.73s	remaining: 7m 48s
5:	learn: 0.2079574	total: 5.44s	remaining: 7m 27s
6:	learn: 0.2005728	total: 6.45s	remaining: 7m 34s
7:	learn: 0.1952080	total: 7.54s	remaining: 7m 43s
8:	learn: 0.1897037	total: 8.72s	remaining: 7m 55s
9:	learn: 0.1863603	total: 9.54s	remaining: 7m 47s
10:	learn: 0.1827046	total: 10.4s	remaining: 7m 40s
11:	learn: 0.1799363	total: 11.2s	remaining: 7m 34s
12:	learn: 0.1763581	total: 12s	remaining: 7m 30s
13:	learn: 0.1749933	total: 12.7s	remaining: 7m 20s
14:	learn: 0.1727718	total: 13.7s	remaining: 7m 21s
15:	learn: 0.1703832	total: 14.5s	remaining: 7m 20s
16:	learn: 0.1688658	total: 15.3s	remaining: 7m 14s
17:	learn: 0.1676968	total: 16.1s	remaining: 7m 12s
18:	learn: 0.1664476	total: 17.1s	remaining: 7m 13s