In [1]:
import pandas as pd
import numpy as np
import chardet
import category_encoders as ce
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
import joblib

In [3]:
#Даны данные об использовании клиентами услуг банка (файл ABT_EM).
#Витрина (матрица объекты-признаки) уже готова, на одного клиента приходится одна строчка таблицы данных.
#Необходимо построить модель прогнозирования оттока по кредитной карте 
#(переменная cc_clos – индикатор закрытия кредитной карты в следующем месяце - целевая).
#Все остальные столбцы доступны для использования в качестве предикторов.
#При решении обязательно сделать партицирование выборки – разделить все объекты на обучение (70%) и тест (30%).
#Желательно сделать отбор переменных несколькими методами 
#и применить несколько различных алгоритмов машинного обучения (деревья решений, регрессси, нейросети).
#Задача считается решенной, если значение к-та GINI на тестовой выборке превышает 60%.
def analyze_file(filename):

    with open(filename, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']

    for sep in ['\t']:
        try:
            df_sample = pd.read_csv(filename, sep=sep, encoding=encoding)
            print(f"Разделитель '{sep}': {len(df_sample.columns)} столбцов")
            print(df_sample.head())
            print("------")
            df_sample.to_csv('../data/abt_em.csv', index=False)
        except:
            continue


analyze_file('../data/abt_em.tab')
df = pd.read_csv('../data/abt_em.csv')

Разделитель '	': 73 столбцов
   customerID  atm_trn_cnt_1  atm_trn_cnt  bt_trn_cnt_1  bt_trn_cnt  \
0     3764517              5           23            14           1   
1     4440112             10            5            10           2   
2     2267213              0            0            14           1   
3     4488910             16            6            10           1   
4     2150617              0            0            18           2   

   chq_trn_cnt_1  chq_trn_cnt  cc_trn_cnt_1  cc_trn_cnt  elt_trn_cnt_1  ...  \
0              0            0             0           0             12  ...   
1              0            0            11           0             10  ...   
2              0            0            16          10             16  ...   
3              9            1            10           1             13  ...   
4              0            0            10           0             10  ...   

   npv_total  tra_avg_bal_1  tra_avg_bal  tr_min_bal  tra_auth_no  \


In [3]:
df

Unnamed: 0,customerID,atm_trn_cnt_1,atm_trn_cnt,bt_trn_cnt_1,bt_trn_cnt,chq_trn_cnt_1,chq_trn_cnt,cc_trn_cnt_1,cc_trn_cnt,elt_trn_cnt_1,...,npv_total,tra_avg_bal_1,tra_avg_bal,tr_min_bal,tra_auth_no,tra_auth_yes,call_enq_cnt,call_trn_cnt_1,call_trn_cnt,Change_In_ATM_TR
0,3764517,5,23,14,1,0,0,0,0,12,...,468.37,7235,7218.98,6067.53,-1,-1,0,0,0,360.000000
1,4440112,10,5,10,2,0,0,11,0,10,...,-395.25,254,234.26,87.78,-1,-1,0,0,0,-50.000000
2,2267213,0,0,14,1,0,0,16,10,16,...,38.00,0,0.00,0.00,-1,-1,1,10,0,
3,4488910,16,6,10,1,9,1,10,1,13,...,-142.88,568,561.63,284.96,-1,-1,0,0,0,-62.500000
4,2150617,0,0,18,2,0,0,10,0,10,...,-166.96,172,193.23,109.03,-1,-1,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4253,3010910,9,2,9,3,10,4,24,31,34,...,6635.77,0,0.00,0.00,-1,-1,0,0,0,-77.777778
4254,1142196,23,3,18,5,0,0,10,5,32,...,639.00,6796,6781.59,5835.38,-1,-1,0,0,0,-86.956522
4255,4695416,7,1,10,6,0,0,3,8,10,...,3931.03,2837,2821.27,952.03,-1,-1,0,0,0,-85.714286
4256,1007358,10,7,10,2,13,1,10,6,25,...,2829.80,1950,1940.32,967.01,-1,-1,0,6,0,-30.000000


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4258 entries, 0 to 4257
Data columns (total 73 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        4258 non-null   int64  
 1   atm_trn_cnt_1     4258 non-null   int64  
 2   atm_trn_cnt       4258 non-null   int64  
 3   bt_trn_cnt_1      4258 non-null   int64  
 4   bt_trn_cnt        4258 non-null   int64  
 5   chq_trn_cnt_1     4258 non-null   int64  
 6   chq_trn_cnt       4258 non-null   int64  
 7   cc_trn_cnt_1      4258 non-null   int64  
 8   cc_trn_cnt        4258 non-null   int64  
 9   elt_trn_cnt_1     4258 non-null   int64  
 10  elt_trn_cnt       4258 non-null   int64  
 11  ht_trn_cnt_1      4258 non-null   int64  
 12  ht_trn_cnt        4258 non-null   int64  
 13  it_trn_cnt        4258 non-null   int64  
 14  it_trn_cnt_1      4258 non-null   int64  
 15  pos_trn_cnt_1     4258 non-null   int64  
 16  pos_trn_cnt       4258 non-null   int64  


In [5]:
def handle_missing_values(df):
    missing_analysis = df.isnull().sum()
    print("пропуски по столбцам:")
    print(missing_analysis[missing_analysis > 0])
    
    df['Change_In_ATM_TR'] = df['Change_In_ATM_TR'].fillna(df['Change_In_ATM_TR'].median())
    
    if df.isnull().sum().sum() > 0:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
        
        categorical_cols = df.select_dtypes(include=['object']).columns
        df[categorical_cols] = df[categorical_cols].fillna('Unknown')
    
    return df



In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4258 entries, 0 to 4257
Data columns (total 73 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        4258 non-null   int64  
 1   atm_trn_cnt_1     4258 non-null   int64  
 2   atm_trn_cnt       4258 non-null   int64  
 3   bt_trn_cnt_1      4258 non-null   int64  
 4   bt_trn_cnt        4258 non-null   int64  
 5   chq_trn_cnt_1     4258 non-null   int64  
 6   chq_trn_cnt       4258 non-null   int64  
 7   cc_trn_cnt_1      4258 non-null   int64  
 8   cc_trn_cnt        4258 non-null   int64  
 9   elt_trn_cnt_1     4258 non-null   int64  
 10  elt_trn_cnt       4258 non-null   int64  
 11  ht_trn_cnt_1      4258 non-null   int64  
 12  ht_trn_cnt        4258 non-null   int64  
 13  it_trn_cnt        4258 non-null   int64  
 14  it_trn_cnt_1      4258 non-null   int64  
 15  pos_trn_cnt_1     4258 non-null   int64  
 16  pos_trn_cnt       4258 non-null   int64  


In [7]:

df[['tra_avg_bal_1', 'tra_avg_bal']]

Unnamed: 0,tra_avg_bal_1,tra_avg_bal
0,7235,7218.98
1,254,234.26
2,0,0.00
3,568,561.63
4,172,193.23
...,...,...
4253,0,0.00
4254,6796,6781.59
4255,2837,2821.27
4256,1950,1940.32


In [8]:
def encode_categorical_features(df):
    categorical_cols = df.select_dtypes(include=object).columns
    
    print("категориальные признаки:")
    for col in categorical_cols:
        print(f"{col}: {df[col].nunique()} уникальных значений")
        print(f"значения: {df[col].unique()}")
        print(f"распределение:\n{df[col].value_counts()}")
        print("------------")
    
    binary_cols = []
    multi_cols = []
    
    for col in categorical_cols:
        if df[col].nunique() == 2:
            binary_cols.append(col)
        else:
            multi_cols.append(col)
    
    label_encoders = {}
    for col in binary_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
        print(f"закодирован {col}")
    
    target_encoder = ce.TargetEncoder(cols=multi_cols)
    df = target_encoder.fit_transform(df, df['cc_clos'])
    
    return df, label_encoders, target_encoder



In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4258 entries, 0 to 4257
Data columns (total 73 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        4258 non-null   int64  
 1   atm_trn_cnt_1     4258 non-null   int64  
 2   atm_trn_cnt       4258 non-null   int64  
 3   bt_trn_cnt_1      4258 non-null   int64  
 4   bt_trn_cnt        4258 non-null   int64  
 5   chq_trn_cnt_1     4258 non-null   int64  
 6   chq_trn_cnt       4258 non-null   int64  
 7   cc_trn_cnt_1      4258 non-null   int64  
 8   cc_trn_cnt        4258 non-null   int64  
 9   elt_trn_cnt_1     4258 non-null   int64  
 10  elt_trn_cnt       4258 non-null   int64  
 11  ht_trn_cnt_1      4258 non-null   int64  
 12  ht_trn_cnt        4258 non-null   int64  
 13  it_trn_cnt        4258 non-null   int64  
 14  it_trn_cnt_1      4258 non-null   int64  
 15  pos_trn_cnt_1     4258 non-null   int64  
 16  pos_trn_cnt       4258 non-null   int64  


In [10]:
def handle_outliers(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_cols = [col for col in numeric_cols if col != 'cc_clos']
    
    for col in numeric_cols:
        Q1 = df[col].quantile(0.05)
        Q3 = df[col].quantile(0.95)
        df[col] = df[col].clip(lower=Q1, upper=Q3)
    
    return df

In [11]:
def scale_and_select_features(df, target_col='cc_clos', method='variance',corr_threshold=0.1, mi_threshold=0.01, return_X_y=False):
    y = df[target_col]
    X = df.drop(columns=[target_col])
    
    if 'customerID' in X.columns:
        X = X.drop(columns=['customerID'])
    
    print(f"иисходное количество признаков: {X.shape[1]}")
    
    if method == 'variance':
        selector = VarianceThreshold(threshold=0.1)
        X_selected = selector.fit_transform(X)
        selected_columns = X.columns[selector.get_support()].tolist()
        
    elif method == 'correlation':
        selected_columns, _ = select_features_by_correlation(X, y, corr_threshold)
        X_selected = X[selected_columns]
        
    elif method == 'mutual_info':
        selected_columns, _ = select_features_by_mutual_info(X, y, mi_threshold)
        X_selected = X[selected_columns]
    else:
        raise ValueError("метод должен быть: 'variance', 'correlation', 'mutual_info'")
    
    print(f"Отобрано признаков: {len(selected_columns)}")
    
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    if return_X_y:
        return X_scaled, y, scaler, selected_columns
    
    X_final = pd.DataFrame(X_scaled, columns=selected_columns, index=df.index)
    X_final[target_col] = y.values
    
    return X_final, scaler, selected_columns

In [12]:
def select_features_by_correlation(X_train, y_train, threshold=0.1):
    correlations = X_train.corrwith(y_train).abs()
    correlations = correlations.sort_values(ascending=False)
    
    selected_features = correlations[correlations > threshold].index.tolist()
    
    print(f"порог корреляции: {threshold}")
    print(f"всего признаков: {len(correlations)}")
    print(f"отобрано признаков: {len(selected_features)}")
    print(f"медианная корреляция: {correlations.median():.4f}")
    print(f"максимальная корреляция: {correlations.max():.4f}")
    
    return selected_features, correlation

In [13]:
def select_features_by_mutual_info(X_train, y_train, threshold=0.01):
    mi_scores = mutual_info_classif(X_train, y_train)
    mi_series = pd.Series(mi_scores, index=X_train.columns)
    mi_series = mi_series.sort_values(ascending=False)
    
    selected_features = mi_series[mi_series > threshold].index.tolist()
    
    print(f"порог Mutual Information: {threshold}")
    print(f"всего признаков: {len(mi_series)}")
    print(f"отобрано признаков: {len(selected_features)}")
    print(f"медианный MI: {mi_series.median():.4f}")
    print(f"максимальный MI: {mi_series.max():.4f}")
   
    return selected_features, mi_series

In [14]:
def compare_feature_selection_methods(X_train, y_train, corr_threshold=0.1, mi_threshold=0.01):
    
    features_corr, corr_scores = select_features_by_correlation(X_train, y_train, corr_threshold)
    features_mi, mi_scores = select_features_by_mutual_info(X_train, y_train, mi_threshold)
    
    selector = VarianceThreshold(threshold=0.01)
    selector.fit(X_train)
    features_variance = X_train.columns[selector.get_support()].tolist()
    
    all_features = set(X_train.columns)
    comparison_data = []
    
    for feature in all_features:
        comparison_data.append({
            'feature': feature,
            'correlation': corr_scores.get(feature, 0),
            'mutual_info': mi_scores.get(feature, 0),
            'selected_corr': feature in features_corr,
            'selected_mi': feature in features_mi,
            'selected_variance': feature in features_variance
        })
    
    comparison_df = pd.DataFrame(comparison_data)
    

    print(f"Всего признаков: {len(all_features)}")
    print(f"Отобрано по корреляции: {len(features_corr)}")
    print(f"Отобрано по MI: {len(features_mi)}") 
    print(f"Отобрано по дисперсии: {len(features_variance)}")
    

    corr_mi = set(features_corr) & set(features_mi)
    corr_var = set(features_corr) & set(features_variance)
    mi_var = set(features_mi) & set(features_variance)
    all_three = set(features_corr) & set(features_mi) & set(features_variance)
    
    print(f"\nПересечения методов:")
    print(f"Корреляция ∩ MI: {len(corr_mi)} признаков")
    print(f"Корреляция ∩ Дисперсия: {len(corr_var)} признаков") 
    print(f"MI ∩ Дисперсия: {len(mi_var)} признаков")
    print(f"Все три метода: {len(all_three)} признаков")
    
    return comparison_df, {
        'correlation': features_corr,
        'mutual_info': features_mi, 
        'variance': features_variance,
        'intersection_all': list(all_three)
    }

In [15]:
def full_preprocessing_pipeline(df, target_col='cc_clos'):
    print(f"исходная форма: {df.shape}")
    
    #пропуска
    df = handle_missing_values(df)
    print(f"после обработки пропусков: {df.shape}")
    
    #инкодинг категориальных значений
    df, label_encoders, target_encoder = encode_categorical_features(df)
    print(f"после кодирования категориальных: {df.shape}")
    
    #обработка выбросов
    df = handle_outliers(df)
    print(f"после обработки выбросов: {df.shape}")
    
    #масштабирование и отбор признаков
    df_final, scaler, selector = scale_and_select_features(
    df, method='variance'
)
    print(f"финальная форма: {df_final.shape}")
    
    print("preprocessing - end")
    
    return df_final, {
        'label_encoders': label_encoders,
        'target_encoder': target_encoder,
        'scaler': scaler,
        'selector': selector
    }

df_processed, preprocessors = full_preprocessing_pipeline(df)

исходная форма: (4258, 73)
пропуски по столбцам:
Change_In_ATM_TR    1556
dtype: int64
после обработки пропусков: (4258, 73)
категориальные признаки:
arrear_ind: 2 уникальных значений
значения: ['N' 'Y']
распределение:
arrear_ind
N    4000
Y     258
Name: count, dtype: int64
------------
chq_acc_ind: 2 уникальных значений
значения: ['N' 'Y']
распределение:
chq_acc_ind
N    2869
Y    1389
Name: count, dtype: int64
------------
gender: 3 уникальных значений
значения: ['F' 'M' 'U']
распределение:
gender
F    2118
M    2100
U      40
Name: count, dtype: int64
------------
title: 8 уникальных значений
значения: ['MS' 'MRS' 'MR' 'MISS' 'UNKNOWN' 'AC' 'REL' 'MASTER']
распределение:
title
MR         1536
UNKNOWN    1177
MRS         910
MISS        396
MS          223
AC           13
MASTER        2
REL           1
Name: count, dtype: int64
------------
закодирован arrear_ind
закодирован chq_acc_ind
после кодирования категориальных: (4258, 73)
после обработки выбросов: (4258, 73)
иисходное коли

  df[col] = df[col].clip(lower=Q1, upper=Q3)


In [16]:
def prepare_for_modeling(df_processed, target_col='cc_clos', test_size=0.3):
    x = df_processed.drop(columns=[target_col])
    y = df_processed[target_col]
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, stratify=y)
    
    print(f"train: {x_train.shape}, test: {x_test.shape}")
    print(f"распределение целевой в train: {y_train.value_counts(normalize=True)}")
    print(f"распределение целевой в test: {y_test.value_counts(normalize=True)}")
    
    return x_train, x_test, y_train, y_test



In [18]:
df_processed, preprocessors = full_preprocessing_pipeline(df)
x_train, x_test, y_train, y_test = prepare_for_modeling(df_processed)

исходная форма: (4258, 73)
пропуски по столбцам:
Series([], dtype: int64)
после обработки пропусков: (4258, 73)
категориальные признаки:
gender: 3 уникальных значений
значения: ['F' 'M' 'U']
распределение:
gender
F    2118
M    2100
U      40
Name: count, dtype: int64
------------
title: 8 уникальных значений
значения: ['MS' 'MRS' 'MR' 'MISS' 'UNKNOWN' 'AC' 'REL' 'MASTER']
распределение:
title
MR         1536
UNKNOWN    1177
MRS         910
MISS        396
MS          223
AC           13
MASTER        2
REL           1
Name: count, dtype: int64
------------
после кодирования категориальных: (4258, 73)
после обработки выбросов: (4258, 73)
иисходное количество признаков: 71
Отобрано признаков: 63
финальная форма: (4258, 64)
preprocessing - end
train: (2980, 63), test: (1278, 63)
распределение целевой в train: cc_clos
0    0.953691
1    0.046309
Name: proportion, dtype: float64
распределение целевой в test: cc_clos
0    0.953834
1    0.046166
Name: proportion, dtype: float64


  df[col] = df[col].clip(lower=Q1, upper=Q3)


In [32]:
def gini_score(y_true, y_pred_proba):
    auc = roc_auc_score(y_true, y_pred_proba)
    return 2 * auc - 1

def evaluate_model(model, x_test, y_test, model_name):
    y_pred = model.predict(x_test)
    y_pred_proba = model.predict_proba(x_test)[:, 1]
    
    auc = roc_auc_score(y_test, y_pred_proba)
    gini = gini_score(y_test, y_pred_proba)
    
    print(f"\n{'-'*50}")
    print(f"рзультаты {model_name}:")
    print(f"{'-'*50}")
    print(f"AUC: {auc:.4f}")
    print(f"GINI: {gini:.4f}")
    
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    print(f"сonfusion matrix:")
    print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
    print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")
    
    return {
        'model_name': model_name,
        'model': model,
        'auc': auc,
        'gini': gini,
        'y_pred_proba': y_pred_proba,
        'y_pred': y_pred
    }

In [33]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
}


def train_logistic_regression(x_train, x_test, y_train, y_test):
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200, 300]
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    
    grid_search = GridSearchCV(
        LogisticRegression(class_weight='balanced'),
        param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(x_train, y_train)
    
    print(f"лучшие параметры: {grid_search.best_params_}")
    print(f"лучший AUC на cv: {grid_search.best_score_:.4f}")
    
    results = evaluate_model(grid_search.best_estimator_, x_test, y_test, "Logistic Regression")
    results['best_params'] = grid_search.best_params_
    
    return results
def train_random_forest(x_train, x_test, y_train, y_test):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
        'class_weight': ['balanced', 'balanced_subsample']
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    
    grid_search = GridSearchCV(
        RandomForestClassifier(n_jobs=-1),
        param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(x_train, y_train)
    
    print(f"лучшие параметры: {grid_search.best_params_}")
    print(f"лучший AUC на cv: {grid_search.best_score_:.4f}")
    
    feature_importance = pd.DataFrame({
        'feature': x_train.columns,
        'importance': grid_search.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"nоп важных признаков:")
    for i, row in feature_importance.head(10).iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f}")
    
    results = evaluate_model(grid_search.best_estimator_, x_test, y_test, "Random Forest")
    results['best_params'] = grid_search.best_params_
    results['feature_importance'] = feature_importance
    
    return results

def train_gradient_boosting(x_train, x_test, y_train, y_test):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 0.9, 1.0]
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    
    grid_search = GridSearchCV(
        GradientBoostingClassifier(),
        param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(x_train, y_train)
    
    print(f"лучшие параметры: {grid_search.best_params_}")
    print(f"лучший AUC на сv: {grid_search.best_score_:.4f}")
    
    results = evaluate_model(grid_search.best_estimator_, x_test, y_test, "Gradient Boosting")
    results['best_params'] = grid_search.best_params_
    
    return results
    
def compare_all_models(x_train, x_test, y_train, y_test, models_to_train):
    models_to_train = ['logistic', 'random_forest', 'gradient_boosting']

    results = {}
    
    training_functions = {
        'logistic': train_logistic_regression,
        'random_forest': train_random_forest,
        'gradient_boosting': train_gradient_boosting,
    }
    
    for model_name in models_to_train:
        if model_name in training_functions:
            print(f"\n{'-'*60}")
            print(f"обучение {model_name.upper()}...")
            print(f"{'-'*60}")
                
            result = training_functions[model_name](x_train, x_test, y_train, y_test)
            results[model_name] = result
    
    print(f"\n{'-'*80}")
    print("сравнение моделей")
    print(f"{'-'*80}")
    
    comparison_data = []
    for model_name, result in results.items():
        comparison_data.append({
            'Model': model_name.replace('_', ' ').title(),
            'AUC': f"{result['auc']:.4f}",
            'GINI': f"{result['gini']:.4f}",
            'Target_GINI_0.6': 'да' if result['gini'] > 0.6 else 'нет',
            'Best_Params': str(result['best_params'])[:50] + '...' if len(str(result['best_params'])) > 50 else str(result['best_params'])
        })
    
    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.sort_values('GINI', ascending=False)
    
    print(comparison_df.to_string(index=False))
    
    best_model_name = max(results.items(), key=lambda x: x[1]['gini'])[0]
    best_model_result = results[best_model_name]
    
    print(f"ЛУЧШАЯ МОДЕЛЬ: {best_model_name.replace('_', ' ').title()}")
    print(f"GINI: {best_model_result['gini']:.4f}")
    print(f"AUC: {best_model_result['auc']:.4f}")
    
    return results, comparison_df, best_model_result



In [None]:
all_results, comparison_df, best_model = compare_all_models(x_train, x_test, y_train, y_test,
    models_to_train=['logistic', 'random_forest', 'gradient_boosting']
)

joblib.dump(best_model['model'], '../models/best_churn_model.pkl')

In [24]:
df_processed

Unnamed: 0,atm_trn_cnt_1,atm_trn_cnt,bt_trn_cnt_1,bt_trn_cnt,chq_trn_cnt_1,chq_trn_cnt,cc_trn_cnt_1,cc_trn_cnt,elt_trn_cnt_1,elt_trn_cnt,...,avg_bal_1,npv_total,tra_avg_bal_1,tra_avg_bal,tr_min_bal,tra_auth_no,tra_auth_yes,call_trn_cnt_1,Change_In_ATM_TR,cc_clos
0,-0.2,2.166667,0.666667,0.000000,0.0,0.0,-1.111111,-0.285714,0.142857,1.263158,...,0.997053,0.407691,4.851266,4.869779,8.496811,0.0,0.0,0.0,6.344718,0
1,0.3,0.666667,0.000000,0.333333,0.0,0.0,0.111111,-0.285714,0.000000,-0.052632,...,0.082449,-0.394882,0.104708,0.096749,0.122925,0.0,0.0,0.0,0.396545,0
2,-0.7,-0.166667,0.666667,0.000000,0.0,0.0,0.666667,1.142857,0.428571,0.157895,...,-0.185265,-0.057098,-0.067993,-0.063333,0.000000,0.0,0.0,10.0,0.000000,0
3,0.9,0.833333,0.000000,0.000000,1.8,1.0,0.000000,-0.142857,0.214286,0.000000,...,-5.220105,-0.252444,0.318205,0.320458,0.399051,0.0,0.0,0.0,-0.099136,0
4,-0.7,-0.166667,1.333333,0.333333,0.0,0.0,0.000000,-0.285714,0.000000,-0.368421,...,-0.572168,-0.278450,0.048955,0.068711,0.152683,0.0,0.0,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4253,0.2,0.166667,-0.166667,0.666667,2.0,4.0,1.555556,4.142857,1.714286,1.473684,...,-21.028841,5.030220,-0.067993,-0.063333,0.000000,0.0,0.0,0.0,-0.704969,0
4254,1.3,0.333333,1.333333,1.333333,0.0,0.0,0.000000,0.428571,1.571429,1.263158,...,0.148461,0.591967,4.552779,4.570887,8.171714,0.0,0.0,0.0,-1.068947,0
4255,0.0,0.000000,0.000000,1.666667,0.0,0.0,-0.777778,0.857143,0.000000,0.315789,...,-21.028841,4.147276,1.860955,1.864590,1.333198,0.0,0.0,0.0,-1.019687,0
4256,0.3,1.000000,0.000000,0.333333,2.6,1.0,0.000000,0.571429,1.071429,1.894737,...,4.476333,2.957975,1.257862,1.262590,1.354176,0.0,0.0,6.0,1.189635,0
