# Setup

In [1]:
# import sklearn
# print(sklearn.__version__)

1.6.1


In [1]:
import pandas as pd
import numpy as np
import os
import re
# import gc  #  Работа с памятью
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

## Phi_K Correlation
# import phik
# from phik.report import plot_correlation_matrix
# from phik import report


## Preprocessing 
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder  #  Encode categorical features
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler  #  Transform features by scaling

from sklearn.impute import SimpleImputer, KNNImputer  #  Transformers for missing value imputation
# Some estimators are designed to handle NaN values without preprocessing 

from sklearn.compose import ColumnTransformer

# from sklearn.pipeline import Pipeline
# Pipeline позволяет объединить несколько шагов обработки данных и обучения в единую последовательность

# from sklearn.feature_selection import SelectKBest, f_classif

## Tools for model selection, such as cross validation and hyper-parameter tuning
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import StratifiedKFold, 
#from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score  
#from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  #  Поиск оптимальных параметров


## Ensemble-based methods for classification, regression and anomaly detection
from sklearn.ensemble import HistGradientBoostingClassifier, AdaBoostClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


# Linear Models section 
from sklearn.linear_model import LogisticRegression
# from sklearn.linear_model import LinearRegression


## Decision tree based models for classification and regression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.tree import plot_tree

## boosting models

# import xgboost as xgb
# from xgboost import XGBClassifier
# from xgboost import XGBRegressor
# from xgboost import plot_importance  #  Важность признаков
# from xgboost import plot_tree  #  Визуализация дерева

# import lightgbm as lgb
# from lightgbm import early_stopping, log_evaluation

# from catboost import CatBoostClassifier, Pool


## Score functions, performance metrics
from sklearn.metrics import mean_absolute_error, roc_auc_score, mean_squared_error, r2_score  #  Regression metrics (roc_auc_score is the exception: it is a classification metric)
from sklearn.metrics import accuracy_score, classification_report #  Classification metrics
from sklearn.metrics import log_loss  #  Tracking the loss during training
## Оптимизация гиперпараметров
import optuna

## SHAP для интерпретации
import shap


# import warnings
# warnings.filterwarnings("ignore")
# warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# pandas display options: render floats with two digits after the decimal point
pd.options.display.precision = 2


# Datasets

In [4]:
## classification
from sklearn.datasets import load_iris
# from sklearn.datasets import load_breast_cancer
# from sklearn.datasets import load_wine

# ## regression
# from sklearn.datasets import load_boston  # NOTE: removed in scikit-learn 1.2
# from sklearn.datasets import load_diabetes

# Load the Iris dataset and pull the feature matrix / target vector out of it
iris = load_iris()
X = iris.data
y = iris.target

# Quick look at what was loaded
# print(iris.DESCR)
print(f"Размерность данных: {X.shape}")
print(f"Классы: {iris.target_names}")

Размерность данных: (150, 4)
Классы: ['setosa' 'versicolor' 'virginica']


In [None]:
# Name of the target column in the training CSV
target = 'итоговый_статус_займа'

# Raw train / test frames as read from disk
train_df = pd.read_csv('shift_ml_2025_train.csv')
test_df = pd.read_csv('shift_ml_2025_test.csv')

In [None]:
# Work on copies so the raw frames loaded from disk stay untouched.
train = train_df.copy()
# The source frame is test_df; `test` does not exist before this line, so copying `test` raised NameError.
test = test_df.copy()

In [None]:
# Combine several CSV files from the working directory into one DataFrame

import glob

# glob returns paths in arbitrary (OS-dependent) order; sorting keeps the row order of `combined` reproducible
files = sorted(glob.glob('*.csv'))
dfs = [pd.read_csv(file) for file in files]
# ignore_index=True discards the per-file indices and builds a fresh 0..n-1 index
combined = pd.concat(dfs, ignore_index=True)

In [None]:
# CSV files
df_csv = pd.read_csv('data.csv', encoding='utf-8', sep=',')

# CSV with explicit parsing options
df = pd.read_csv('data.csv',
                 sep=';',           # field delimiter
                 header=0,          # row that holds the column names
                 index_col=0,       # column to use as the index
                 parse_dates=True,  # try to parse the index as dates
                 na_values=['N/A', 'NULL'])  # extra strings to treat as NaN

# Excel files
df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# JSON files
df_json = pd.read_json('data.json')

# SQL databases
import sqlite3
from contextlib import closing

# closing() guarantees the connection is released once the query has run;
# sqlite3's own `with conn:` only commits / rolls back and leaves the connection open.
with closing(sqlite3.connect('database.db')) as conn:
    df_sql = pd.read_sql('SELECT * FROM table_name', conn)

# HTML tables
# NOTE: read_html returns a list of DataFrames (one per table found), not a single frame
df_html = pd.read_html('https://example.com/table')

# Parquet files (for large data)
df_parquet = pd.read_parquet('data.parquet')

# Генерация синтетических данных

In [None]:
from sklearn.datasets import make_classification, make_regression

# Synthetic classification data: 1000 samples x 20 features
# (10 informative, 5 redundant), one cluster per class.
# NOTE: this rebinds the names X and y.
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=10, n_redundant=5,
                           n_clusters_per_class=1, random_state=42)

# Synthetic regression data: 1000 samples x 10 features with Gaussian noise (std 0.1) on the target
X_reg, y_reg = make_regression(n_samples=1000, n_features=10,
                               noise=0.1, random_state=42)

# Basic EDA

In [None]:
# (rows, columns) of the train and test frames
train.shape, test.shape

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Per-column dtype / non-null summary; verbose=True forces the full column listing
train.info(verbose=True)

In [None]:
# Column labels of the raw training frame
train_df.columns

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().T

### Баланс классов в целевой переменной

In [None]:
# Share of each target class (normalize=True returns fractions instead of counts)
train[target].value_counts(normalize=True)

In [None]:
# vc = train[target].value_counts().sort_index()
# plt.figure()
# vc.plot(kind="bar")
# plt.title("Распределение классов")
# plt.xlabel("Класс")
# plt.ylabel("Количество")
# plt.show()

In [None]:
# A notebook cell renders only its last expression, so the first table is shown
# explicitly with display(); otherwise it would be computed and silently discarded.

# Joint distribution of churn vs. international plan, as fractions of all rows
display(pd.crosstab(df["Churn"], df["International plan"], normalize=True))

# Churn counts per number of customer-service calls, with row/column totals
pd.crosstab(df["Churn"], df["Customer service calls"], margins=True)

In [None]:
columns_to_show = ["Total day minutes", "Total eve minutes", "Total night minutes"]

# Mean / std / min / max of the minute columns for each churn group.
# Aggregations are named by string: passing NumPy callables (np.mean, np.std, ...) to agg
# is deprecated in pandas 2.x and emits a FutureWarning.
df.groupby(["Churn"])[columns_to_show].agg(["mean", "std", "min", "max"])

In [None]:
# Mean number of day / evening / night calls for each area code
call_columns = ["Total day calls", "Total eve calls", "Total night calls"]
df.pivot_table(values=call_columns, index=["Area code"], aggfunc="mean")

In [None]:
# Row counts per Many_service_calls value, split by churn; the trailing semicolon hides the Axes repr
sns.countplot(data=df, x="Many_service_calls", hue="Churn");

In [None]:
# sns.pairplot(train_df, hue='Personality')

In [None]:
# phik_overview = train_df.phik_matrix()
# phik_overview['Personality'].sort_values(ascending=False)

# Missing values

In [None]:
# Number of missing values in every column of the raw test frame
test_df.isna().sum()

In [None]:
# Missing-value report for the training frame: absolute counts and percentages,
# worst columns first, limited to the top 25
n_rows = len(train)
missing = train.isna().sum().sort_values(ascending=False)
missing_pct = missing.div(n_rows).mul(100).round(2)
missing_report = pd.DataFrame({"missing": missing, "missing_pct": missing_pct})
display(missing_report.head(25))

# Duplicates

In [None]:
# DataFrame.duplicated(subset=None, keep='first')

In [None]:
# DataFrame.drop_duplicates(subset=None, *, keep='first', inplace=False, ignore_index=False)

In [None]:
# Data type conversion
# Cast the 'Возраст' column to 64-bit integers
# NOTE(review): assumes the column has no missing values — confirm before running
df['Возраст'] = df['Возраст'].astype('int64')
# Parse the 'Дата' column into datetime values
df['Дата'] = pd.to_datetime(df['Дата'])

# Выбираем нужные признаки

In [None]:
# Feature matrix: every column except the target; target vector: the target column itself
y = train[target]
X = train.drop(columns=target)

In [None]:
# Names of the feature columns stored with the object dtype
object_columns = X.select_dtypes(include='object').columns
cat_features = list(object_columns)
cat_features

In [None]:
# Map the "No"/"Yes" strings of the "International plan" column to booleans.
# NOTE: values that are not keys of `d` become NaN, so re-running this cell on the
# already-mapped column would blank it out.
d = {"No": False, "Yes": True}
df["International plan"] = df["International plan"].map(d)

# Предобработка данных

| Метод | Класс | Назначение | Параметры |
| ----- | ----- | ---------- | ---------- |
| Стандартизация | StandardScaler | Приведение к нулевому среднему и единичной дисперсии | with_mean, with_std |
| Нормализация | MinMaxScaler | Масштабирование в диапазон [0,1] | feature_range |
| Робастное масштабирование | RobustScaler | Устойчивость к выбросам | quantile_range |
| One-hot кодирование | OneHotEncoder | Кодирование категориальных признаков | sparse_output, drop |
| Кодирование меток | LabelEncoder | Преобразование меток целевой переменной в числа | - |
| Полиномиальные признаки | PolynomialFeatures | Создание полиномиальных комбинаций | degree, include_bias |

### Transform features by scaling

In [None]:
# Each scaler below is fitted on the training features only and returns the scaled copy.
# NOTE(review): X_train is first assigned in the train/test split cell further down —
# that cell has to run before this one.

# Standardization (mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Normalization to the [0, 1] range
min_max_scaler = MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X_train)

# Robust scaling (less sensitive to outliers)
robust_scaler = RobustScaler()
X_robust = robust_scaler.fit_transform(X_train)

###  Encode categorical features

In [None]:
# Label encoding
# NOTE(review): y_categorical is not assigned anywhere in the visible notebook — placeholder name
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_categorical)

# Encode target
# NOTE: this rebinds y_encoded, replacing the value assigned just above
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encoding (sparse_output=False returns a dense array)
# NOTE(review): X_categorical is not assigned anywhere in the visible notebook — placeholder name
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X_categorical)

# For a pandas DataFrame
df_encoded = pd.get_dummies(df, columns=['categorical_column'])

In [None]:
# Encode categorical features
cat_cols = ['Stage_fear','Drained_after_socializing']
encoder = OrdinalEncoder()
# Learn the category -> integer mapping from the training features only ...
X[cat_cols] = encoder.fit_transform(X[cat_cols])
# ... and reuse that same mapping for the test features. Re-fitting on the test set
# could assign different integers to the same category, making train and test codes
# inconsistent. With the default settings, transform() raises on a category that was
# not seen during fit.
X_test[cat_cols] = encoder.transform(X_test[cat_cols])

# Разделение на обучающую и тестовую выборки

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class proportions in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
# Three-way split: 60% train / 20% validation / 20% test.
# Both steps are stratified so every part keeps the class proportions, consistent with
# the two-way split cell above (drop `stratify` for a continuous regression target).

# Step 1: keep 60% for training, set 40% aside
X_train, X_temp, y_train, y_temp = train_test_split(X, y,
                                                    test_size=0.4,
                                                    random_state=42,
                                                    stratify=y)

# Step 2: halve the 40% that was set aside into validation and test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp,
                                                test_size=0.5,
                                                random_state=42,
                                                stratify=y_temp)

In [4]:
# 5-Fold CV
# The top-of-notebook import of StratifiedKFold is commented out, so import it here;
# without it this cell fails with NameError on a fresh kernel.
from sklearn.model_selection import StratifiedKFold

# Shuffled, seeded 5-fold splitter that preserves class proportions in every fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Zero-filled buffers: one slot per training row and one per test row
oof_pred = np.zeros(len(X))
test_pred = np.zeros(len(X_test))

# Обучаем модель

### Линейные модели

In [None]:
# LinearRegression, Ridge and Lasso are not imported at the top of the notebook
# (only LogisticRegression is), so bring them into scope here.
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Linear regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
print(f"Коэффициенты: {linear_reg.coef_}")
print(f"Свободный член: {linear_reg.intercept_}")

# Logistic regression
logistic_reg = LogisticRegression(random_state=42)
logistic_reg.fit(X_train, y_train)

# Regularized models (only constructed here, not fitted)
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1.0)

### Деревья решений

In [None]:
# The top-of-notebook imports of these names are commented out, so import them here.
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Decision tree for classification
tree_clf = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=5,
    random_state=42
)
tree_clf.fit(X_train, y_train)

# Tree visualization
# NOTE(review): the labels come from the Iris dataset object — this assumes the tree
# was fitted on the Iris features; confirm when X_train comes from another dataset.
plt.figure(figsize=(12, 8))
plot_tree(tree_clf, feature_names=iris.feature_names, class_names=iris.target_names, filled=True)
plt.show()

### Ансамблевые методы

In [None]:
# The top-of-notebook import of these two classifiers is commented out, so import them here.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Random forest
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42
)
rf_clf.fit(X_train, y_train)

# Gradient boosting
gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
gb_clf.fit(X_train, y_train)

# Feature importances of the fitted random forest (one value per feature)
feature_importance = rf_clf.feature_importances_

### Метод опорных векторов

In [None]:
from sklearn.svm import SVC, SVR, LinearSVC

# Support-vector classifier with an RBF kernel, fitted on the training split
svm_clf = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42)
svm_clf.fit(X_train, y_train)

# Support-vector regressor with the same kernel settings (constructed only, not fitted)
svm_reg = SVR(C=1.0, kernel='rbf', gamma='scale')

### Алгоритмы кластеризации

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

# Each algorithm below is fitted on X and returns one cluster label per row.

# K-means
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Hierarchical (agglomerative) clustering
agg_clustering = AgglomerativeClustering(n_clusters=3)
agg_labels = agg_clustering.fit_predict(X)

# DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)

### Кросс-валидация

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Plain 5-fold cross-validation
# NOTE(review): `model` is first assigned in the CatBoost cell further down —
# presumably any unfitted estimator is meant here; confirm.
scores = cross_val_score(model, X, y, cv=5)
print(f"Средняя точность: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Stratified cross-validation (shuffled, seeded folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf)

### Подбор гиперпараметров

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# The top-of-notebook import of RandomForestClassifier is commented out, so import it here.
from sklearn.ensemble import RandomForestClassifier

# Parameter grid (3 * 4 * 3 = 36 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive grid search with 5-fold CV, scored by accuracy, using all CPU cores
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
print(f"Лучшие параметры: {grid_search.best_params_}")
print(f"Лучший результат: {grid_search.best_score_:.3f}")

# Randomized search: samples 20 combinations from the same grid
# (constructed only; call random_search.fit(X_train, y_train) to run it)
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    n_iter=20,
    cv=5,
    random_state=42
)

In [3]:
# CatBoost
# The top-of-notebook catboost import is commented out, so import the classifier here.
from catboost import CatBoostClassifier

cb_params = dict(
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    l2_leaf_reg=3.0,
    verbose=200,
    early_stopping_rounds=200,
    task_type="CPU"  # switch to "GPU" if one is available
)

# Unpack the dict into keyword arguments. Passed positionally, the whole dict would be
# bound to the constructor's first parameter instead of configuring the model.
model = CatBoostClassifier(**cb_params)

# Fit on the training split, monitoring the validation split; use_best_model=True keeps
# the iteration that scored best on eval_set.
model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

# Оценка качества моделей

In [None]:
# Classification metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

# Predicted labels for the held-out set
y_pred = model.predict(X_test)

# Headline metrics; precision / recall / F1 use support-weighted averaging over classes
avg_kwargs = {"average": 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **avg_kwargs)
recall = recall_score(y_test, y_pred, **avg_kwargs)
f1 = f1_score(y_test, y_pred, **avg_kwargs)

print(f"Точность: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

# Per-class breakdown
print(classification_report(y_test, y_pred))

# Confusion matrix (computed and stored, not displayed)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Regression metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# NOTE(review): assumes `model` is a fitted regressor and y_test a continuous target at
# this point — the last `model` assigned above is a classifier; confirm before running.
y_pred_reg = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_reg)
mae = mean_absolute_error(y_test, y_pred_reg)
r2 = r2_score(y_test, y_pred_reg)

print(f"MSE: {mse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R²: {r2:.3f}")

In [None]:
# ROC-AUC on the validation split, scored with the second column of predict_proba
val_proba = model.predict_proba(X_val)
val_pred = val_proba[:, 1]
val_auc = roc_auc_score(y_true=y_val, y_score=val_pred)
print(f"Validation ROC-AUC: {val_auc:.4f}")

In [None]:
# Делаем предсказания

In [None]:
# Формируем сабмит

In [None]:
# Сохранение и загрузка моделей
# import joblib
# import pickle

# # Сохранение с помощью joblib (рекомендуется)
# joblib.dump(model, 'model.pkl')
# loaded_model = joblib.load('model.pkl')

# # Сохранение с помощью pickle
# with open('model.pkl', 'wb') as f:
#     pickle.dump(model, f)

# with open('model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)