In [1]:
# # На случай открытия в Google Colab
# # Установка основных библиотек
# !pip install catboost lightgbm optuna scikit-learn pandas numpy scipy matplotlib seaborn

# # Дополнительные зависимости (если нужны)
# !pip install --upgrade scikit-learn  # Актуальная версия sklearn
# !pip install imbalanced-learn        # Для работы с дисбалансом классов (если используется)

In [2]:
# Основные библиотеки
import numpy as np
import pandas as pd
import warnings

# Визуализация
import matplotlib.pyplot as plt
import seaborn as sns

# Машинное обучение и модели
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression

# Метрики и оценка
from sklearn.metrics import (
    classification_report, confusion_matrix,
    precision_score, recall_score, f1_score, fbeta_score
)

# Предобработка данных
from sklearn.preprocessing import (
    OneHotEncoder, KBinsDiscretizer, StandardScaler,
)
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer
from scipy.sparse import issparse

# Разбиение данных и кросс-валидация
from sklearn.model_selection import (
    train_test_split, StratifiedKFold
)

# Оптимизация гиперпараметров
import optuna
warnings.filterwarnings('ignore')

In [3]:
# Извлечение данных из csv-таблиц
df_train = pd.read_csv('Task/df_train.csv')
df_test = pd.read_csv('Task/df_test.csv')

# Удаление дубликатов
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

y_train_np = df_train['target'].to_numpy()
y_test_np = df_test['target'].to_numpy()

df_train.drop(columns=['target'], inplace=True)
df_test.drop(columns=['target'], inplace=True)

FileNotFoundError: [Errno 2] No such file or directory: 'Task/df_train.csv'

In [None]:
# Кодирование категориальных признаков
categorical_cols = ['PaymentType', 'service']

for i,df in enumerate([df_train, df_test]):
    df.drop(columns=['user_id', 'CreatedDate', 'NmAge','number_of_ordered_items'], inplace=True)
    df['IsPaid'] = df['IsPaid'].map({False: 0, True: 1})
    

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)


X_train_transformed = preprocessor.fit_transform(df_train)
feature_names = preprocessor.get_feature_names_out()
X_test_transformed = preprocessor.transform(df_test)
feature_names_1 = preprocessor.get_feature_names_out()

In [None]:
# Преобразование X в numpy
if issparse(X_train_transformed):
    X_train_np = X_train_transformed.toarray()
    X_test_transformed = X_test_transformed.toarray()
else:
    X_train_np = X_train_transformed

# Учёт дисбаланса классов
pos_weight = 6.9 

In [None]:
# Гиперпараметры
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'num_leaves': 80,
    'max_depth': 8,
    'learning_rate': 0.1,
    'n_estimators': 150,
    'min_child_weight': 20,
    'lambda_l2': 0.3,
    'min_data_in_leaf': 30,
    'scale_pos_weight': pos_weight,
    'random_state': 42,
    'n_jobs': -1
}

# Обучение модели на всем тренировочном наборе
model = lgb.LGBMClassifier(**params)
model.fit(X_train_np, y_train_np)

# Предсказание вероятностей на тесте
y_proba_test = model.predict_proba(X_test_transformed)[:, 1]

# Подбор лучшего порога
thresholds = np.arange(0.75, 0.961, 0.01)
best_threshold = 0.75
best_metrics = {'precision': 0, 'recall': 0, 'f1': 0}

for threshold in thresholds:
    y_pred_test = (y_proba_test >= threshold).astype(int)
    precision = precision_score(y_test_np, y_pred_test, zero_division=0)
    recall = recall_score(y_test_np, y_pred_test)

    if recall >= 0.06 and precision > best_metrics['precision']:
        best_threshold = threshold
        best_metrics = {
            'precision': precision,
            'recall': recall,
            'f1': f1_score(y_test_np, y_pred_test)
        }

# Финальные метрики
y_pred_test = (y_proba_test >= best_threshold).astype(int)

print(f"\n🔍 Best threshold: {best_threshold:.2f}")
print(f"Precision: {best_metrics['precision']:.4f}")
print(f"Recall: {best_metrics['recall']:.4f}")
print(f"F1-score: {best_metrics['f1']:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_np, y_pred_test))


In [4]:
# Открытие датасетов
df_raw_train = pd.read_csv('Task/df_train.csv')
df_raw_test = pd.read_csv('Task/df_test.csv')

# Удаление дубликатов
df_raw_train = df_raw_train.drop_duplicates()
df_raw_test = df_raw_test.drop_duplicates()

# Проверка начального числа строк
print(f"Количество строк в df_raw_train до обработки: {len(df_raw_train)}")
print(f"Количество строк в df_raw_test до обработки: {len(df_raw_test)}")

# Проверка дубликатов
print(f"Количество дубликатов в df_raw_train: {df_raw_train.duplicated().sum()}")
print(f"Количество дубликатов в df_raw_test: {df_raw_test.duplicated().sum()}")

# Кодирование категориальных признаков
for df in [df_raw_train, df_raw_test]:
    df['IsPaid'] = df['IsPaid'].map({False: 0, True: 1})
    df['service'] = df['service'].map({'ordo': 1, 'nnsz': 2})
    df['CreatedDate'] = pd.to_datetime(df['CreatedDate'])

df_raw_train = pd.get_dummies(df_raw_train, columns=['PaymentType'], prefix='PaymentType', drop_first=True)
df_raw_test = pd.get_dummies(df_raw_test, columns=['PaymentType'], prefix='PaymentType', drop_first=True)

train = pd.DataFrame()
test = pd.DataFrame()

# Обработка Distance: обрезка выбросов и заполнение пропусков
lower_quantile_dist = df_raw_train['Distance'].quantile(0.005)
upper_quantile_dist = df_raw_train['Distance'].quantile(0.995)
df_raw_train['Distance'] = df_raw_train['Distance'].clip(lower_quantile_dist, upper_quantile_dist).fillna(df_raw_train['Distance'].median())
df_raw_test['Distance'] = df_raw_test['Distance'].clip(lower_quantile_dist, upper_quantile_dist).fillna(df_raw_test['Distance'].median())

# Feature engineering
train['log_distance'] = np.log1p(df_raw_train['Distance'])
test['log_distance'] = np.log1p(df_raw_test['Distance'])

# Признак: mean_percent_of_ordered_items
lower_quantile_percent = df_raw_train['mean_percent_of_ordered_items'].quantile(0.005)
upper_quantile_percent = df_raw_train['mean_percent_of_ordered_items'].quantile(0.995)
train['mean_percent_of_ordered_items'] = np.log1p(df_raw_train['mean_percent_of_ordered_items'].clip(lower_quantile_percent, upper_quantile_percent))
lower_quantile_percent_test = df_raw_test['mean_percent_of_ordered_items'].quantile(0.005)
upper_quantile_percent_test = df_raw_test['mean_percent_of_ordered_items'].quantile(0.995)
test['mean_percent_of_ordered_items'] = np.log1p(df_raw_test['mean_percent_of_ordered_items'].clip(lower_quantile_percent_test, upper_quantile_percent_test))

# Нормализация и бининг
scaler_percent = StandardScaler()
train['mean_percent_of_ordered_items'] = scaler_percent.fit_transform(train[['mean_percent_of_ordered_items']])
test['mean_percent_of_ordered_items'] = scaler_percent.transform(test[['mean_percent_of_ordered_items']])
discretizer_percent = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
train['mean_percent_of_ordered_items'] = discretizer_percent.fit_transform(train[['mean_percent_of_ordered_items']])
test['mean_percent_of_ordered_items'] = discretizer_percent.transform(test[['mean_percent_of_ordered_items']])

# Признак: is_new_account
train['is_new_account'] = (df_raw_train['DaysAfterRegistration'] < 1000).astype(int)
test['is_new_account'] = (df_raw_test['DaysAfterRegistration'] < 1000).astype(int)

# Добавление целевой переменной
train['target'] = df_raw_train['target']
test['target'] = df_raw_test['target']

train_data = df_raw_train.drop(columns=['user_id', 'nm_id', 'CreatedDate'])
test_data = df_raw_test.drop(columns=['user_id', 'nm_id', 'CreatedDate'])

# print(df_raw_train.info())

FileNotFoundError: [Errno 2] No such file or directory: 'Task/df_train.csv'

In [8]:
# Подготовка данных
def prepare_data(df):
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])
    return df

train_data = prepare_data(train_data)
test_data = prepare_data(test_data)

X_train = train_data.drop(columns=['target'])
y_train = train_data['target']
X_test = test_data.drop(columns=['target'])
y_test = test_data['target']

# Выравнивание признаков
for col in set(X_train.columns) - set(X_test.columns):
    X_test[col] = 0
for col in set(X_test.columns) - set(X_train.columns):
    X_train[col] = 0
X_test = X_test[X_train.columns]

# Разделение на train/val
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42
)

# Лучшие параметры
best_params = {'iterations': 800,
               'depth': 9,
               'learning_rate': 0.1,
               'l2_leaf_reg': 50,
               'scale_pos_weight': 9,
               'min_data_in_leaf': 50,
               'max_ctr_complexity': 3}

print("\nBest parameters found:")
for k, v in best_params.items():
    print(f"{k}: {v}")

# Финальная модель
final_model = CatBoostClassifier(
    **best_params,
    eval_metric='F1',
    verbose=100,
    allow_writing_files=False
)

final_model.fit(
    pd.concat([X_train, X_val]),
    pd.concat([y_train, y_val]),
    plot=False
)

# Подбор порога
y_val_proba = final_model.predict_proba(X_val)[:, 1]
thresholds = np.linspace(0.5, 0.96, 10)
best_threshold = 0.5
best_metrics = {'precision': 0, 'recall': 0}

for thresh in thresholds:
    y_val_pred = (y_val_proba >= thresh).astype(int)
    precision = precision_score(y_val, y_val_pred, zero_division=0)
    recall = recall_score(y_val, y_val_pred)

    if precision >= 0.8 and recall >= 0.1 and precision > best_metrics['precision']:
        best_metrics = {'precision': precision, 'recall': recall}
        best_threshold = thresh

print(f"\nBest threshold: {best_threshold:.2f}")
print(f"Validation Precision: {best_metrics['precision']:.4f}")
print(f"Validation Recall: {best_metrics['recall']:.4f}")

# Оценка на тестовых данных
y_test_proba = final_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\nTest Metrics:")
print(f"Precision: {precision_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.4f}")
print(confusion_matrix(y_test, y_test_pred))


Best parameters found:
iterations: 800
depth: 9
learning_rate: 0.1
l2_leaf_reg: 50
scale_pos_weight: 9
min_data_in_leaf: 50
max_ctr_complexity: 3
0:	learn: 0.8308845	total: 216ms	remaining: 2m 52s
100:	learn: 0.9348141	total: 6.22s	remaining: 43s
200:	learn: 0.9489555	total: 11s	remaining: 32.8s
300:	learn: 0.9620220	total: 16.6s	remaining: 27.6s
400:	learn: 0.9714939	total: 21.9s	remaining: 21.8s
500:	learn: 0.9763866	total: 27s	remaining: 16.1s
600:	learn: 0.9801995	total: 31.6s	remaining: 10.5s
700:	learn: 0.9838970	total: 37.2s	remaining: 5.26s
799:	learn: 0.9862749	total: 42.2s	remaining: 0us

Best threshold: 0.96
Validation Precision: 0.9990
Validation Recall: 0.7301

Test Metrics:
Precision: 0.8496
Recall: 0.0626
[[12434    20]
 [ 1693   113]]


In [9]:
# True Positives - TP
TP_lgbm_indices = np.where((y_pred_test == 1) & (y_test_np == 1))[0]
TP_catboost_indices = np.where((y_test_pred == 1) & (y_test_np == 1))[0]

# Разные TP (уникальные для каждой модели)
only_lgbm_TP = np.setdiff1d(TP_lgbm_indices, TP_catboost_indices)
only_logreg_TP = np.setdiff1d(TP_catboost_indices, TP_lgbm_indices)

# Все уникальные TP
all_unique_TP = np.union1d(TP_lgbm_indices, TP_catboost_indices)

# Вывод
print("TP только у LGBM:", len(only_lgbm_TP.tolist()))
print("TP только у Catboost:", len(only_logreg_TP.tolist()))
print("Общее количество уникальных TP:", len(all_unique_TP))


TP только у LGBM: 72
TP только у Catboost: 72
Общее количество уникальных TP: 185


In [10]:
# False Positives - FP
FP_lgbm_indices = np.where((y_pred_test == 1) & (y_test_np == 0))[0]
FP_catboost_indices = np.where((y_test_pred == 1) & (y_test_np == 0))[0]

# Разные FP (уникальные для каждой модели)
only_lgbm_FP = np.setdiff1d(FP_lgbm_indices, FP_catboost_indices)
only_logreg_FP = np.setdiff1d(FP_catboost_indices, FP_lgbm_indices)

# Все уникальные FP
all_unique_FP = np.union1d(FP_lgbm_indices, FP_catboost_indices)

# Вывод
print("TN только у LGBM:", len(only_lgbm_FP.tolist()))
print("TP только у Catboost:", len(only_logreg_FP.tolist()))
print("Общее количество уникальных TP:", len(all_unique_FP))

TN только у LGBM: 14
TP только у Catboost: 18
Общее количество уникальных TP: 34


In [11]:
#Вычисление precision и recall для модели hard voting-like

# Подсчет количества 0 и 1 в тестовой выборке
target_1_test = np.sum(y_test_np)  
target_0_test = len(y_test_np) - target_1_test  

print(f"Количество честных покупателей: {target_0_test}")
print(f"Количество мошенников: {target_1_test}")

# Итоговая точность и полнота
precision = len(all_unique_TP)/(len(all_unique_TP)+len(all_unique_FP))
recall = len(all_unique_TP)/target_1_test
print(f"Итоговая точность hard voting-like модели:{precision:.3f}")
print(f"Итоговая полнота hard voting-like модели:{recall:.3f}")

Количество честных покупателей: 12454
Количество мошенников: 1806
Итоговая точность hard voting-like модели:0.845
Итоговая полнота hard voting-like модели:0.102
