### Урок 5. Feature Engineering, Feature Selection, part I

### -- Автор: Шенк Евгений Станиславович

### Домашнее задание 5:
Продолжим работу с данными, которые были использованы в ДЗ2 и 3, продолжим решать задачу обнаружения мошеннических транзакций, что позволит получить полное решение задачи / полный пайплайн.  

Задание 0: выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.  

Задание 1: признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.  

Задание 2: сделать конкатенацию признаков  
* card1 + card2;  
* card1 + card2 + card_3 + card_5;  
* card1 + card2 + card_3 + card_5 + addr1 + addr2  

Рассматривать их как категориальных признаки.  

Задание 3: Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.  

Задание 4: Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.  

Задание 5: Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.  

Задание 6: выделить дробную часть и целую часть признака TransactionAmt в два отдельных признака. После создать отдельных признак - логарифм от TransactionAmt  

Задание 7 (опция): выполнить предварительную подготовку / очистку признаков P_emaildomain и R_emaildomain (что и как делать - остается на ваше усмотрение) и сделать Frequency Encoding для очищенных признаков.  

#### Выводы и результаты вконце нотбука

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List, Tuple, Optional

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, cross_val_score, GroupShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
pd.options.display.max_columns = 400

### Загрузка датасета

In [3]:
data = pd.read_csv("../data/assignment_2_train.csv")
lb_dataset = pd.read_csv("../data/assignment_2_test.csv")

print("data.shape = {} rows, {} cols".format(*data.shape))
print("lb_dataset.shape = {} rows, {} cols".format(*lb_dataset.shape))

data.shape = 180000 rows, 394 cols
lb_dataset.shape = 100001 rows, 394 cols


In [4]:
data.sort_values(by='TransactionID', ascending=True, inplace=True)

In [5]:
data.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,


In [6]:
lb_dataset.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3287000,1,7415038,226.0,W,12473,555.0,150.0,visa,226.0,credit,299.0,87.0,116.0,,aol.com,,2.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,3.0,2.0,6.0,2.0,4.0,4.0,0.0,4.0,3.0,,,,,4.0,4.0,,,,3.0,T,T,F,M0,T,F,F,F,T,1.0,2.0,2.0,1.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,7.0,7.0,0.0,1.0,1.0,2.0,6.0,6.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,452.0,1482.0,1482.0,0.0,206.0,206.0,452.0,1276.0,1276.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,14.0,7.0,9.0,15.0,0.0,2.0,0.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,12.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,452.0,2924.0,2924.0,0.0,412.0,0.0,412.0,206.0,412.0,412.0,452.0,2512.0,2512.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
1,3287001,0,7415054,3072.0,W,15651,417.0,150.0,visa,226.0,debit,330.0,87.0,,,yahoo.com,,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,0.0,0.0,0.0,,,,,0.0,,,,,0.0,,,,,,T,,,,,,,,,,,,,,,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,3059.949951,3059.949951,3059.949951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3059.949951,3059.949951,3059.949951,,,,,,,,,,,,,,,,,,


In [7]:
numerical_features = data.drop(['isFraud'], axis=1).select_dtypes(include=[np.number]).columns
categorical_features = data.select_dtypes(include=[np.object]).columns

In [8]:
params = {
    "eval_metric": "auc",
    "verbose": 50,
    "early_stopping_rounds": 25,
}

In [9]:
def do_train_test_split(data: pd.DataFrame, numerical_features: list):
    x_train, x_valid = train_test_split(
        data.drop(['isFraud'], axis=1), train_size=0.6, shuffle=False,
    )
    y_train, y_valid = train_test_split(
        data["isFraud"], train_size=0.6, shuffle=False,
    )

    x_train = x_train[numerical_features]
    x_valid = x_valid[numerical_features]

    x_valid, x_test = train_test_split(
        x_valid, train_size=0.6, shuffle=False,
    )
    y_valid, y_test = train_test_split(
        y_valid, train_size=0.6, shuffle=False,
    )

    print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
    print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
    print("x_test.shape = {} rows, {} cols".format(*x_test.shape))
    
    return x_train, x_valid, x_test, y_train, y_valid, y_test

In [10]:
def make_cross_validation(X: pd.DataFrame,
                          y: pd.Series,
                          estimator: object,
                          params: dict,
                          metric: callable,
                          cv_strategy):
    """
    Кросс-валидация.

    Parameters
    ----------
    X: pd.DataFrame
        Матрица признаков.

    y: pd.Series
        Вектор целевой переменной.

    estimator: callable
        Объект модели для обучения.
        
    paprams: dict
        Параметры модели

    metric: callable
        Метрика для оценки качества решения.
        Ожидается, что на вход будет передана функция,
        которая принимает 2 аргумента: y_true, y_pred.

    cv_strategy: cross-validation generator
        Объект для описания стратегии кросс-валидации.
        Ожидается, что на вход будет передан объект типа
        KFold или StratifiedKFold.

    Returns
    -------
    oof_score: float
        Значение метрики качества на OOF-прогнозах.

    fold_train_scores: List[float]
        Значение метрики качества на каждом обучающем датасете кросс-валидации.

    fold_valid_scores: List[float]
        Значение метрики качества на каждом валидационном датасете кросс-валидации.

    oof_predictions: np.array
        Прогнозы на OOF.

    """
    estimators, fold_train_scores, fold_valid_scores = [], [], []
    oof_predictions = np.zeros(X.shape[0])

    for fold_number, (train_idx, valid_idx) in enumerate(cv_strategy.split(X, y)):
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]

        estimator.fit(x_train, y_train, 
                      eval_set=(x_valid, y_valid),
                      **params)
        y_train_pred = estimator.predict_proba(x_train)[:,1]
        y_valid_pred = estimator.predict_proba(x_valid)[:,1]

        fold_train_scores.append(metric(y_train, y_train_pred))
        fold_valid_scores.append(metric(y_valid, y_valid_pred))
        oof_predictions[valid_idx] = y_valid_pred

        msg = (
            f"Fold: {fold_number+1}, train-observations = {len(train_idx)}, "
            f"valid-observations = {len(valid_idx)}\n"
            f"train-score = {round(fold_train_scores[fold_number], 4)}, "
            f"valid-score = {round(fold_valid_scores[fold_number], 4)}" 
        )
        print(msg)
        print("="*69)
        estimators.append(estimator)

    oof_score = metric(y, oof_predictions)
    print(f"CV-results train: {round(np.mean(fold_train_scores), 4)} +/- {round(np.std(fold_train_scores), 3)}")
    print(f"CV-results valid: {round(np.mean(fold_valid_scores), 4)} +/- {round(np.std(fold_valid_scores), 3)}")
    print(f"OOF-score = {round(oof_score, 4)}")

    return estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions

In [11]:
def calc_val_results(estimator: object):
    train_score = roc_auc_score(y_train, estimator.predict_proba(x_train)[:,1])
    print(f'Train roc_auc_score: {train_score}')
    valid_score = roc_auc_score(y_valid, estimator.predict_proba(x_valid)[:,1])
    print(f'Valid roc_auc_score: {valid_score}')
    test_score = roc_auc_score(y_test, estimator.predict_proba(x_test)[:,1])
    print(f'Test roc_auc_score:  {test_score}')
    lb_score = roc_auc_score(lb_dataset['isFraud'], estimator.predict_proba(lb_dataset_prep)[:,1])
    print(f'LB roc_auc_score:    {lb_score}')
    return(train_score, valid_score, test_score, lb_score)

### Задание 0. Валидация

In [12]:
data.sort_values(by='TransactionDT', ascending=True, inplace=True)

In [13]:
x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data, numerical_features)

x_train.shape = 108000 rows, 379 cols
x_valid.shape = 43200 rows, 379 cols
x_test.shape = 28800 rows, 379 cols


In [14]:
set(x_train["TransactionID"].unique()) & (set(x_valid["TransactionID"].unique()))

set()

In [15]:
x_train = x_train.drop(['TransactionID', 'TransactionDT'], axis=1)
total_features = x_train.columns

x_valid = x_valid[total_features]
x_test = x_test[total_features]

In [16]:
model_lgb_0 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
### Model for CV
model_0 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [17]:
model_lgb_0.fit(x_train, y_train, 
                eval_set=(x_valid, y_valid),
                **params)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.867539	valid_0's binary_logloss: 0.0912127
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.870411	valid_0's binary_logloss: 0.0898365


LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [18]:
cv_strategy = StratifiedKFold(n_splits=5, random_state=2177)

estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    x_train, y_train, model_0, params, metric=roc_auc_score, cv_strategy=cv_strategy
)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.835678	valid_0's binary_logloss: 0.0909029
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.841219	valid_0's binary_logloss: 0.0894202
Fold: 1, train-observations = 86400, valid-observations = 21600
train-score = 0.8974, valid-score = 0.8412
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.86584	valid_0's binary_logloss: 0.0830425
Did not meet early stopping. Best iteration is:
[69]	valid_0's auc: 0.870402	valid_0's binary_logloss: 0.0817001
Fold: 2, train-observations = 86400, valid-observations = 21600
train-score = 0.892, valid-score = 0.8704
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.883105	valid_0's binary_logloss: 0.0780251
Did not meet early stopping. Best iteration is:
[56]	valid_0's auc: 0.883837	valid_0's binary_logloss: 0.0776801
Fold: 3, train-observations = 86400, valid-observations = 21600
train-s

In [19]:
print(f'{np.mean(fold_valid_scores) - np.std(fold_valid_scores)}, {np.mean(fold_valid_scores) + np.std(fold_valid_scores)}')

0.8529398896736208, 0.8857424257685306


In [20]:
lb_dataset_prep = lb_dataset.copy()
lb_dataset_prep = lb_dataset_prep[total_features]

In [21]:
task_0_results = calc_val_results(model_lgb_0)

Train roc_auc_score: 0.8918544982629307
Valid roc_auc_score: 0.8704107786656701
Test roc_auc_score:  0.8552928941748364
LB roc_auc_score:    0.8530810831615069


### Задание 1: признак TransactionDT 

In [22]:
def conv_to_datetime(x, base_date='2017-12-01'):
    base_date = pd.to_datetime(base_date)
    return base_date + pd.Timedelta(x, unit='seconds')

In [23]:
def features_transform_1(df):
    df['TransactionDT'] = df['TransactionDT'].apply(lambda x: conv_to_datetime(x))

    df["year"] = df["TransactionDT"].dt.year
    df["month_of_year"] = df["TransactionDT"].dt.month
    df["day_of_week"] = df["TransactionDT"].dt.weekday
    df["hour_of_day"] = df["TransactionDT"].dt.hour
    df["day_of_month"] = df["TransactionDT"].dt.day

In [24]:
data_1 = data.copy()

In [25]:
features_transform_1(data_1)

In [26]:
data_1.head(3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,year,month_of_year,day_of_week,hour_of_day,day_of_month
0,2987000,0,2017-12-02 00:00:00,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,2017,12,5,0,2
1,2987001,0,2017-12-02 00:00:01,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,2017,12,5,0,2
2,2987002,0,2017-12-02 00:01:09,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0,T,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,2017,12,5,0,2


In [27]:
numerical_features = data_1.drop(['isFraud'], axis=1).select_dtypes(include=[np.number]).columns

In [28]:
x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data_1, numerical_features)

x_train.shape = 108000 rows, 383 cols
x_valid.shape = 43200 rows, 383 cols
x_test.shape = 28800 rows, 383 cols


In [29]:
x_train = x_train.drop(['TransactionID'], axis=1)
total_features = x_train.columns

x_valid = x_valid[total_features]
x_test = x_test[total_features]

In [30]:
model_lgb_1 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
### Model for CV
model_1 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [31]:
model_lgb_1.fit(x_train, y_train, 
                eval_set=(x_valid, y_valid),
                **params)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.868354	valid_0's binary_logloss: 0.0909625
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.871489	valid_0's binary_logloss: 0.0896932


LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [32]:
cv_strategy = StratifiedKFold(n_splits=5, random_state=2177)

estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    x_train, y_train, model_1, params, metric=roc_auc_score, cv_strategy=cv_strategy
)

Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.724767	valid_0's binary_logloss: 0.10656
Fold: 1, train-observations = 86400, valid-observations = 21600
train-score = 0.7828, valid-score = 0.7248
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[16]	valid_0's auc: 0.851487	valid_0's binary_logloss: 0.0892212
Fold: 2, train-observations = 86400, valid-observations = 21600
train-score = 0.8607, valid-score = 0.8515
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.868211	valid_0's binary_logloss: 0.0797164
Early stopping, best iteration is:
[44]	valid_0's auc: 0.880558	valid_0's binary_logloss: 0.0785868
Fold: 3, train-observations = 86400, valid-observations = 21600
train-score = 0.8815, valid-score = 0.8806
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.882829	valid_0's binary_logloss: 0.0758537
Did not 

In [33]:
print(f'{np.mean(fold_valid_scores) - np.std(fold_valid_scores)}, {np.mean(fold_valid_scores) + np.std(fold_valid_scores)}')

0.7811010772299153, 0.8988485889393768


In [34]:
lb_dataset_prep = lb_dataset.copy()
features_transform_1(lb_dataset_prep)
lb_dataset_prep = lb_dataset_prep[total_features]

In [36]:
task_1_results = calc_val_results(model_lgb_1)

Train roc_auc_score: 0.8925164170192197
Valid roc_auc_score: 0.8714885027861738
Test roc_auc_score:  0.8552142394526749
LB roc_auc_score:    0.8525793716314294


### Задание 2: сделать конкатенацию признаков
card1 + card2;  
card1 + card2 + card_3 + card_5;  
card1 + card2 + card_3 + card_5 + addr1 + addr2  

In [37]:
def features_transform_2(df):
    df[['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']] = df[['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']].astype('str')
    df["card_1_2"] = df['card1'] + " | " + df['card2']
    df["card_1_2_3_5"] = df['card1'] + " | " + df['card2'] + " | " + df['card3'] + " | " + df['card5']
    df["card_1_2_3_5_addr_1_2"] = df['card1'] + " | " + df['card2'] + " | " + df['card3'] + " | " + df['card5'] + " | "  + df['addr1'] + " | " + df['addr2']

In [38]:
data_2 = data.copy()

In [39]:
features_transform_2(data_2)

In [40]:
data_2.head(3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,card_1_2,card_1_2_3_5,card_1_2_3_5_addr_1_2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,13926 | nan,13926 | nan | 150.0 | 142.0,13926 | nan | 150.0 | 142.0 | 315.0 | 87.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,2755 | 404.0,2755 | 404.0 | 150.0 | 102.0,2755 | 404.0 | 150.0 | 102.0 | 325.0 | 87.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0,T,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,4663 | 490.0,4663 | 490.0 | 150.0 | 166.0,4663 | 490.0 | 150.0 | 166.0 | 330.0 | 87.0


In [41]:
numerical_features = data_2.drop(['isFraud'], axis=1).select_dtypes(include=[np.number]).columns
new_features = data_2[['card_1_2', 'card_1_2_3_5', 'card_1_2_3_5_addr_1_2']].columns

In [42]:
lb_dataset_prep = lb_dataset.copy()
features_transform_2(lb_dataset_prep)
lb_dataset_prep = lb_dataset_prep[numerical_features.union(new_features)]

In [43]:
o_enc = OrdinalEncoder()
global_dataframe = pd.concat([data_2, lb_dataset_prep])
o_enc.fit(global_dataframe[new_features])

OrdinalEncoder()

In [44]:
data_2[new_features] = o_enc.transform(data_2[new_features])
lb_dataset_prep[new_features] = o_enc.transform(lb_dataset_prep[new_features])

In [45]:
x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data_2, numerical_features.union(new_features))

x_train.shape = 108000 rows, 376 cols
x_valid.shape = 43200 rows, 376 cols
x_test.shape = 28800 rows, 376 cols


In [46]:
x_train = x_train.drop(['TransactionID', 'TransactionDT'], axis=1)
total_features = x_train.columns

x_valid = x_valid[total_features]
x_test = x_test[total_features]
lb_dataset_prep = lb_dataset_prep[total_features]

In [47]:
model_lgb_2 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
### Model for CV
model_2 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [48]:
model_lgb_2.fit(x_train, y_train, 
                eval_set=(x_valid, y_valid),
                **params)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.868856	valid_0's binary_logloss: 0.0919435
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.871387	valid_0's binary_logloss: 0.0905369


LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [49]:
cv_strategy = StratifiedKFold(n_splits=5, random_state=2177)

estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    x_train, y_train, model_2, params, metric=roc_auc_score, cv_strategy=cv_strategy
)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.839123	valid_0's binary_logloss: 0.0895693
Did not meet early stopping. Best iteration is:
[68]	valid_0's auc: 0.843088	valid_0's binary_logloss: 0.0883516
Fold: 1, train-observations = 86400, valid-observations = 21600
train-score = 0.8943, valid-score = 0.8431
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.865254	valid_0's binary_logloss: 0.0825682
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.868869	valid_0's binary_logloss: 0.0812701
Fold: 2, train-observations = 86400, valid-observations = 21600
train-score = 0.8906, valid-score = 0.8689
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.885563	valid_0's binary_logloss: 0.0780071
Did not meet early stopping. Best iteration is:
[55]	valid_0's auc: 0.887085	valid_0's binary_logloss: 0.0775226
Fold: 3, train-observations = 86400, valid-observations = 21600
train

In [50]:
print(f'{np.mean(fold_valid_scores) - np.std(fold_valid_scores)}, {np.mean(fold_valid_scores) + np.std(fold_valid_scores)}')

0.8537500295395397, 0.886266470777078


In [53]:
task_2_results = calc_val_results(model_lgb_2)

Train roc_auc_score: 0.886997937812125
Valid roc_auc_score: 0.8713869097182022
Test roc_auc_score:  0.8501282395993108
LB roc_auc_score:    0.8544888707996221


### Задание 3: Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.  

In [54]:
def features_transform_3(df, feature, freq_encoder=None):
    df[feature] = df[feature].astype('str')
    if freq_encoder is None:
        freq_encoder = df[feature].value_counts(normalize=True)
    df[feature] = df[feature].map(freq_encoder)
    
    return freq_encoder

In [55]:
data_3 = data.copy()
lb_dataset_prep = lb_dataset.copy()

In [56]:
data_3.head(3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0,T,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,


In [57]:
feat_list = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']
for feat in feat_list:
    freq_encoder = features_transform_3(data_3, feat)
    features_transform_3(lb_dataset_prep, feat, freq_encoder)

In [58]:
numerical_features = data_3.drop(['isFraud'], axis=1).select_dtypes(include=[np.number]).columns

In [59]:
x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data_3, numerical_features)

x_train.shape = 108000 rows, 381 cols
x_valid.shape = 43200 rows, 381 cols
x_test.shape = 28800 rows, 381 cols


In [60]:
x_train = x_train.drop(['TransactionID', 'TransactionDT'], axis=1)
total_features = x_train.columns

x_valid = x_valid[total_features]
x_test = x_test[total_features]
lb_dataset_prep = lb_dataset_prep[total_features]

In [61]:
model_lgb_3 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
### Model for CV
model_3 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [62]:
model_lgb_3.fit(x_train, y_train, 
                eval_set=(x_valid, y_valid),
                **params)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.869441	valid_0's binary_logloss: 0.0913306
Did not meet early stopping. Best iteration is:
[69]	valid_0's auc: 0.871227	valid_0's binary_logloss: 0.090284


LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [63]:
cv_strategy = StratifiedKFold(n_splits=5, random_state=2177)

estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    x_train, y_train, model_3, params, metric=roc_auc_score, cv_strategy=cv_strategy
)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.843588	valid_0's binary_logloss: 0.0890969
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.850244	valid_0's binary_logloss: 0.0871563
Fold: 1, train-observations = 86400, valid-observations = 21600
train-score = 0.8956, valid-score = 0.8502
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.87047	valid_0's binary_logloss: 0.0816864
Did not meet early stopping. Best iteration is:
[68]	valid_0's auc: 0.872158	valid_0's binary_logloss: 0.080204
Fold: 2, train-observations = 86400, valid-observations = 21600
train-score = 0.8911, valid-score = 0.8722
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.876131	valid_0's binary_logloss: 0.0782438
Early stopping, best iteration is:
[42]	valid_0's auc: 0.885461	valid_0's binary_logloss: 0.0776928
Fold: 3, train-observations = 86400, valid-observations = 21600
train-score = 0.8794

In [64]:
print(f'{np.mean(fold_valid_scores) - np.std(fold_valid_scores)}, {np.mean(fold_valid_scores) + np.std(fold_valid_scores)}')

0.8591335865995247, 0.887135062890231


In [65]:
task_3_results = calc_val_results(model_lgb_3)

Train roc_auc_score: 0.8891287089638437
Valid roc_auc_score: 0.8712269390185771
Test roc_auc_score:  0.8542021475915836
LB roc_auc_score:    0.8581091745036393


### Задание 4

In [66]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".upper() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [67]:
data_4 = data_2.copy()
lb_dataset_prep = lb_dataset.copy()

In [68]:
features_transform_2(lb_dataset_prep)
lb_dataset_prep[new_features] = o_enc.transform(lb_dataset_prep[new_features])

In [69]:
aggs = {
    "TransactionAmt": [np.mean, np.std]
}
feat_list = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'card_1_2', 'card_1_2_3_5', 'card_1_2_3_5_addr_1_2']

for feat in feat_list:
    stats = create_numerical_aggs(
        data_4, groupby_id=feat, aggs=aggs, prefix="", suffix=f"_by_{feat}"
    )
    data_4 = data_4.merge(stats, how='left', on=feat)
    lb_dataset_prep = lb_dataset_prep.merge(stats, how='left', on=feat)

In [70]:
numerical_features = data_4.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1).select_dtypes(include=[np.number]).columns

In [71]:
lb_dataset_prep = lb_dataset_prep[numerical_features]

In [72]:
data_4.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,...,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,card_1_2,card_1_2_3_5,card_1_2_3_5_addr_1_2,TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_STD_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD2,TRANSACTIONAMT_STD_BY_CARD2,TRANSACTIONAMT_MEAN_BY_CARD3,TRANSACTIONAMT_STD_BY_CARD3,TRANSACTIONAMT_MEAN_BY_CARD4,TRANSACTIONAMT_STD_BY_CARD4,TRANSACTIONAMT_MEAN_BY_CARD5,TRANSACTIONAMT_STD_BY_CARD5,TRANSACTIONAMT_MEAN_BY_CARD6,TRANSACTIONAMT_STD_BY_CARD6,TRANSACTIONAMT_MEAN_BY_ADDR1,TRANSACTIONAMT_STD_BY_ADDR1,TRANSACTIONAMT_MEAN_BY_ADDR2,TRANSACTIONAMT_STD_BY_ADDR2,TRANSACTIONAMT_MEAN_BY_CARD_1_2,TRANSACTIONAMT_STD_BY_CARD_1_2,TRANSACTIONAMT_MEAN_BY_CARD_1_2_3_5,TRANSACTIONAMT_STD_BY_CARD_1_2_3_5,TRANSACTIONAMT_MEAN_BY_CARD_1_2_3_5_ADDR_1_2,TRANSACTIONAMT_STD_BY_CARD_1_2_3_5_ADDR_1_2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,2883.0,2939.0,7794.0,193.227273,167.115733,195.077321,329.849448,140.340757,216.977741,220.5082,340.817021,123.38449,116.506517,169.667189,260.910712,133.110424,237.894612,140.836345,217.497586,296.375,235.958109,296.375,235.958109,68.5,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,6780.0,6913.0,18578.0,229.58808,413.469295,198.8001,294.181782,140.340757,216.977741,126.019069,205.33334,190.203414,290.383922,169.667189,260.910712,149.334343,228.947057,140.836345,217.497586,229.58808,413.469295,229.58808,413.469295,226.809524,270.054718


In [73]:
x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data_4, numerical_features)

x_train.shape = 108000 rows, 396 cols
x_valid.shape = 43200 rows, 396 cols
x_test.shape = 28800 rows, 396 cols


In [74]:
model_lgb_4 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
### Model for CV
model_4 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [75]:
model_lgb_4.fit(x_train, y_train, 
                eval_set=(x_valid, y_valid),
                **params)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.87206	valid_0's binary_logloss: 0.0911987
Did not meet early stopping. Best iteration is:
[69]	valid_0's auc: 0.876047	valid_0's binary_logloss: 0.0895871


LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [76]:
cv_strategy = StratifiedKFold(n_splits=5, random_state=2177)

estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    x_train, y_train, model_4, params, metric=roc_auc_score, cv_strategy=cv_strategy
)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.846587	valid_0's binary_logloss: 0.0877865
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.85139	valid_0's binary_logloss: 0.0863833
Fold: 1, train-observations = 86400, valid-observations = 21600
train-score = 0.9001, valid-score = 0.8514
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.870886	valid_0's binary_logloss: 0.081225
Did not meet early stopping. Best iteration is:
[68]	valid_0's auc: 0.875783	valid_0's binary_logloss: 0.0796353
Fold: 2, train-observations = 86400, valid-observations = 21600
train-score = 0.8942, valid-score = 0.8758
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.88862	valid_0's binary_logloss: 0.0766592
Did not meet early stopping. Best iteration is:
[59]	valid_0's auc: 0.889455	valid_0's binary_logloss: 0.0760438
Fold: 3, train-observations = 86400, valid-observations = 21600
train-sc

In [77]:
print(f'{np.mean(fold_valid_scores) - np.std(fold_valid_scores)}, {np.mean(fold_valid_scores) + np.std(fold_valid_scores)}')

0.8620837443535425, 0.8891947710063792


In [78]:
task_4_results = calc_val_results(model_lgb_4)

Train roc_auc_score: 0.8915953126972069
Valid roc_auc_score: 0.8760466366235351
Test roc_auc_score:  0.8579999222347496
LB roc_auc_score:    0.8587288036695903


### Задание 5

In [79]:
data_5 = data_2.copy()
lb_dataset_prep = lb_dataset.copy()

In [80]:
features_transform_2(lb_dataset_prep)
lb_dataset_prep[new_features] = o_enc.transform(lb_dataset_prep[new_features])

In [81]:
aggs = {
    "D15": [np.mean, np.std]
}
feat_list = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'card_1_2', 'card_1_2_3_5', 'card_1_2_3_5_addr_1_2']

for feat in feat_list:
    stats = create_numerical_aggs(
        data_5, groupby_id=feat, aggs=aggs, prefix="", suffix=f"_by_{feat}"
    )
    data_5 = data_5.merge(stats, how='left', on=feat)
    lb_dataset_prep = lb_dataset_prep.merge(stats, how='left', on=feat)

In [82]:
numerical_features = data_5.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1).select_dtypes(include=[np.number]).columns

In [83]:
lb_dataset_prep = lb_dataset_prep[numerical_features]

In [84]:
data_5.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,...,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,card_1_2,card_1_2_3_5,card_1_2_3_5_addr_1_2,D15_MEAN_BY_CARD1,D15_STD_BY_CARD1,D15_MEAN_BY_CARD2,D15_STD_BY_CARD2,D15_MEAN_BY_CARD3,D15_STD_BY_CARD3,D15_MEAN_BY_CARD4,D15_STD_BY_CARD4,D15_MEAN_BY_CARD5,D15_STD_BY_CARD5,D15_MEAN_BY_CARD6,D15_STD_BY_CARD6,D15_MEAN_BY_ADDR1,D15_STD_BY_ADDR1,D15_MEAN_BY_ADDR2,D15_STD_BY_ADDR2,D15_MEAN_BY_CARD_1_2,D15_STD_BY_CARD_1_2,D15_MEAN_BY_CARD_1_2_3_5,D15_STD_BY_CARD_1_2_3_5,D15_MEAN_BY_CARD_1_2_3_5_ADDR_1_2,D15_STD_BY_CARD_1_2_3_5_ADDR_1_2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,2883.0,2939.0,7794.0,0.4,0.547723,122.316408,166.482818,168.46659,186.337996,114.041667,171.186747,101.575758,142.176613,108.751902,171.249895,188.936614,194.069187,169.541518,186.33749,0.5,0.57735,0.5,0.57735,0.0,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,6780.0,6913.0,18578.0,114.811765,177.751006,123.450725,173.075974,168.46659,186.337996,139.496772,178.778516,110.602067,171.390409,108.751902,171.249895,195.737281,194.440057,169.541518,186.33749,114.811765,177.751006,114.811765,177.751006,59.75,123.056897


In [85]:
x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data_5, numerical_features)

x_train.shape = 108000 rows, 396 cols
x_valid.shape = 43200 rows, 396 cols
x_test.shape = 28800 rows, 396 cols


In [86]:
model_lgb_5 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
### Model for CV
model_5 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [87]:
model_lgb_5.fit(x_train, y_train, 
                eval_set=(x_valid, y_valid),
                **params)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.872651	valid_0's binary_logloss: 0.0909075
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.87505	valid_0's binary_logloss: 0.0895259


LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [88]:
cv_strategy = StratifiedKFold(n_splits=5, random_state=2177)

estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    x_train, y_train, model_5, params, metric=roc_auc_score, cv_strategy=cv_strategy
)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.844871	valid_0's binary_logloss: 0.0889628
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.851954	valid_0's binary_logloss: 0.0874218
Fold: 1, train-observations = 86400, valid-observations = 21600
train-score = 0.8994, valid-score = 0.852
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.868989	valid_0's binary_logloss: 0.0815925
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.873064	valid_0's binary_logloss: 0.0800349
Fold: 2, train-observations = 86400, valid-observations = 21600
train-score = 0.8954, valid-score = 0.8731
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.886759	valid_0's binary_logloss: 0.0771044
Did not meet early stopping. Best iteration is:
[56]	valid_0's auc: 0.889327	valid_0's binary_logloss: 0.076351
Fold: 3, train-observations = 86400, valid-observations = 21600
train-s

In [89]:
print(f'{np.mean(fold_valid_scores) - np.std(fold_valid_scores)}, {np.mean(fold_valid_scores) + np.std(fold_valid_scores)}')

0.8602324281419839, 0.8884034231266768


In [90]:
task_5_results = calc_val_results(model_lgb_5)

Train roc_auc_score: 0.89284227542457
Valid roc_auc_score: 0.8750502923387544
Test roc_auc_score:  0.8563446151820895
LB roc_auc_score:    0.8599493400148555


### Задание 6

In [91]:
def features_transform_6(df, feature):
    df[f'{feature}_int'] = df[feature] // 1
    df[f'{feature}_frc'] = df[feature] % 1
    df[f'{feature}_log'] = np.log(df[feature])

In [92]:
data_6 = data.copy()
lb_dataset_prep = lb_dataset.copy()

In [93]:
features_transform_6(data_6, 'TransactionAmt')
features_transform_6(lb_dataset_prep, 'TransactionAmt')

In [94]:
data_6.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,TransactionAmt_int,TransactionAmt_frc,TransactionAmt_log
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,68.0,0.5,4.226834
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,29.0,0.0,3.367296


In [95]:
numerical_features = data_6.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1).select_dtypes(include=[np.number]).columns

In [96]:
lb_dataset_prep = lb_dataset_prep[numerical_features]

In [97]:
x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data_6, numerical_features)

x_train.shape = 108000 rows, 380 cols
x_valid.shape = 43200 rows, 380 cols
x_test.shape = 28800 rows, 380 cols


In [98]:
model_lgb_6 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
### Model for CV
model_6 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [99]:
model_lgb_6.fit(x_train, y_train, 
                eval_set=(x_valid, y_valid),
                **params)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.869128	valid_0's binary_logloss: 0.0911204
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.871508	valid_0's binary_logloss: 0.0897777


LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [100]:
cv_strategy = StratifiedKFold(n_splits=5, random_state=2177)

estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    x_train, y_train, model_6, params, metric=roc_auc_score, cv_strategy=cv_strategy
)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.835648	valid_0's binary_logloss: 0.0909489
Did not meet early stopping. Best iteration is:
[69]	valid_0's auc: 0.841642	valid_0's binary_logloss: 0.0895364
Fold: 1, train-observations = 86400, valid-observations = 21600
train-score = 0.8984, valid-score = 0.8416
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.868474	valid_0's binary_logloss: 0.08214
Did not meet early stopping. Best iteration is:
[63]	valid_0's auc: 0.87107	valid_0's binary_logloss: 0.0813092
Fold: 2, train-observations = 86400, valid-observations = 21600
train-score = 0.8895, valid-score = 0.8711
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.881835	valid_0's binary_logloss: 0.0782296
Did not meet early stopping. Best iteration is:
[49]	valid_0's auc: 0.882744	valid_0's binary_logloss: 0.0782004
Fold: 3, train-observations = 86400, valid-observations = 21600
train-sc

In [101]:
print(f'{np.mean(fold_valid_scores) - np.std(fold_valid_scores)}, {np.mean(fold_valid_scores) + np.std(fold_valid_scores)}')

0.8531616025780131, 0.8857607558179983


In [102]:
task_6_results = calc_val_results(model_lgb_6)

Train roc_auc_score: 0.8913610034228286
Valid roc_auc_score: 0.8715079801659226
Test roc_auc_score:  0.854748632722982
LB roc_auc_score:    0.8547836908503139


### Задание 7

In [103]:
def features_transform_7(df, feature):
    df[f'{feature}_name'] = df[feature].str.split('.').str[0]
    df[f'{feature}_region'] = df[feature].str.split('.').str[-1]

In [104]:
data_7 = data.copy()
lb_dataset_prep = lb_dataset.copy()

In [105]:
features_transform_7(data_7, 'P_emaildomain')
features_transform_7(data_7, 'R_emaildomain')
features_transform_7(lb_dataset_prep, 'P_emaildomain')
features_transform_7(lb_dataset_prep, 'R_emaildomain')

In [106]:
feat_list = ['P_emaildomain_name', 'P_emaildomain_region', 'R_emaildomain_name', 'R_emaildomain_region']
for feat in feat_list:
    freq_encoder = features_transform_3(data_7, feat) # features_transform_3 - Frequency Encoding из задания 3
    features_transform_3(lb_dataset_prep, feat, freq_encoder)

In [107]:
data_7.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,P_emaildomain_name,P_emaildomain_region,R_emaildomain_name,R_emaildomain_region
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,0.158,0.158,0.665,0.665
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,0.373961,0.785067,0.665,0.665


In [108]:
numerical_features = data_7.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1).select_dtypes(include=[np.number]).columns

In [109]:
lb_dataset_prep = lb_dataset_prep[numerical_features]

In [110]:
x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data_7, numerical_features)

x_train.shape = 108000 rows, 381 cols
x_valid.shape = 43200 rows, 381 cols
x_test.shape = 28800 rows, 381 cols


In [111]:
model_lgb_7 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
### Model for CV
model_7 = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [112]:
model_lgb_7.fit(x_train, y_train, 
                eval_set=(x_valid, y_valid),
                **params)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.870266	valid_0's binary_logloss: 0.0907106
Did not meet early stopping. Best iteration is:
[68]	valid_0's auc: 0.873659	valid_0's binary_logloss: 0.0891979


LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)

In [113]:
cv_strategy = StratifiedKFold(n_splits=5, random_state=2177)

estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    x_train, y_train, model_7, params, metric=roc_auc_score, cv_strategy=cv_strategy
)

Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.837468	valid_0's binary_logloss: 0.0901171
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.843487	valid_0's binary_logloss: 0.0883919
Fold: 1, train-observations = 86400, valid-observations = 21600
train-score = 0.8993, valid-score = 0.8435
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.870037	valid_0's binary_logloss: 0.0818358
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.874738	valid_0's binary_logloss: 0.0803376
Fold: 2, train-observations = 86400, valid-observations = 21600
train-score = 0.8925, valid-score = 0.8747
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.884363	valid_0's binary_logloss: 0.0771495
Did not meet early stopping. Best iteration is:
[53]	valid_0's auc: 0.884972	valid_0's binary_logloss: 0.0768157
Fold: 3, train-observations = 86400, valid-observations = 21600
train

In [114]:
print(f'{np.mean(fold_valid_scores) - np.std(fold_valid_scores)}, {np.mean(fold_valid_scores) + np.std(fold_valid_scores)}')

0.8564302391292815, 0.887677454005046


In [115]:
task_7_results = calc_val_results(model_lgb_7)

Train roc_auc_score: 0.893453052028909
Valid roc_auc_score: 0.8736585227235975
Test roc_auc_score:  0.859468154304046
LB roc_auc_score:    0.8555797414692315


## Объединяем фичи

In [155]:
def do_total_eng():
    numerical_features = data_total.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1).select_dtypes(include=[np.number]).columns
    x_train, x_valid, x_test, y_train, y_valid, y_test = do_train_test_split(data_total, numerical_features)
    lb_dataset_prep = lb_dataset_total[numerical_features]

    model_total = lgb.LGBMClassifier(n_estimators=70, num_leaves=5, seed=2177)
    model_total.fit(x_train, y_train, 
                    eval_set=(x_valid, y_valid),
                    **params)

    train_score = roc_auc_score(y_train, model_total.predict_proba(x_train)[:,1])
    valid_score = roc_auc_score(y_valid, model_total.predict_proba(x_valid)[:,1])
    test_score = roc_auc_score(y_test, model_total.predict_proba(x_test)[:,1])
    lb_score = roc_auc_score(lb_dataset['isFraud'], model_total.predict_proba(lb_dataset_prep)[:,1])

    return (train_score, valid_score, test_score, lb_score)

In [172]:
data_total = data.copy()
lb_dataset_total = lb_dataset.copy()
total_results=[]

In [140]:
#1
features_transform_1(data_total)
features_transform_1(lb_dataset_total)

total_results.append(do_total_eng())

x_train.shape = 108000 rows, 382 cols
x_valid.shape = 43200 rows, 382 cols
x_test.shape = 28800 rows, 382 cols
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.868354	valid_0's binary_logloss: 0.0909625
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.871489	valid_0's binary_logloss: 0.0896932


In [173]:
#2
features_transform_2(data_total)
features_transform_2(lb_dataset_total)

new_features = data_total[['card_1_2', 'card_1_2_3_5', 'card_1_2_3_5_addr_1_2']].columns

data_total[new_features] = o_enc.transform(data_total[new_features])
lb_dataset_total[new_features] = o_enc.transform(lb_dataset_total[new_features])

total_results.append(do_total_eng())

x_train.shape = 108000 rows, 374 cols
x_valid.shape = 43200 rows, 374 cols
x_test.shape = 28800 rows, 374 cols
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.868856	valid_0's binary_logloss: 0.0919435
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.871387	valid_0's binary_logloss: 0.0905369


In [174]:
#3
feat_list = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']
for feat in feat_list:
    freq_encoder = features_transform_3(data_total, feat)
    features_transform_3(lb_dataset_total, feat, freq_encoder)
    
total_results.append(do_total_eng())

x_train.shape = 108000 rows, 382 cols
x_valid.shape = 43200 rows, 382 cols
x_test.shape = 28800 rows, 382 cols
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.873517	valid_0's binary_logloss: 0.0911861
Did not meet early stopping. Best iteration is:
[67]	valid_0's auc: 0.87568	valid_0's binary_logloss: 0.0901072


In [175]:
#4
aggs = {
    "TransactionAmt": [np.mean, np.std]
}
feat_list = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'card_1_2', 'card_1_2_3_5', 'card_1_2_3_5_addr_1_2']

for feat in feat_list:
    stats = create_numerical_aggs(
        data_total, groupby_id=feat, aggs=aggs, prefix="", suffix=f"_by_{feat}"
    )
    data_total = data_total.merge(stats, how='left', on=feat)
    lb_dataset_total = lb_dataset_total.merge(stats, how='left', on=feat)
    
total_results.append(do_total_eng())

x_train.shape = 108000 rows, 404 cols
x_valid.shape = 43200 rows, 404 cols
x_test.shape = 28800 rows, 404 cols
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.87207	valid_0's binary_logloss: 0.0907896
Did not meet early stopping. Best iteration is:
[69]	valid_0's auc: 0.875641	valid_0's binary_logloss: 0.0893817


In [176]:
#5
aggs = {
    "D15": [np.mean, np.std]
}
feat_list = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'card_1_2', 'card_1_2_3_5', 'card_1_2_3_5_addr_1_2']

for feat in feat_list:
    stats = create_numerical_aggs(
        data_total, groupby_id=feat, aggs=aggs, prefix="", suffix=f"_by_{feat}"
    )
    data_total = data_total.merge(stats, how='left', on=feat)
    lb_dataset_total = lb_dataset_total.merge(stats, how='left', on=feat)
    
total_results.append(do_total_eng())

x_train.shape = 108000 rows, 426 cols
x_valid.shape = 43200 rows, 426 cols
x_test.shape = 28800 rows, 426 cols
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.870095	valid_0's binary_logloss: 0.0910937
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.873681	valid_0's binary_logloss: 0.0894504


In [177]:
#6
features_transform_6(data_total, 'TransactionAmt')
features_transform_6(lb_dataset_total, 'TransactionAmt')

total_results.append(do_total_eng())

x_train.shape = 108000 rows, 429 cols
x_valid.shape = 43200 rows, 429 cols
x_test.shape = 28800 rows, 429 cols
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.870095	valid_0's binary_logloss: 0.0910937
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.874039	valid_0's binary_logloss: 0.0894168


In [178]:
#7
features_transform_7(data_total, 'P_emaildomain')
features_transform_7(data_total, 'R_emaildomain')
features_transform_7(lb_dataset_total, 'P_emaildomain')
features_transform_7(lb_dataset_total, 'R_emaildomain')

feat_list = ['P_emaildomain_name', 'P_emaildomain_region', 'R_emaildomain_name', 'R_emaildomain_region']
for feat in feat_list:
    freq_encoder = features_transform_3(data_total, feat) #  Frequency Encoding
    features_transform_3(lb_dataset_total, feat, freq_encoder)
    
total_results.append(do_total_eng())

x_train.shape = 108000 rows, 433 cols
x_valid.shape = 43200 rows, 433 cols
x_test.shape = 28800 rows, 433 cols
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.874308	valid_0's binary_logloss: 0.0901174
Did not meet early stopping. Best iteration is:
[70]	valid_0's auc: 0.878173	valid_0's binary_logloss: 0.0882937


In [154]:
data_total.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,...,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,year,month_of_year,day_of_week,hour_of_day,day_of_month,card_1_2,card_1_2_3_5,card_1_2_3_5_addr_1_2,TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_STD_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD2,TRANSACTIONAMT_STD_BY_CARD2,TRANSACTIONAMT_MEAN_BY_CARD3,TRANSACTIONAMT_STD_BY_CARD3,TRANSACTIONAMT_MEAN_BY_CARD4,TRANSACTIONAMT_STD_BY_CARD4,TRANSACTIONAMT_MEAN_BY_CARD5,TRANSACTIONAMT_STD_BY_CARD5,TRANSACTIONAMT_MEAN_BY_CARD6,TRANSACTIONAMT_STD_BY_CARD6,TRANSACTIONAMT_MEAN_BY_ADDR1,TRANSACTIONAMT_STD_BY_ADDR1,TRANSACTIONAMT_MEAN_BY_ADDR2,TRANSACTIONAMT_STD_BY_ADDR2,TRANSACTIONAMT_MEAN_BY_CARD_1_2,TRANSACTIONAMT_STD_BY_CARD_1_2,TRANSACTIONAMT_MEAN_BY_CARD_1_2_3_5,TRANSACTIONAMT_STD_BY_CARD_1_2_3_5,TRANSACTIONAMT_MEAN_BY_CARD_1_2_3_5_ADDR_1_2,TRANSACTIONAMT_STD_BY_CARD_1_2_3_5_ADDR_1_2,D15_MEAN_BY_CARD1,D15_STD_BY_CARD1,D15_MEAN_BY_CARD2,D15_STD_BY_CARD2,D15_MEAN_BY_CARD3,D15_STD_BY_CARD3,D15_MEAN_BY_CARD4,D15_STD_BY_CARD4,D15_MEAN_BY_CARD5,D15_STD_BY_CARD5,D15_MEAN_BY_CARD6,D15_STD_BY_CARD6,D15_MEAN_BY_ADDR1,D15_STD_BY_ADDR1,D15_MEAN_BY_ADDR2,D15_STD_BY_ADDR2,D15_MEAN_BY_CARD_1_2,D15_STD_BY_CARD_1_2,D15_MEAN_BY_CARD_1_2_3_5,D15_STD_BY_CARD_1_2_3_5,D15_MEAN_BY_CARD_1_2_3_5_ADDR_1_2,D15_STD_BY_CARD_1_2_3_5_ADDR_1_2,TransactionAmt_int,TransactionAmt_frc,TransactionAmt_log,P_emaildomain_name,P_emaildomain_region,R_emaildomain_name,R_emaildomain_region
0,2987000,0,2017-12-02 00:00:00,68.5,W,6.1e-05,0.014506,0.879722,0.013211,0.000272,0.317939,0.038156,0.876289,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,2017,12,5,0,2,2883.0,2939.0,7794.0,127.373022,187.860323,195.077321,329.849448,140.340757,216.977741,220.5082,340.817021,162.972041,145.452536,169.667189,260.910712,133.110424,237.894612,140.836345,217.497586,296.375,235.958109,296.375,235.958109,68.5,,172.174545,191.173467,122.316408,166.482818,168.46659,186.337996,114.041667,171.186747,131.534884,168.214057,108.751902,171.249895,188.936614,194.069187,169.541518,186.33749,0.5,0.57735,0.5,0.57735,0.0,,68.0,0.5,4.226834,0.158,0.158,0.665,0.665
1,2987001,0,2017-12-02 00:00:01,29.0,W,0.001244,0.006756,0.879722,0.302783,0.054433,0.317939,0.071367,0.876289,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,2017,12,5,0,2,6780.0,6913.0,18578.0,156.098192,305.995788,198.8001,294.181782,140.340757,216.977741,126.019069,205.33334,190.203414,290.383922,169.667189,260.910712,149.334343,228.947057,140.836345,217.497586,229.58808,413.469295,229.58808,413.469295,226.809524,270.054718,105.217277,164.218286,123.450725,173.075974,168.46659,186.337996,139.496772,178.778516,110.602067,171.390409,108.751902,171.249895,195.737281,194.440057,169.541518,186.33749,114.811765,177.751006,114.811765,177.751006,59.75,123.056897,29.0,0.0,3.367296,0.373961,0.785067,0.665,0.665


# Результаты

#### Вариант где фичи добавляются только по заданию (изначальный датасет изменяется только в рамках задания)

In [147]:
Global_results_1 = pd.DataFrame({'Base': task_0_results,
                                'task_1': task_1_results,
                                'task_2': task_2_results,
                                'task_3': task_3_results,
                                'task_4': task_4_results,
                                'task_5': task_5_results,
                                'task_6': task_6_results,
                                'task_7': task_7_results,},
                                index=['Train_AUC', 'Valid_AUC', 'Test_AUC', 'LB_AUC'])

In [148]:
Global_results_1

Unnamed: 0,Base,task_1,task_2,task_3,task_4,task_5,task_6,task_7
Train_AUC,0.891854,0.892516,0.886998,0.889129,0.891595,0.892842,0.891361,0.893453
Valid_AUC,0.870411,0.871489,0.871387,0.871227,0.876047,0.87505,0.871508,0.873659
Test_AUC,0.855293,0.855214,0.850128,0.854202,0.858,0.856345,0.854749,0.859468
LB_AUC,0.853081,0.852579,0.854489,0.858109,0.858729,0.859949,0.854784,0.85558


#### Вариант где фичи добавляются к фичам из предыдущего задания

In [149]:
Global_results_2 = pd.DataFrame({'Base': task_0_results,
                                'task_1': total_results[0],
                                'task_2': total_results[1],
                                'task_3': total_results[2],
                                'task_4': total_results[3],
                                'task_5': total_results[4],
                                'task_6': total_results[5],
                                'task_7': total_results[6],},
                                index=['Train_AUC', 'Valid_AUC', 'Test_AUC', 'LB_AUC'])

In [150]:
Global_results_2

Unnamed: 0,Base,task_1,task_2,task_3,task_4,task_5,task_6,task_7
Train_AUC,0.891854,0.892516,0.88781,0.892505,0.892703,0.895517,0.895099,0.898808
Valid_AUC,0.870411,0.871489,0.872251,0.877714,0.876292,0.876836,0.876575,0.87808
Test_AUC,0.855293,0.855214,0.85106,0.855457,0.858654,0.857974,0.857504,0.862738
LB_AUC,0.853081,0.852579,0.852085,0.858963,0.858364,0.858292,0.857519,0.859983


### Выводы
Валидация зафиксирована и хорошо работала на всех заданиях.  
Судя по полученым результатам стоит убрать фичи из задания 1 (временные фичи уведичивают дов интервал на валидации и вцелом портят результат)  
Перезапускаем создане фичей (все кроме 1):  

In [179]:
Global_results_3 = pd.DataFrame({'Base': task_0_results,
                                'task_1': [0,0,0,0],
                                'task_2': total_results[0],
                                'task_3': total_results[1],
                                'task_4': total_results[2],
                                'task_5': total_results[3],
                                'task_6': total_results[4],
                                'task_7': total_results[5],},
                                index=['Train_AUC', 'Valid_AUC', 'Test_AUC', 'LB_AUC'])

In [180]:
Global_results_3

Unnamed: 0,Base,task_1,task_2,task_3,task_4,task_5,task_6,task_7
Train_AUC,0.891854,0,0.886998,0.889793,0.890804,0.89363,0.894175,0.895906
Valid_AUC,0.870411,0,0.871387,0.87568,0.875641,0.873681,0.874039,0.878173
Test_AUC,0.855293,0,0.850128,0.854322,0.857977,0.856933,0.85687,0.862516
LB_AUC,0.853081,0,0.854497,0.859157,0.859158,0.85963,0.858744,0.861947
