## Урок 5

In [1]:
from typing import List, Optional
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import scipy.stats as st
from scipy.stats import probplot, ks_2samp

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import missingno as msno
import xgboost as xgb

from sklearn.metrics import (roc_auc_score, roc_curve, auc, confusion_matrix, accuracy_score, \
                            classification_report, plot_confusion_matrix, plot_precision_recall_curve, \
                            precision_recall_curve, recall_score, plot_roc_curve)
import catboost as cb
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# plt.style.use('fivethirtyeight')
# %config InlineBackend.figure_format = 'svg'
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMClassifier
import lightgbm as lgb

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
train = reduce_mem_usage(pd.read_csv(r"./assignment_2_train.csv"))
test = reduce_mem_usage(pd.read_csv(r"./assignment_2_test.csv"))

Memory usage of dataframe is 541.08 MB
Memory usage after optimization is: 262.48 MB
Decreased by 51.5%
Memory usage of dataframe is 300.60 MB
Memory usage after optimization is: 145.83 MB
Decreased by 51.5%


In [4]:
train.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


In [5]:
test.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3287000,1,7415038,226.0,W,12473,555.0,150.0,visa,226.0,...,,,,,,,,,,
1,3287001,0,7415054,3072.0,W,15651,417.0,150.0,visa,226.0,...,,,,,,,,,,


### Задание 0: построить базовую модель.

In [7]:
def get_train_and_split(train):
    cat_cols = train.select_dtypes(include=['category', 'object']).columns.tolist()
    
    # Преобразуем сначала object в category, потому что LGBM работает с категорями
    for feature in cat_cols:
        train[feature] = pd.Series(train[feature], dtype="category")
    
    X = train.drop(['isFraud'], axis=1)
    y = train['isFraud']

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=99)
    
    return X_train, X_valid, y_train, y_valid, cat_cols

In [8]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.01,
    "n_estimators": 1000,
    "n_jobs": 6,
    "seed": 99
}

In [9]:
X_train, X_valid, y_train, y_valid, cat_cols = get_train_and_split(train)

In [10]:
model_lgbm0 = LGBMClassifier(**params)
model_lgbm0.fit(
    categorical_feature=cat_cols,
    X=X_train,
    y=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    early_stopping_rounds=25,
    eval_metric="auc",
    verbose=25
)

Training until validation scores don't improve for 25 rounds
[25]	training's auc: 0.86501	valid_1's auc: 0.856429
[50]	training's auc: 0.882889	valid_1's auc: 0.873321
[75]	training's auc: 0.895319	valid_1's auc: 0.883766
[100]	training's auc: 0.902809	valid_1's auc: 0.891425
[125]	training's auc: 0.907445	valid_1's auc: 0.895538
[150]	training's auc: 0.912388	valid_1's auc: 0.89913
[175]	training's auc: 0.917652	valid_1's auc: 0.901404
[200]	training's auc: 0.922074	valid_1's auc: 0.90415
[225]	training's auc: 0.926308	valid_1's auc: 0.906693
[250]	training's auc: 0.930318	valid_1's auc: 0.909918
[275]	training's auc: 0.933664	valid_1's auc: 0.912505
[300]	training's auc: 0.93661	valid_1's auc: 0.914988
[325]	training's auc: 0.93972	valid_1's auc: 0.916854
[350]	training's auc: 0.942434	valid_1's auc: 0.918379
[375]	training's auc: 0.945284	valid_1's auc: 0.920736
[400]	training's auc: 0.947663	valid_1's auc: 0.922354
[425]	training's auc: 0.950029	valid_1's auc: 0.923984
[450]	traini

LGBMClassifier(learning_rate=0.01, metric='auc', n_estimators=1000, n_jobs=6,
               objective='binary', seed=99)

In [11]:
model0_train, model0_valid = model_lgbm0.best_score_.get('training').get('auc'), model_lgbm0.best_score_.get('valid_1').get('auc')
model0_train, model0_valid

(0.976857061341522, 0.9384269116413441)

### Задание 1: признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01,  преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [12]:
train1 = train.copy() 

In [13]:
train1.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


In [14]:
# Перевести дату в секунды, прибавить исходное значение
train1['TransactionDT'] = ((pd.to_datetime('2017-12-01') - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')) + train1['TransactionDT']


In [15]:
# # Год
# pd.to_datetime(1512172800, unit='s').year
train1['TransactionDT_year'] = pd.to_datetime([ii for ii in train1['TransactionDT']], unit='s').year

# Месяц
train1['TransactionDT_month'] = pd.to_datetime([ii for ii in train1['TransactionDT']], unit='s').month

# День
train1['TransactionDT_day'] = pd.to_datetime([ii for ii in train1['TransactionDT']], unit='s').day

# День недели
train1['TransactionDT_dayofweek'] = pd.to_datetime([ii for ii in train1['TransactionDT']], unit='s').dayofweek + 1

# Час
train1['TransactionDT_hour'] = pd.to_datetime([ii for ii in train1['TransactionDT']], unit='s').hour

In [16]:
train1.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V335,V336,V337,V338,V339,TransactionDT_year,TransactionDT_month,TransactionDT_day,TransactionDT_dayofweek,TransactionDT_hour
0,2987000,0,1512172800,68.5,W,13926,,150.0,discover,142.0,...,,,,,,2017,12,2,6,0
1,2987001,0,1512172801,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,2017,12,2,6,0


In [17]:
X_train1, X_valid1, y_train1, y_valid1, cat_cols1 = get_train_and_split(train1)
model_lgbm1 = LGBMClassifier(**params)
model_lgbm1.fit(
    categorical_feature=cat_cols1,
    X=X_train1,
    y=y_train1,
    eval_set=[(X_train1, y_train1), (X_valid1, y_valid1)],
    early_stopping_rounds=25,
    eval_metric="auc",
    verbose=25
)

Training until validation scores don't improve for 25 rounds
[25]	training's auc: 0.866454	valid_1's auc: 0.857073
[50]	training's auc: 0.882464	valid_1's auc: 0.872832
[75]	training's auc: 0.895596	valid_1's auc: 0.884796
[100]	training's auc: 0.902863	valid_1's auc: 0.891598
[125]	training's auc: 0.90749	valid_1's auc: 0.895261
[150]	training's auc: 0.911791	valid_1's auc: 0.897705
[175]	training's auc: 0.917495	valid_1's auc: 0.901148
[200]	training's auc: 0.92152	valid_1's auc: 0.903695
[225]	training's auc: 0.92496	valid_1's auc: 0.906389
[250]	training's auc: 0.929331	valid_1's auc: 0.909661
[275]	training's auc: 0.93435	valid_1's auc: 0.912599
[300]	training's auc: 0.937983	valid_1's auc: 0.91522
[325]	training's auc: 0.941168	valid_1's auc: 0.917385
[350]	training's auc: 0.945457	valid_1's auc: 0.920077
[375]	training's auc: 0.94799	valid_1's auc: 0.921529
[400]	training's auc: 0.950141	valid_1's auc: 0.92325
[425]	training's auc: 0.952239	valid_1's auc: 0.924417
[450]	training

LGBMClassifier(learning_rate=0.01, metric='auc', n_estimators=1000, n_jobs=6,
               objective='binary', seed=99)

In [18]:
model1_train, model1_valid = model_lgbm1.best_score_.get('training').get('auc'), model_lgbm1.best_score_.get('valid_1').get('auc')
model1_train, model1_valid

(0.9784660361710271, 0.9376012952504569)

Выводы: преобразование TransactionDT и добавление новых фичей не дало прироста качества.

### Задание 2: сделать конкатенацию признаков
* card1 + card2;
* card1 + card2 + card_3 + card_5;
* card1 + card2 + card_3 + card_5 + addr1 + addr2

In [19]:
train2 = train.copy()

In [20]:
train2.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


In [21]:
train2['card1 + card2'] = train2['card1'].astype('str') + " | " +  train2['card2'].astype('str')

In [22]:
train2['card1 + card2 + card_3 + card_5'] = train2['card1'].astype('str') + " | " + \
    train2['card2'].astype('str') + " | " + train2['card3'].astype('str') + " | " + train2['card5'].astype('str')


In [23]:
train2['card1 + card2 + card_3 + card_5 + addr1 + addr2'] = train2['card1'].astype('str') + " | " + \
    train2['card2'].astype('str') + " | " + train2['card3'].astype('str') + " | " + train2['card5'].astype('str') + " | " + \
    train2['addr1'].astype('str') + " | " + train2['addr2'].astype('str')

In [24]:
train2.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V333,V334,V335,V336,V337,V338,V339,card1 + card2,card1 + card2 + card_3 + card_5,card1 + card2 + card_3 + card_5 + addr1 + addr2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,13926 | nan,13926 | nan | 150.0 | 142.0,13926 | nan | 150.0 | 142.0 | 315.0 | 87.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,2755 | 404.0,2755 | 404.0 | 150.0 | 102.0,2755 | 404.0 | 150.0 | 102.0 | 325.0 | 87.0


In [25]:
X_train2, X_valid2, y_train2, y_valid2, cat_cols2 = get_train_and_split(train2)
model_lgbm2 = LGBMClassifier(**params)
model_lgbm2.fit(
    categorical_feature=cat_cols2,
    X=X_train2,
    y=y_train2,
    eval_set=[(X_train2, y_train2), (X_valid2, y_valid2)],
    early_stopping_rounds=25,
    eval_metric="auc",
    verbose=50
)

Training until validation scores don't improve for 25 rounds
[50]	training's auc: 0.92035	valid_1's auc: 0.88198
[100]	training's auc: 0.942025	valid_1's auc: 0.901229
[150]	training's auc: 0.953043	valid_1's auc: 0.911751
[200]	training's auc: 0.962401	valid_1's auc: 0.919205
[250]	training's auc: 0.967513	valid_1's auc: 0.92345
[300]	training's auc: 0.971498	valid_1's auc: 0.926965
[350]	training's auc: 0.974769	valid_1's auc: 0.930341
[400]	training's auc: 0.977374	valid_1's auc: 0.933214
[450]	training's auc: 0.98001	valid_1's auc: 0.935259
[500]	training's auc: 0.982297	valid_1's auc: 0.936883
[550]	training's auc: 0.984364	valid_1's auc: 0.938398
[600]	training's auc: 0.986165	valid_1's auc: 0.940031
[650]	training's auc: 0.987435	valid_1's auc: 0.941038
[700]	training's auc: 0.989199	valid_1's auc: 0.942502
[750]	training's auc: 0.990589	valid_1's auc: 0.944001
[800]	training's auc: 0.991452	valid_1's auc: 0.944751
[850]	training's auc: 0.992196	valid_1's auc: 0.945567
[900]	tra

LGBMClassifier(learning_rate=0.01, metric='auc', n_estimators=1000, n_jobs=6,
               objective='binary', seed=99)

In [26]:
model2_train, model2_valid = model_lgbm2.best_score_.get('training').get('auc'), model_lgbm2.best_score_.get('valid_1').get('auc')
model2_train, model2_valid

(0.9944403753635318, 0.9481375498399482)

Выводы: качество немного улучшилось, но пошло переобучение на трейне.

### Задание 3: Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.

In [27]:
train3 = train.copy()

In [28]:
cat_coll = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']

In [29]:
from tqdm import tqdm
for feature in tqdm(cat_coll):
    freq_encoder = train3[feature].value_counts(normalize=True)
    train3[f"Embarked_freq_{feature}"] = train3[feature].map(freq_encoder)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 68.37it/s]


In [30]:
train3.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V338,V339,Embarked_freq_card1,Embarked_freq_card2,Embarked_freq_card3,Embarked_freq_card4,Embarked_freq_card5,Embarked_freq_card6,Embarked_freq_addr1,Embarked_freq_addr2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,6.1e-05,,0.879737,0.013212,0.000274,0.317951,0.042773,0.982344
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,0.001244,0.006855,0.879737,0.302797,0.054723,0.317951,0.080004,0.982344


In [31]:
X_train3, X_valid3, y_train3, y_valid3, cat_cols3 = get_train_and_split(train3)
model_lgbm3 = LGBMClassifier(**params)
model_lgbm3.fit(
    categorical_feature=cat_cols3,
    X=X_train3,
    y=y_train3,
    eval_set=[(X_train3, y_train3), (X_valid3, y_valid3)],
    early_stopping_rounds=25,
    eval_metric="auc",
    verbose=50
)

Training until validation scores don't improve for 25 rounds
[50]	training's auc: 0.882282	valid_1's auc: 0.872308
[100]	training's auc: 0.903405	valid_1's auc: 0.891041
[150]	training's auc: 0.912423	valid_1's auc: 0.897623
[200]	training's auc: 0.924064	valid_1's auc: 0.904357
[250]	training's auc: 0.933578	valid_1's auc: 0.911453
[300]	training's auc: 0.94055	valid_1's auc: 0.91685
[350]	training's auc: 0.946803	valid_1's auc: 0.921068
[400]	training's auc: 0.951513	valid_1's auc: 0.924191
[450]	training's auc: 0.955394	valid_1's auc: 0.926518
[500]	training's auc: 0.959251	valid_1's auc: 0.928737
[550]	training's auc: 0.962637	valid_1's auc: 0.9313
[600]	training's auc: 0.965569	valid_1's auc: 0.932948
[650]	training's auc: 0.968315	valid_1's auc: 0.934399
[700]	training's auc: 0.971217	valid_1's auc: 0.935508
[750]	training's auc: 0.973163	valid_1's auc: 0.936556
[800]	training's auc: 0.97481	valid_1's auc: 0.937422
[850]	training's auc: 0.976306	valid_1's auc: 0.938274
[900]	trai

LGBMClassifier(learning_rate=0.01, metric='auc', n_estimators=1000, n_jobs=6,
               objective='binary', seed=99)

In [32]:
model3_train, model3_valid = model_lgbm3.best_score_.get('training').get('auc'), model_lgbm3.best_score_.get('valid_1').get('auc')
model3_train, model3_valid

(0.9803437116086793, 0.9402359458451894)

Выводы: наблюдается небольшой прирост качества на тренировочной выборке и незначительный прирост на валидационной выборке, переобучения на терейне меньше, чем в предыдущих экспериментах.

### Задание 4: Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [33]:
train4 = train.copy()

In [34]:
train4.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


In [35]:
train4 = pd.concat([ train4, train2[train2.iloc[:, 394:].columns.tolist()] ], axis=1)

In [36]:
train4 = pd.concat([ train4, train3[train3.iloc[:, 394:].columns.tolist()] ], axis=1)

In [37]:
train4.head(1)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,card1 + card2 + card_3 + card_5,card1 + card2 + card_3 + card_5 + addr1 + addr2,Embarked_freq_card1,Embarked_freq_card2,Embarked_freq_card3,Embarked_freq_card4,Embarked_freq_card5,Embarked_freq_card6,Embarked_freq_addr1,Embarked_freq_addr2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,13926 | nan | 150.0 | 142.0,13926 | nan | 150.0 | 142.0 | 315.0 | 87.0,6.1e-05,,0.879737,0.013212,0.000274,0.317951,0.042773,0.982344


In [38]:
listt = train4.iloc[:, 394:].columns.tolist() + train4.iloc[:, 5:13].columns.tolist()
listt

['card1 + card2',
 'card1 + card2 + card_3 + card_5',
 'card1 + card2 + card_3 + card_5 + addr1 + addr2',
 'Embarked_freq_card1',
 'Embarked_freq_card2',
 'Embarked_freq_card3',
 'Embarked_freq_card4',
 'Embarked_freq_card5',
 'Embarked_freq_card6',
 'Embarked_freq_addr1',
 'Embarked_freq_addr2',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2']

In [39]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".upper() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [40]:
aggs = {'TransactionAmt': ["mean", "std"]}

for feature_for_group in tqdm(listt):
    stats = create_numerical_aggs(
        train4, groupby_id=feature_for_group, aggs=aggs, suffix=f"_BY_{feature_for_group}")

    train4 = train4.merge(
        stats, how="left", on=feature_for_group)
    
    train4[f'TransactionAmt / {train4.iloc[:, -1].name}'] = train4['TransactionAmt'] / train4[train4.iloc[:, -1].name] 
    train4[f'TransactionAmt / {train4.iloc[:, -3].name}'] = train4['TransactionAmt'] / train4[train4.iloc[:, -3].name]
    
    # сразу удалим ненужные признаки (срез не берет правую границу!)
    train4 = train4.drop(train4.iloc[:, -4:-2].columns.tolist(), axis=1)

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:43<00:00,  2.31s/it]


In [41]:
train4.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,TransactionAmt / TRANSACTIONAMT_STD_BY_CARD4,TransactionAmt / TRANSACTIONAMT_MEAN_BY_CARD4,TransactionAmt / TRANSACTIONAMT_STD_BY_CARD5,TransactionAmt / TRANSACTIONAMT_MEAN_BY_CARD5,TransactionAmt / TRANSACTIONAMT_STD_BY_CARD6,TransactionAmt / TRANSACTIONAMT_MEAN_BY_CARD6,TransactionAmt / TRANSACTIONAMT_STD_BY_ADDR1,TransactionAmt / TRANSACTIONAMT_MEAN_BY_ADDR1,TransactionAmt / TRANSACTIONAMT_STD_BY_ADDR2,TransactionAmt / TRANSACTIONAMT_MEAN_BY_ADDR2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.200988,0.310646,0.58795,0.555175,0.262542,0.403732,0.287943,0.51461,0.314946,0.48638
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.141234,0.230124,0.099868,0.152468,0.111149,0.170923,0.126667,0.194195,0.133335,0.205913


In [42]:
X_train4, X_valid4, y_train4, y_valid4, cat_cols4 = get_train_and_split(train4)
model_lgbm4 = LGBMClassifier(**params)
model_lgbm4.fit(
    categorical_feature=cat_cols4,
    X=X_train4,
    y=y_train4,
    eval_set=[(X_train4, y_train4), (X_valid4, y_valid4)],
    early_stopping_rounds=25,
    eval_metric="auc",
    verbose=50
)

Training until validation scores don't improve for 25 rounds
[50]	training's auc: 0.921079	valid_1's auc: 0.886451
[100]	training's auc: 0.945502	valid_1's auc: 0.907679
[150]	training's auc: 0.960708	valid_1's auc: 0.922499
[200]	training's auc: 0.968469	valid_1's auc: 0.927302
[250]	training's auc: 0.973785	valid_1's auc: 0.93046
[300]	training's auc: 0.978067	valid_1's auc: 0.933451
[350]	training's auc: 0.981362	valid_1's auc: 0.936779
[400]	training's auc: 0.984736	valid_1's auc: 0.939302
[450]	training's auc: 0.987032	valid_1's auc: 0.941129
[500]	training's auc: 0.988983	valid_1's auc: 0.942299
[550]	training's auc: 0.990753	valid_1's auc: 0.944044
[600]	training's auc: 0.992175	valid_1's auc: 0.945104
[650]	training's auc: 0.993302	valid_1's auc: 0.945965
[700]	training's auc: 0.994481	valid_1's auc: 0.946933
[750]	training's auc: 0.995223	valid_1's auc: 0.94765
[800]	training's auc: 0.995957	valid_1's auc: 0.948044
[850]	training's auc: 0.996469	valid_1's auc: 0.94872
[900]	tr

LGBMClassifier(learning_rate=0.01, metric='auc', n_estimators=1000, n_jobs=6,
               objective='binary', seed=99)

In [43]:
model4_train, model4_valid = model_lgbm4.best_score_.get('training').get('auc'), model_lgbm4.best_score_.get('valid_1').get('auc')
model4_train, model4_valid

(0.9977643731482249, 0.9496259796588522)

Выводы: новые сгенерированные признаки TransactionAmt значительно увеличили качество на валидации, однако и модель сильно переобучилась на трейне.

### Задание 5: Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [44]:
train5 = train.copy()

In [45]:
train5 = pd.concat([ train5, train3[train3.iloc[:, 394:].columns.tolist()] ], axis=1)
train5 = pd.concat([ train5, train2[train2.iloc[:, 394:].columns.tolist()] ], axis=1)

In [46]:
listt = train5.iloc[:, 394:].columns.tolist() + train5.iloc[:, 5:13].columns.tolist()
listt

['Embarked_freq_card1',
 'Embarked_freq_card2',
 'Embarked_freq_card3',
 'Embarked_freq_card4',
 'Embarked_freq_card5',
 'Embarked_freq_card6',
 'Embarked_freq_addr1',
 'Embarked_freq_addr2',
 'card1 + card2',
 'card1 + card2 + card_3 + card_5',
 'card1 + card2 + card_3 + card_5 + addr1 + addr2',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2']

In [51]:
train5.D15.isna().sum()

48819

In [52]:
train5.D15.fillna(0, inplace=True)

In [53]:
aggs = {'D15': ["mean", "std"]}

for feature_for_group in tqdm(listt):
    stats = create_numerical_aggs(
        train5, groupby_id=feature_for_group, aggs=aggs, suffix=f"_BY_{feature_for_group}")

    train5 = train5.merge(
        stats, how="left", on=feature_for_group)
    
    train5[f'D15 / {train5.iloc[:, -1].name}'] = train5['D15'] / train5[train5.iloc[:, -1].name] 
    train5[f'D15 / {train5.iloc[:, -3].name}'] = train5['D15'] / train5[train5.iloc[:, -3].name]
    
    # сразу удалим ненужные признаки (срез не берет правую границу!)
    train5 = train5.drop(train5.iloc[:, -4:-2].columns.tolist(), axis=1)


  0%|                                                                                           | 0/19 [00:00<?, ?it/s][A
  5%|████▎                                                                              | 1/19 [00:02<00:41,  2.32s/it][A
 11%|████████▋                                                                          | 2/19 [00:04<00:39,  2.32s/it][A
 16%|█████████████                                                                      | 3/19 [00:06<00:37,  2.32s/it][A
 21%|█████████████████▍                                                                 | 4/19 [00:09<00:34,  2.32s/it][A
 26%|█████████████████████▊                                                             | 5/19 [00:11<00:32,  2.33s/it][A
 32%|██████████████████████████▏                                                        | 6/19 [00:13<00:30,  2.32s/it][A
 37%|██████████████████████████████▌                                                    | 7/19 [00:16<00:28,  2.35s/it][A
 42%|██████████

In [54]:
train5.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,D15 / D15_STD_BY_CARD4,D15 / D15_MEAN_BY_CARD4,D15 / D15_STD_BY_CARD5,D15 / D15_MEAN_BY_CARD5,D15 / D15_STD_BY_CARD6,D15 / D15_MEAN_BY_CARD6,D15 / D15_STD_BY_ADDR1,D15 / D15_MEAN_BY_ADDR1,D15 / D15_STD_BY_ADDR2,D15 / D15_MEAN_BY_ADDR2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
X_train5, X_valid5, y_train5, y_valid5, cat_cols5 = get_train_and_split(train5)
model_lgbm5 = LGBMClassifier(**params)
model_lgbm5.fit(
    categorical_feature=cat_cols5,
    X=X_train5,
    y=y_train5,
    eval_set=[(X_train5, y_train5), (X_valid5, y_valid5)],
    early_stopping_rounds=25,
    eval_metric="auc",
    verbose=50
)

Training until validation scores don't improve for 25 rounds
[50]	training's auc: 0.921141	valid_1's auc: 0.882877
[100]	training's auc: 0.947834	valid_1's auc: 0.907977
[150]	training's auc: 0.958996	valid_1's auc: 0.917168
[200]	training's auc: 0.967717	valid_1's auc: 0.924173
[250]	training's auc: 0.972918	valid_1's auc: 0.928761
[300]	training's auc: 0.977366	valid_1's auc: 0.932294
[350]	training's auc: 0.980863	valid_1's auc: 0.935243
[400]	training's auc: 0.983769	valid_1's auc: 0.93788
[450]	training's auc: 0.986427	valid_1's auc: 0.9401
[500]	training's auc: 0.988544	valid_1's auc: 0.941511
[550]	training's auc: 0.990147	valid_1's auc: 0.942841
[600]	training's auc: 0.991665	valid_1's auc: 0.943698
[650]	training's auc: 0.992768	valid_1's auc: 0.94456
[700]	training's auc: 0.993686	valid_1's auc: 0.945237
[750]	training's auc: 0.994443	valid_1's auc: 0.945589
[800]	training's auc: 0.995203	valid_1's auc: 0.946057
Early stopping, best iteration is:
[811]	training's auc: 0.99548

LGBMClassifier(learning_rate=0.01, metric='auc', n_estimators=1000, n_jobs=6,
               objective='binary', seed=99)

Выводы: новые сгенерированные признаки D15 значительно увеличили качество, однако также модель слишком переобучилась.

### Задание 6: выделить дробную часть и целую часть признака TransactionAmt в два отдельных признака.  После создать отдельных признак - логарифм от TransactionAmt

In [56]:
train6 = train.copy()

In [57]:
train6['TransactionAmt_int'] = train6['TransactionAmt'].astype('int')

In [58]:
train6['TransactionAmt_float'] = train6['TransactionAmt'] % 1

In [59]:
train6['TransactionAmt_log'] = np.log(train6['TransactionAmt'])

In [60]:
train6.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V333,V334,V335,V336,V337,V338,V339,TransactionAmt_int,TransactionAmt_float,TransactionAmt_log
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,68,0.5,4.226834
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,29,0.0,3.367296


In [61]:
X_train6, X_valid6, y_train6, y_valid6, cat_cols6 = get_train_and_split(train6)
model_lgbm6 = LGBMClassifier(**params)
model_lgbm6.fit(
    categorical_feature=cat_cols6,
    X=X_train6,
    y=y_train6,
    eval_set=[(X_train6, y_train6), (X_valid6, y_valid6)],
    early_stopping_rounds=25,
    eval_metric="auc",
    verbose=50
)

Training until validation scores don't improve for 25 rounds
[50]	training's auc: 0.882519	valid_1's auc: 0.872688
[100]	training's auc: 0.90285	valid_1's auc: 0.890702
[150]	training's auc: 0.912367	valid_1's auc: 0.898378
[200]	training's auc: 0.921905	valid_1's auc: 0.903686
[250]	training's auc: 0.929568	valid_1's auc: 0.9093
[300]	training's auc: 0.936727	valid_1's auc: 0.914373
[350]	training's auc: 0.942975	valid_1's auc: 0.918605
[400]	training's auc: 0.948644	valid_1's auc: 0.922306
[450]	training's auc: 0.952963	valid_1's auc: 0.924841
[500]	training's auc: 0.956518	valid_1's auc: 0.927337
[550]	training's auc: 0.959988	valid_1's auc: 0.929212
[600]	training's auc: 0.96265	valid_1's auc: 0.930958
[650]	training's auc: 0.965084	valid_1's auc: 0.932185
[700]	training's auc: 0.967596	valid_1's auc: 0.933586
[750]	training's auc: 0.96976	valid_1's auc: 0.93452
[800]	training's auc: 0.971275	valid_1's auc: 0.935305
[850]	training's auc: 0.973108	valid_1's auc: 0.936
[900]	training

LGBMClassifier(learning_rate=0.01, metric='auc', n_estimators=1000, n_jobs=6,
               objective='binary', seed=99)

Выводы: новые сгенерированные признаки практически не увеличили качество по сравнению с базовым качеством, однако здесь переобучение немного ниже, чем в двух предыдущих. Хорошее качество при минимальной разнице между train и valid достигается на 100 итерациях, а затем модель начинает переобучаться на train.