In [2]:
%load_ext autoreload
%autoreload 2 

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np

from preparation_funcs import time_based_split, apply_ce_encoding, apply_feature_scaling, generate_nonlinear_features
from choose_models_funcs import evaluate_models, custom_gini, logistic_regression_feature_selection, grid_search_cv_and_validate, evaluate_final_model, evaluate_classification_metrics
from custom_models import CustomLogisticRegression, CustomKNNClassifier, CustomGaussianNB


# Data loading

In [3]:
# Load the raw training set; the path is relative to the notebook's working directory.
df = pd.read_csv("data/training.csv")

In [None]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,RefId,IsBadBuy,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,Color,Transmission,WheelTypeID,WheelType,VehOdo,Nationality,Size,TopThreeAmericanName,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
0,1,0,12/7/2009,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,RED,AUTO,1.0,Alloy,89046,OTHER ASIAN,MEDIUM,OTHER,8155.0,9829.0,11636.0,13600.0,7451.0,8552.0,11597.0,12409.0,,,21973,33619,FL,7100.0,0,1113
1,2,0,12/7/2009,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,WHITE,AUTO,1.0,Alloy,93593,AMERICAN,LARGE TRUCK,CHRYSLER,6854.0,8383.0,10897.0,12572.0,7456.0,9222.0,11374.0,12791.0,,,19638,33619,FL,7600.0,0,1053
2,3,0,12/7/2009,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,MAROON,AUTO,2.0,Covers,73807,AMERICAN,MEDIUM,CHRYSLER,3202.0,4760.0,6943.0,8457.0,4035.0,5557.0,7146.0,8702.0,,,19638,33619,FL,4900.0,0,1389
3,4,0,12/7/2009,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,SILVER,AUTO,1.0,Alloy,65617,AMERICAN,COMPACT,CHRYSLER,1893.0,2675.0,4658.0,5690.0,1844.0,2646.0,4375.0,5518.0,,,19638,33619,FL,4100.0,0,630
4,5,0,12/7/2009,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,SILVER,MANUAL,2.0,Covers,69367,AMERICAN,COMPACT,FORD,3913.0,5054.0,7723.0,8707.0,3247.0,4384.0,6739.0,7911.0,,,19638,33619,FL,4000.0,0,1020


## Data preparation

In [4]:
# Split the data on the purchase-date column using the project helper.
# NOTE(review): presumably a chronological split; the split proportions live
# inside time_based_split and are not visible here.
train, validation, test = time_based_split(df, "PurchDate")

*В качестве метода кодирования выбран CountEncoder в связи с большим количеством категориальных переменных, потенциально большим количеством категорий и высокой вероятностью уникальных категорий в тесте и валидационной выборке*

In [5]:
# Encode the three splits with the project helper (CountEncoder, according to
# the accompanying markdown note).
train_encoded, val_encoded, test_encoded = apply_ce_encoding(train, validation, test)

*Заполняем пропуски, преобразуем дату в timestamp и после стандартизируем все числовые столбцы, кроме целевого (он бинарный)*

In [6]:
# Scale every column except the binary target; the column list comes from the
# raw frame's header.
train_scaled, val_scaled, test_scaled = apply_feature_scaling(
    train_encoded,
    val_encoded,
    test_encoded,
    columns=df.drop(columns=["IsBadBuy"]).columns,
)

# Train models

## sklearn

In [12]:
# Baseline sklearn classifiers with default hyperparameters
# (only the logistic regression gets a fixed seed).
models_to_test = [
    LogisticRegression(random_state=42),
    GaussianNB(),
    KNeighborsClassifier()
]

In [13]:
# Score the sklearn baselines on the scaled original features using the
# helper's default metric.
evaluate_models(
    train_df=train_scaled,
    val_df=val_scaled,
    target_column="IsBadBuy",
    models=models_to_test,
)

--- Оценка набора признаков: 'Full Features' (33 признаков) ---

Результаты для 'Full Features' (метрика: gini_score_sklearn):


Unnamed: 0,LogisticRegression,GaussianNB,KNeighborsClassifier
Train,0.497439,0.473706,0.826564
Validation,0.472059,0.453967,0.339951


In [10]:
# Same evaluation, but scored with the project's own Gini implementation so it
# can be compared against the default metric.
evaluate_models(
    train_df=train_scaled,
    val_df=val_scaled,
    target_column="IsBadBuy",
    models=models_to_test,
    scoring_func=custom_gini,
)

--- Оценка набора признаков: 'Full Features' (33 признаков) ---

Результаты для 'Full Features' (метрика: custom_gini):


Unnamed: 0,LogisticRegression,GaussianNB,KNeighborsClassifier
Train,0.497439,0.473706,0.826564
Validation,0.472059,0.453967,0.339951


*Джини sklearn и кастомный показывают одинаковые результаты*

## custom

In [49]:
# Project implementations from custom_models, listed in the same order as the
# sklearn baselines so the result tables line up column by column.
models_to_test_custom = [
    CustomLogisticRegression(),
    CustomGaussianNB(),
    CustomKNNClassifier()
]

In [50]:
# Score the custom model implementations on the same data as the sklearn
# baselines.
evaluate_models(
    train_df=train_scaled,
    val_df=val_scaled,
    target_column="IsBadBuy",
    models=models_to_test_custom,
)

--- Оценка набора признаков: 'Full Features' (33 признаков) ---

Результаты для 'Full Features' (метрика: gini_score_sklearn):


Unnamed: 0,CustomLogisticRegression,CustomGaussianNB,CustomKNNClassifier
Train,0.495549,0.473507,0.826564
Validation,0.468196,0.452654,0.339951


# non-linear features

In [11]:
# Column pairs for ratio features.
# NOTE(review): presumably (numerator, denominator) — confirm against
# generate_nonlinear_features.
frac_rules = [
    ('VehOdo', 'VehicleAge'),  # mileage per unit of vehicle age
    ('VehBCost', 'MMRAcquisitionAuctionAveragePrice'), # purchase cost relative to the average auction market price
    ('MMRCurrentRetailCleanPrice', 'MMRAcquisitionAuctionCleanPrice') # potential margin: current retail vs. acquisition auction clean price
]
# Group-level aggregate features, keyed by the name of the new feature.
# NOTE(review): each value looks like (group column, value column, aggregation)
# — confirm against generate_nonlinear_features.
groupby_rules = {
    # Odometer statistics per make.
    # NOTE(review): the feature names say "Make_Year", but the grouping key is
    # only 'Make' — confirm whether the year was meant to be part of the group.
    'mean_Odo_by_Make_Year': ('Make', 'VehOdo', 'mean'), 
    'std_Odo_by_Make_Year': ('Make', 'VehOdo', 'std'),

    # Purchase-cost statistics per auction
    'mean_Cost_by_Auction': ('Auction', 'VehBCost', 'mean'),
    'std_Cost_by_Auction': ('Auction', 'VehBCost', 'std'),

    # Mean vehicle age per model
    'mean_Age_by_Model': ('Model', 'VehicleAge', 'mean')
}

# Element-wise transforms to apply per column.
transform_rules = {
    'VehOdo': [np.log1p], # log1p of the odometer reading
    'VehicleAge': [np.square],      # squared vehicle age
    'VehBCost': [np.log1p],         # log1p of the purchase cost
    'WarrantyCost': [np.log1p]      # log1p of the warranty cost
}

In [12]:
# Build the engineered features on the raw (not yet encoded) splits.
train_new, val_new, test_new = generate_nonlinear_features(
    train,
    validation,
    test,
    frac_rules=frac_rules,
    groupby_rules=groupby_rules,
    transform_rules=transform_rules,
)

In [13]:
# Repeat the encoding and scaling steps on the extended feature set.
train_encoded_new, val_encoded_new, test_encoded_new = apply_ce_encoding(
    train_new, val_new, test_new
)
# Scale every column of the encoded training frame except the target.
train_scaled_new, val_scaled_new, test_scaled_new = apply_feature_scaling(
    train_encoded_new,
    val_encoded_new,
    test_encoded_new,
    columns=train_encoded_new.drop(columns=["IsBadBuy"]).columns,
)

## non-linear features: running models

In [14]:
# Re-score the sklearn baselines on the extended feature set.
evaluate_models(
    train_df=train_scaled_new,
    val_df=val_scaled_new,
    target_column="IsBadBuy",
    models=models_to_test,
)

--- Оценка набора признаков: 'Full Features' (45 признаков) ---

Результаты для 'Full Features' (метрика: gini_score_sklearn):


Unnamed: 0,LogisticRegression,GaussianNB,KNeighborsClassifier
Train,0.501591,0.474326,0.825257
Validation,0.472769,0.427158,0.331646


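**После добавления нелинейных признаков Gini на трейне немного вырос для LogisticRegression и GaussianNB. На валидации у LogisticRegression он практически не изменился (0.4721 → 0.4728), а у GaussianNB снизился (0.4540 → 0.4272)**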

# Eliminate useless features

In [15]:
# Select features two ways (by p-value and by L1 regularization). According to
# the helper's log, `threshold` is the pairwise-correlation cutoff above which
# one column of a pair is dropped.
p_features, l1_features = logistic_regression_feature_selection(
    train_df=train_scaled_new,
    val_df=val_scaled_new,
    target_col="IsBadBuy",
    threshold=0.85,
)

Обучение Baseline модели...
Отбор признаков по p-value < 0.05...
Обнаружены и удалены следующие линейно зависимые столбцы: ['Auction', 'PRIMEUNIT', 'IsOnlineSale']
Удалены следующие столбцы из пары с корреляцией выше 0.85: ['VehicleAge', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'VehOdo_log1p', 'VehicleAge_square', 'VehBCost_log1p', 'WarrantyCost_log1p']
Обучение модели с L1 регуляризацией (C=1.0)...


Unnamed: 0_level_0,Train Gini,Validation Gini,Num Features
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline (All Features),0.501513,0.472478,45
P-value Selection,0.493942,0.464758,15
L1 Regularization,0.50141,0.472851,37


In [17]:
# Named feature subsets to compare: the original columns and the two selections
# returned by the feature-selection step.
feature_pools_dict = {
    "Initial Features": df.drop(columns=["IsBadBuy"]).columns,
    "L1 Selected Features": l1_features,
    "P-value Selected Features": p_features,
}

In [18]:
# Score the baselines on the full feature set and on each named feature pool.
evaluate_models(
    train_df=train_scaled_new,
    val_df=val_scaled_new,
    target_column="IsBadBuy",
    models=models_to_test,
    feature_pools=feature_pools_dict,
)

--- Оценка набора признаков: 'Full Features' (45 признаков) ---

Результаты для 'Full Features' (метрика: gini_score_sklearn):


Unnamed: 0,LogisticRegression,GaussianNB,KNeighborsClassifier
Train,0.501591,0.474326,0.825257
Validation,0.472769,0.427158,0.331646


--- Оценка набора признаков: 'Initial Features' (33 признаков) ---

Результаты для 'Initial Features' (метрика: gini_score_sklearn):


Unnamed: 0,LogisticRegression,GaussianNB,KNeighborsClassifier
Train,0.497439,0.473706,0.826564
Validation,0.472059,0.453967,0.339951


--- Оценка набора признаков: 'L1 Selected Features' (37 признаков) ---

Результаты для 'L1 Selected Features' (метрика: gini_score_sklearn):


Unnamed: 0,LogisticRegression,GaussianNB,KNeighborsClassifier
Train,0.501389,0.473183,0.825436
Validation,0.471008,0.415629,0.340181


--- Оценка набора признаков: 'P-value Selected Features' (15 признаков) ---

Результаты для 'P-value Selected Features' (метрика: gini_score_sklearn):


Unnamed: 0,LogisticRegression,GaussianNB,KNeighborsClassifier
Train,0.493954,0.464555,0.82869
Validation,0.464773,0.442727,0.317126


* **Для дальнейшего подбора параметров в качестве базовой модели выбирается LogisticRegression, обученная на всех признаках: у неё второй по величине результат на трейне и при этом нет такого сильного переобучения, как у KNeighborsClassifier**
* **В качестве экспериментальной модели берется KNeighborsClassifier на сокращенном вручную датасете (по p-value) - с целью сокращения разрыва между трейном и валидацией**

# Gridsearch

## logregression

In [19]:
# Estimator and hyperparameter grid for the logistic-regression search.
lr = LogisticRegression()
lr_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 0.5, 1, 10],
    'solver': ['liblinear'],  # one solver that accepts both the 'l1' and 'l2' penalties
    'class_weight': [None, 'balanced'],
    'random_state': [42]  # fixed seed for every candidate
}

In [20]:
# Run the grid search for logistic regression on the extended feature set.
# The returned parameters are later unpacked into LogisticRegression(**best_parameters).
best_parameters = grid_search_cv_and_validate(lr, lr_params, train_scaled_new, val_scaled_new, 'IsBadBuy')

Запускаю GridSearchCV для 'LogisticRegression'...





--- Проверка топ-5 моделей на валидационном сете ---
Параметры: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
  - Средний Gini на CV (train): 0.4878
  - Gini на Validation:          0.4796
Параметры: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
  - Средний Gini на CV (train): 0.4836
  - Gini на Validation:          0.4774
Параметры: {'C': 0.01, 'class_weight': None, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
  - Средний Gini на CV (train): 0.4820
  - Gini на Validation:          0.4766
Параметры: {'C': 0.1, 'class_weight': None, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}
  - Средний Gini на CV (train): 0.4816
  - Gini на Validation:          0.4705
Параметры: {'C': 0.01, 'class_weight': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
  - Средний Gini на CV (train): 0.4805
  - Gini на Validation:          0.4738

--- Финальная 

**Наибольшее значение имеют параметры регуляризации**

## KNN

In [21]:
# Estimator and hyperparameter grid for the k-nearest-neighbours search.
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [11, 21, 35, 51, 101],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'manhattan']
}

In [22]:
# Run the KNN grid search on the p-value-selected features only; the target
# column is appended so the helper can find it in the frame.
best_parameters_knn = grid_search_cv_and_validate(
    knn,
    knn_params,
    train_scaled_new[p_features + ["IsBadBuy"]],
    val_scaled_new[p_features + ["IsBadBuy"]],
    "IsBadBuy",
)

Запускаю GridSearchCV для 'KNeighborsClassifier'...

--- Проверка топ-5 моделей на валидационном сете ---
Параметры: {'metric': 'minkowski', 'n_neighbors': 101, 'weights': 'uniform'}
  - Средний Gini на CV (train): 0.4844
  - Gini на Validation:          0.4370
Параметры: {'metric': 'minkowski', 'n_neighbors': 101, 'weights': 'distance'}
  - Средний Gini на CV (train): 0.4836
  - Gini на Validation:          0.4378
Параметры: {'metric': 'manhattan', 'n_neighbors': 101, 'weights': 'uniform'}
  - Средний Gini на CV (train): 0.4822
  - Gini на Validation:          0.4472
Параметры: {'metric': 'manhattan', 'n_neighbors': 101, 'weights': 'distance'}
  - Средний Gini на CV (train): 0.4820
  - Gini на Validation:          0.4498
Параметры: {'metric': 'manhattan', 'n_neighbors': 51, 'weights': 'uniform'}
  - Средний Gini на CV (train): 0.4670
  - Gini на Validation:          0.4300

--- Финальная лучшая модель (по Gini на Validation) ---
Лучшие параметры: {'metric': 'manhattan', 'n_neighbors':

**Наибольшее значение имеют параметры количества соседей, паттерн определения весов и метод расчета расстояния**

## GaussianNB

In [23]:
# Estimator and hyperparameter grid for the Gaussian naive Bayes search;
# var_smoothing is the only parameter explored.
gnb = GaussianNB()
gnb_params = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

In [24]:
# Run the GaussianNB grid search on the p-value-selected features only; the
# target column is appended so the helper can find it in the frame.
best_parameters_gnb = grid_search_cv_and_validate(
    gnb,
    gnb_params,
    train_scaled_new[p_features + ["IsBadBuy"]],
    val_scaled_new[p_features + ["IsBadBuy"]],
    "IsBadBuy",
)

Запускаю GridSearchCV для 'GaussianNB'...

--- Проверка топ-5 моделей на валидационном сете ---
Параметры: {'var_smoothing': 1e-05}
  - Средний Gini на CV (train): 0.4453
  - Gini на Validation:          0.4427
Параметры: {'var_smoothing': 1e-09}
  - Средний Gini на CV (train): 0.4453
  - Gini на Validation:          0.4427
Параметры: {'var_smoothing': 1e-08}
  - Средний Gini на CV (train): 0.4453
  - Gini на Validation:          0.4427
Параметры: {'var_smoothing': 1e-07}
  - Средний Gini на CV (train): 0.4453
  - Gini на Validation:          0.4427
Параметры: {'var_smoothing': 1e-06}
  - Средний Gini на CV (train): 0.4453
  - Gini на Validation:          0.4427

--- Финальная лучшая модель (по Gini на Validation) ---
Лучшие параметры: {'var_smoothing': 1e-05}
Gini на Train:      0.4646
Gini на Validation: 0.4427


**В целом тюнить нет необходимости, однако можно проверить показатель var_smoothing, который корректирует дисперсию каждого признака**

# Оценка финальной модели

In [25]:
# Rebuild each estimator from the parameters returned by its grid search.
lr_best = LogisticRegression(**best_parameters) 
knn_best = KNeighborsClassifier(**best_parameters_knn)
gnb_best = GaussianNB(**best_parameters_gnb)

In [26]:
# Final train/validation/test check of the tuned logistic regression on the
# full extended feature set.
evaluate_final_model(
    lr_best,
    train_scaled_new,
    val_scaled_new,
    test_scaled_new,
    "IsBadBuy",
)

Обучение финальной модели 'LogisticRegression'...

--- Финальная оценка Gini ---


Unnamed: 0_level_0,Gini
Dataset,Unnamed: 1_level_1
Train,0.504
Validation,0.48
Test,0.349


**У LogisticRegression наблюдаются признаки переобучения - резкое снижение качества между валидацией и тестом**

In [27]:
# Final train/validation/test check of the tuned KNN on the p-value-selected
# features, the same subset used in its grid search.
evaluate_final_model(
    knn_best,
    train_scaled_new[p_features + ["IsBadBuy"]],
    val_scaled_new[p_features + ["IsBadBuy"]],
    test_scaled_new[p_features + ["IsBadBuy"]],
    "IsBadBuy",
)

Обучение финальной модели 'KNeighborsClassifier'...

--- Финальная оценка Gini ---


Unnamed: 0_level_0,Gini
Dataset,Unnamed: 1_level_1
Train,1.0
Validation,0.45
Test,0.464


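**У KNeighborsClassifier переобучения не наблюдается - метрики валидации и теста крайне близки. Метрику трейна не учитываем: Gini = 1.0 на трейне, по-видимому, объясняется взвешиванием соседей по расстоянию (weights='distance'), при котором каждый объект обучающей выборки является собственным ближайшим соседом с нулевым расстоянием**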

In [28]:
# Final train/validation/test check of the tuned GaussianNB on the
# p-value-selected features, the same subset used in its grid search.
evaluate_final_model(
    gnb_best,
    train_scaled_new[p_features + ["IsBadBuy"]],
    val_scaled_new[p_features + ["IsBadBuy"]],
    test_scaled_new[p_features + ["IsBadBuy"]],
    "IsBadBuy",
)

Обучение финальной модели 'GaussianNB'...

--- Финальная оценка Gini ---


Unnamed: 0_level_0,Gini
Dataset,Unnamed: 1_level_1
Train,0.465
Validation,0.443
Test,0.441


**У GaussianNB переобучения не наблюдается - метрики трейна, валидации и теста крайне близки**

# Оценка классификационных метрик

In [29]:
# Fresh, unfitted copies of the tuned models for the hard-label metric comparison.
models_to_test_corr = [
    LogisticRegression(**best_parameters),
    GaussianNB(**best_parameters_gnb),
    KNeighborsClassifier(**best_parameters_knn),
]
# Extra feature pool to evaluate alongside the full feature set.
feature_pools_dict_corr = {"P-value Selected Features": p_features}

In [30]:
# Report recall, precision, F1 and PR AUC for each tuned model on all three
# splits, for the full feature set and for the p-value-selected pool.
evaluate_classification_metrics(
    train_scaled_new,
    val_scaled_new,
    test_scaled_new,
    "IsBadBuy",
    models=models_to_test_corr,
    feature_pools=feature_pools_dict_corr,
)

--- Оценка набора признаков: 'Full Features' (45 признаков) ---


Model,LogisticRegression,LogisticRegression,LogisticRegression,GaussianNB,GaussianNB,GaussianNB,KNeighborsClassifier,KNeighborsClassifier,KNeighborsClassifier
Dataset,Train,Validation,Test,Train,Validation,Test,Train,Validation,Test
Recall,0.607211,0.58976,0.680354,0.265439,0.218233,0.291851,1.0,0.005932,0.008528
Precision,0.245731,0.268706,0.192442,0.463186,0.534813,0.194772,1.0,1.0,0.870968
F1-score,0.349873,0.369198,0.300021,0.337479,0.309978,0.233628,1.0,0.011794,0.016891
AUC PR,0.401769,0.380415,0.185884,0.346521,0.341033,0.164243,1.0,0.384028,0.352669


--- Оценка набора признаков: 'P-value Selected Features' (15 признаков) ---


Model,LogisticRegression,LogisticRegression,LogisticRegression,GaussianNB,GaussianNB,GaussianNB,KNeighborsClassifier,KNeighborsClassifier,KNeighborsClassifier
Dataset,Train,Validation,Test,Train,Validation,Test,Train,Validation,Test
Recall,0.604143,0.575398,0.641503,0.285002,0.302217,0.441567,1.0,0.202623,0.210044
Precision,0.240055,0.265524,0.236686,0.497323,0.497686,0.339815,1.0,0.648352,0.846056
F1-score,0.343586,0.363368,0.34579,0.362351,0.376068,0.384066,1.0,0.308754,0.336538
AUC PR,0.400433,0.381197,0.435983,0.364703,0.358891,0.390579,1.0,0.40482,0.432562


# Which hard label metric do you prefer for the task of detecting "lemon" cars?
* Поскольку цена пропуска "лимона" неизмеримо выше, чем цена ложной тревоги, главная задача — минимизировать количество пропущенных "лимонов". Как следствие, лучшая метрика - это Recall