In [1]:
from IPython.core.display import HTML
# CSS overrides for the notebook page: a wider container, a wider prompt column,
# a vertically scrollable output area, and no max-width cap on output images/sub-areas.
style = """
<style>
.container {width: 90% !important;}
div.prompt {min-width: 12ex;}
div.output_area {overflow-y: scroll;}
div.output_area img {max-width: unset;}
div.output_subarea {max-width: unset;}
</style>
"""
# Last expression of the cell: the HTML object is displayed as the cell's rich output.
HTML(style)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score

In [3]:
import matplotlib as mpl
# Replace every matplotlib rcParam whose value equals the string 'black' with the
# hex colour '#28324a'. Only existing keys are reassigned while iterating.
for k, v in mpl.rcParams.items():
    if v == 'black':
        mpl.rcParams[k] = '#28324a'

### Подготовка данных

Оформим результаты разведочного анализа в виде функции pre_split_filter.

In [4]:
def pre_split_filter(df, drop_outliers, max_screen_width=19000):
    """Apply the deterministic, pre-train/test-split cleaning to the raw frame.

    Steps, in order:
      1. Drop ``device_model`` and ``geo_country``.
      2. Lower-case the device/geo text columns.
      3. Unify missing markers: NaN and ``'(not set)'`` both become ``'(none)'``.
      4. Drop rows without a browser; keep only alphabetic characters in the
         browser name.
      5. Fill a missing ``device_brand`` with ``'apple'`` when the OS is
         ``macintosh`` or ``ios``, then drop ``device_os``.
      6. Parse ``device_screen_resolution`` (``'<int>x<int>'``) and drop rows
         whose resolution cannot be parsed or has a non-positive side.
      7. Derive ``device_screen_area`` (longer side * shorter side) and
         ``device_screen_aspect`` (longer side / shorter side).
      8. Optionally drop rows whose longer side exceeds ``max_screen_width``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with string-valued columns. It is not modified.
    drop_outliers : bool
        When true, rows whose longer screen side is above ``max_screen_width``
        are removed.
    max_screen_width : int, default 19000
        Upper bound for the longer screen side used by ``drop_outliers``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original index labels of the surviving rows; the
        two derived screen columns are appended last.

    Raises
    ------
    KeyError
        If any of the expected input columns is absent.
    """
    # ``drop`` returns a new frame, so the caller's data is never touched.
    df = df.drop(columns=['device_model', 'geo_country'])

    cols_to_lower = ['device_category', 'device_os', 'device_brand', 'device_browser', 'geo_city']
    df[cols_to_lower] = df[cols_to_lower].apply(lambda col: col.str.lower())

    df = df.fillna('(none)').replace('(not set)', '(none)')

    # device categorical columns

    # Explicit copy: the frame is assigned to below, and writing into a
    # boolean-filtered slice would raise SettingWithCopyWarning.
    df = df[df['device_browser'] != '(none)'].copy()
    df['device_browser'] = df['device_browser'].apply(
        lambda name: ''.join(filter(str.isalpha, name))
    )

    # An unknown brand on an Apple operating system can only be Apple.
    # (device_os itself is dropped right after, so it is not back-filled.)
    apple_os = df['device_os'].isin(['macintosh', 'ios'])
    df.loc[apple_os & (df['device_brand'] == '(none)'), 'device_brand'] = 'apple'
    df = df.drop(columns=['device_os'])

    # device numerical columns

    # Rows that do not look like '<digits>x<digits>' (e.g. the '(none)'
    # placeholder) become NaN here and are filtered out below instead of
    # aborting the whole run with a ValueError.
    dims = (
        df['device_screen_resolution']
        .str.extract(r'^\s*(\d+)\s*x\s*(\d+)\s*$')
        .apply(pd.to_numeric, errors='coerce')
    )
    valid = (dims[0] > 0) & (dims[1] > 0)
    if drop_outliers:
        valid = valid & (dims.max(axis=1) <= max_screen_width)

    dims = dims[valid].astype(int)
    # Orientation-independent: the longer side plays the role of the width.
    longer, shorter = dims.max(axis=1), dims.min(axis=1)

    return (
        df[valid]
        .drop(columns=['device_screen_resolution'])
        .assign(
            device_screen_area=longer * shorter,
            device_screen_aspect=longer / shorter,
        )
    )

Заполнение пропусков в поле device_brand условной модой у мобильных устройств и планшетов. Apple исключаем из рассмотрения, т.к. пропуски у Apple-устройств заполняются детерминированно перед разбиением на train/test. Пропуски в поле geo_city заполняются модой по всем строкам, переданным в fit.

In [5]:
class Imputer(BaseEstimator, TransformerMixin):
    """Mode-impute the '(none)' placeholder in ``device_brand`` and ``geo_city``.

    ``device_brand`` is imputed only on rows that are neither Apple-branded nor
    desktop devices, with the most frequent brand found among such rows during
    ``fit``. ``geo_city`` is imputed on every row with the most frequent city
    found during ``fit``. The input must be a DataFrame that has the columns
    ``device_brand``, ``device_category`` and ``geo_city``.
    """

    def __init__(self):
        self.device_brand_imp = SimpleImputer(strategy='most_frequent', missing_values='(none)')
        self.geo_city_imp = SimpleImputer(strategy='most_frequent', missing_values='(none)')

    @staticmethod
    def _brand_rows(X):
        """Boolean mask of the rows whose brand is eligible for imputation."""
        not_apple = X['device_brand'] != 'apple'
        not_desktop = X['device_category'] != 'desktop'
        return not_apple & not_desktop

    def fit(self, X, y=None):
        """Learn both modes from ``X`` and remember its column labels."""
        eligible = self._brand_rows(X)
        self.device_brand_imp.fit(X.loc[eligible, ['device_brand']])
        self.geo_city_imp.fit(X.loc[:, ['geo_city']])
        self.output_features_ = X.columns
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the placeholders replaced."""
        eligible = self._brand_rows(X)
        Xt = X.copy()
        # Skip the brand imputer entirely when no row qualifies.
        if eligible.any():
            brand_column = Xt.loc[eligible, ['device_brand']]
            Xt.loc[eligible, ['device_brand']] = self.device_brand_imp.transform(brand_column)
        city_column = Xt.loc[:, ['geo_city']]
        Xt.loc[:, ['geo_city']] = self.geo_city_imp.transform(city_column)
        return Xt

    def get_feature_names_out(self, input_features=None):
        """Column labels seen in ``fit``; ``input_features`` is ignored."""
        return self.output_features_

### Моделирование

In [6]:
import pprint

# Running comparison table: one row per metric; print_stats merges one column per model name into it.
summary = pd.DataFrame({}, index=['CV Train', 'CV Test', 'CV Overfit', 'Test', 'Time'])

def print_stats(model, model_name):
    """Report a fitted grid search and record its scores in the global ``summary``.

    Prints the best hyper-parameters, displays a one-column table and merges
    that column into ``summary``. The column holds, for the best candidate:
    mean CV train score, mean CV test score, their relative gap, ROC AUC on the
    module-level ``X_test``/``y_test``, and mean fit time plus mean score time,
    each divided by 60.

    Parameters
    ----------
    model : fitted search object exposing ``best_params_``, ``best_index_``,
        ``cv_results_`` (with train scores) and ``predict_proba``.
    model_name : label of the column created for this model.
    """
    global summary

    print('')
    pprint.pprint(model.best_params_)
    print('')

    def at_best(key):
        # cv_results_ entry at the position of the best candidate.
        return model.cv_results_[key][model.best_index_]

    train_score = at_best('mean_train_score')
    valid_score = at_best('mean_test_score')
    holdout_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    elapsed = at_best('mean_fit_time') / 60 + at_best('mean_score_time') / 60

    column = [
        train_score,
        valid_score,
        (train_score - valid_score) / valid_score,
        holdout_score,
        elapsed,
    ]
    model_summary = pd.DataFrame({model_name: column}, index=summary.index).round(3)
    display(model_summary)

    # Values from the new column win over an existing column of the same name.
    summary = model_summary.combine_first(summary)

In [7]:
# Read every column as a string, then apply the deterministic pre-split cleaning.
data_dir = '../../Final_work_data'
df = pre_split_filter(pd.read_csv(os.path.join(data_dir, 'ml_data.csv'), dtype=str), drop_outliers=True)
df['target'] = df['target'].astype(int)

X = df.drop(columns=['target'])
y = df['target']
del df  # the combined frame is not needed once X and y exist

# Every column except the engineered device_screen_* features is treated as categorical.
cols_cat = [x for x in X.columns if not x.startswith('device_screen')]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=11)
# Negative-to-positive ratio, taken from the training labels only so that no
# statistic of the held-out test labels feeds into model configuration.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)
# Size of the training part of one CV split, as a float (no rounding is applied).
len_cv_train = (cv.n_splits - 1)  * len(X_train) / cv.n_splits

In [8]:
import multiprocessing
# CPU count reported by multiprocessing; used as n_jobs for the classifiers in the modelling cells.
CPU_COUNT = multiprocessing.cpu_count()

In [9]:
import warnings
def ignore_warnings(message):
    """Suppress warnings whose text starts with ``message``.

    The filter is installed in the current interpreter and also recorded in
    the ``PYTHONWARNINGS`` environment variable, which Python processes
    started afterwards read at start-up.

    Any value already present in ``PYTHONWARNINGS`` is kept: the new rule is
    appended to the comma-separated list (once), so repeated calls with
    different messages accumulate instead of overwriting one another.

    Parameters
    ----------
    message : str
        Start of the warning text to ignore.
    """
    warnings.filterwarnings('ignore', message=message)

    rule = 'ignore:{}'.format(message)
    rules = [r for r in os.environ.get('PYTHONWARNINGS', '').split(',') if r]
    if rule not in rules:
        rules.append(rule)
    os.environ['PYTHONWARNINGS'] = ','.join(rules)
# Silence the "Found unknown categories" warning via ignore_warnings (warning filter + PYTHONWARNINGS).
ignore_warnings('Found unknown categories')

# One-hot encode the categorical columns (cols_cat); every other column passes through unchanged.
# Unknown categories are mapped to the infrequent bucket when one exists; the
# min_frequency that defines that bucket is supplied through the grid-search parameters.
encoder = make_column_transformer((OneHotEncoder(drop='if_binary', 
                                                 handle_unknown='infrequent_if_exist'), 
                                   cols_cat), 
                                  remainder='passthrough')

In [47]:
# XGBoost run: max_depth=4, n_estimators=100, one-hot min_frequency=0.0125.
# Pipeline order: impute -> one-hot encode -> random oversampling -> classifier.
pipe = ImbPipeline(steps=[
    ('imp', Imputer()),
    ('enc', encoder),
    ('res', RandomOverSampler(random_state=11)),
    ('clf', XGBClassifier(random_state=11, n_jobs=CPU_COUNT))
])

# Single-candidate grid: GridSearchCV is used to get cross-validated train/test ROC AUC and timings.
param = [{'enc__onehotencoder__min_frequency': [0.0125],
          'clf__n_estimators': [100], 
          'clf__max_depth': [4]}]

xgb = GridSearchCV(pipe, param, cv=cv, scoring='roc_auc', return_train_score=True)
%time xgb.fit(X_train, y_train)
# NOTE(review): every XGB cell reports under the same name 'XGB', so the column in `summary` is replaced by the latest run.
print_stats(xgb, 'XGB')

CPU times: user 30min 42s, sys: 30 s, total: 31min 12s
Wall time: 5min 4s

{'clf__max_depth': 4,
 'clf__n_estimators': 100,
 'enc__onehotencoder__min_frequency': 0.0125}



Unnamed: 0,XGB
CV Train,0.694
CV Test,0.68
CV Overfit,0.021
Test,0.687
Time,1.062


In [48]:
# XGBoost run: max_depth=6, n_estimators=100, one-hot min_frequency=0.0125.
# Pipeline order: impute -> one-hot encode -> random oversampling -> classifier.
pipe = ImbPipeline(steps=[
    ('imp', Imputer()),
    ('enc', encoder),
    ('res', RandomOverSampler(random_state=11)),
    ('clf', XGBClassifier(random_state=11, n_jobs=CPU_COUNT))
])

# Single-candidate grid: GridSearchCV is used to get cross-validated train/test ROC AUC and timings.
param = [{'enc__onehotencoder__min_frequency': [0.0125],
          'clf__n_estimators': [100], 
          'clf__max_depth': [6]}]

xgb = GridSearchCV(pipe, param, cv=cv, scoring='roc_auc', return_train_score=True)
%time xgb.fit(X_train, y_train)
# NOTE(review): every XGB cell reports under the same name 'XGB', so the column in `summary` is replaced by the latest run.
print_stats(xgb, 'XGB')

CPU times: user 44min 25s, sys: 42.1 s, total: 45min 7s
Wall time: 6min 48s

{'clf__max_depth': 6,
 'clf__n_estimators': 100,
 'enc__onehotencoder__min_frequency': 0.0125}



Unnamed: 0,XGB
CV Train,0.718
CV Test,0.679
CV Overfit,0.057
Test,0.688
Time,1.445


In [11]:
# XGBoost run: max_depth=4, n_estimators=1000, one-hot min_frequency=0.0125.
# Pipeline order: impute -> one-hot encode -> random oversampling -> classifier.
pipe = ImbPipeline(steps=[
    ('imp', Imputer()),
    ('enc', encoder),
    ('res', RandomOverSampler(random_state=11)),
    ('clf', XGBClassifier(random_state=11, n_jobs=CPU_COUNT))
])

# Single-candidate grid: GridSearchCV is used to get cross-validated train/test ROC AUC and timings.
param = [{'enc__onehotencoder__min_frequency': [0.0125],
          'clf__n_estimators': [1000], 
          'clf__max_depth': [4]}]

xgb = GridSearchCV(pipe, param, cv=cv, scoring='roc_auc', return_train_score=True)
%time xgb.fit(X_train, y_train)
# NOTE(review): every XGB cell reports under the same name 'XGB', so the column in `summary` is replaced by the latest run.
print_stats(xgb, 'XGB')

CPU times: user 4h 2min 58s, sys: 6.49 s, total: 4h 3min 4s
Wall time: 36min 20s

{'clf__max_depth': 4,
 'clf__n_estimators': 1000,
 'enc__onehotencoder__min_frequency': 0.0125}



Unnamed: 0,XGB
CV Train,0.733
CV Test,0.675
CV Overfit,0.086
Test,0.686
Time,7.708


In [49]:
# XGBoost run: max_depth=6, n_estimators=100, one-hot min_frequency=0.025.
# Pipeline order: impute -> one-hot encode -> random oversampling -> classifier.
pipe = ImbPipeline(steps=[
    ('imp', Imputer()),
    ('enc', encoder),
    ('res', RandomOverSampler(random_state=11)),
    ('clf', XGBClassifier(random_state=11, n_jobs=CPU_COUNT))
])

# Single-candidate grid: GridSearchCV is used to get cross-validated train/test ROC AUC and timings.
param = [{'enc__onehotencoder__min_frequency': [0.025],
          'clf__n_estimators': [100], 
          'clf__max_depth': [6]}]

xgb = GridSearchCV(pipe, param, cv=cv, scoring='roc_auc', return_train_score=True)
%time xgb.fit(X_train, y_train)
# NOTE(review): every XGB cell reports under the same name 'XGB', so the column in `summary` is replaced by the latest run.
print_stats(xgb, 'XGB')

CPU times: user 43min 34s, sys: 40.4 s, total: 44min 14s
Wall time: 6min 38s

{'clf__max_depth': 6,
 'clf__n_estimators': 100,
 'enc__onehotencoder__min_frequency': 0.025}



Unnamed: 0,XGB
CV Train,0.714
CV Test,0.679
CV Overfit,0.051
Test,0.686
Time,1.394
