In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Path of the CSV file with the prepared dataset; read by pd.read_csv in the next cell.
data = 'online_shoppers_intention_for_model.csv'

In [3]:
# Load the dataset from the CSV path stored in `data`.
df = pd.read_csv(data)
# Bare last expression: renders the frame (pandas truncates the middle rows in the display).
df

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.000000,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.000000,0.000000,0.000000,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.000000,0.200000,0.000000,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.050000,0.000000,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.500000,0.020000,0.000000,0.0,Feb,3,3,1,4,Returning_Visitor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.000000,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.000000,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.000000,0.0,Nov,2,2,3,11,Returning_Visitor,False,False


In [4]:
# These columns hold numeric codes in the displayed rows; store them as object
# dtype so they are handled as categories rather than as quantities.
coded_cols = ['OperatingSystems', 'Browser', 'Region', 'TrafficType']
df[coded_cols] = df[coded_cols].astype(object)

In [5]:
# Print column dtypes and non-null counts to confirm the object cast above took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   PageValues               12330 non-null  float64
 8   SpecialDay               12330 non-null  float64
 9   Month                    12330 non-null  object 
 10  OperatingSystems         12330 non-null  object 
 11  Browser                  12330 non-null  object 
 12  Region                   12330 non-null  object 
 13  TrafficType              12330 non-null  object 
 14  VisitorType           

In [6]:
# Numeric features: page counts/durations per page type plus session-level rates.
num_cols = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'PageValues',
    'SpecialDay',
]

# Categorical features (strings, booleans and the code columns cast to object).
cat_cols = [
    'Month',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'VisitorType',
    'Weekend',
]

#### Построим модель knn на числовых признаках с параметрами по умолчанию

In [7]:
# Feature matrix restricted to the numeric columns; the target is the Revenue flag.
X = df.loc[:, num_cols]
y = df['Revenue']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# Baseline: k-NN with default hyperparameters on the raw (unscaled) numeric features.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Hard class labels for accuracy.
pred_knn = knn.predict(X_test)
# Column 1 of predict_proba is the second entry of knn.classes_ (the True class
# for a boolean target); these scores feed ROC AUC.
pred_proba_knn = knn.predict_proba(X_test)[:, 1]

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [13]:
# (accuracy of hard predictions, ROC AUC of positive-class probabilities) for the baseline model.
accuracy_score(y_test, pred_knn), roc_auc_score(y_test, pred_proba_knn)

(0.851581508515815, 0.7573167338578389)

#### Выполним стандартизацию признаков

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Learn the per-feature mean/std on the training split only, then apply the same
# transformation to both splits so no test statistics leak into the scaler.
sc = StandardScaler()
sc.fit(X_train)
X_trainsc = sc.transform(X_train)
X_testsc = sc.transform(X_test)

In [16]:
# Re-attach the original feature names to the scaled data.
# No index is passed, so frames built from plain arrays get a fresh 0..n-1 index
# instead of the row labels of X_train / X_test.
feature_names = X.columns
X_trainsc, X_testsc = (
    pd.DataFrame(scaled, columns=feature_names)
    for scaled in (X_trainsc, X_testsc)
)

In [17]:
# Refit the same default k-NN estimator, this time on the standardized features.
# NOTE: this overwrites the fitted state `knn` held from the unscaled baseline.
pred_knn2 = knn.fit(X_trainsc, y_train).predict(X_testsc)
# Probability of the second class in knn.classes_, used for ROC AUC.
pred_proba_knn2 = knn.predict_proba(X_testsc)[:, 1]

In [18]:
# (accuracy, ROC AUC) for the default k-NN trained on standardized features.
accuracy_score(y_test, pred_knn2), roc_auc_score(y_test, pred_proba_knn2)

(0.8819951338199513, 0.814394894654898)

Получили значительное улучшение метрик

#### Проведем подбор гиперпараметров с помощью GridSearch

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Search space: 9 neighbourhood sizes x 2 weighting schemes x 2 Minkowski powers
# (p=1 Manhattan, p=2 Euclidean) = 36 candidates.
params = dict(
    n_neighbors=np.arange(5, 50, 5),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

# 3-fold cross-validated search optimising ROC AUC, using all available cores.
# NOTE(review): X_trainsc was standardized with statistics of the whole training
# split before cross-validation, so every validation fold has influenced the
# scaling. Wrapping the scaler and k-NN in a Pipeline would remove that leakage.
gs = GridSearchCV(
    estimator=knn,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
gs.fit(X_trainsc, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [21]:
# Mean cross-validated ROC AUC of the winning candidate and its hyperparameters.
gs.best_score_, gs.best_params_

(0.8872772436231374, {'n_neighbors': 45, 'p': 1, 'weights': 'distance'})

In [22]:
# best_estimator_ is the winning configuration refit by GridSearchCV on the data passed to fit().
best_knn = gs.best_estimator_
pred_knn3 = best_knn.predict(X_testsc)
pred_proba_knn3 = best_knn.predict_proba(X_testsc)[:, 1]

In [23]:
# (accuracy, ROC AUC) on the held-out test split for the tuned numeric-feature model.
accuracy_score(y_test, pred_knn3), roc_auc_score(y_test, pred_proba_knn3)

(0.8803730738037308, 0.8810390655987118)

Получилось поднять целевую метрику roc_auc до 0.88

#### Попробуем добавить в модель категориальные признаки

In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [25]:
# All columns except the target, i.e. numeric and categorical features together.
X_full = df.drop(columns='Revenue')

# Same test fraction and seed as the numeric-only split.
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# One-hot encode the categorical columns; categories unseen during fit are
# encoded as all zeros instead of raising.
encode_step = ('ohe', OneHotEncoder(handle_unknown="ignore"), cat_cols)
# Standardize the numeric columns.
scale_step = ('scaling', StandardScaler(), num_cols)

# Output columns follow the step order: one-hot columns first, scaled numeric columns after.
ct = ColumnTransformer(transformers=[encode_step, scale_step])

In [27]:
# NOTE(review): these lists repeat, value for value, the definitions made near
# the top of the notebook, and `ct` above has already captured those; this cell
# is redundant on a top-to-bottom run.
num_cols = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'PageValues', 'SpecialDay',
]
cat_cols = [
    'Month', 'OperatingSystems', 'Browser', 'Region',
    'TrafficType', 'VisitorType', 'Weekend',
]

In [28]:
# Fit the encoder/scaler on the training split only, then reuse the fitted
# transformer for the test split. Tuple elements are evaluated left to right,
# so the fit happens before the test transform.
X_train_transformed, X_test_transformed = (
    ct.fit_transform(X_train_full),
    ct.transform(X_test_full),
)

In [29]:
# Column labels for the transformed matrix, in the order `ct` emits them:
# the generated one-hot names first, then the numeric columns (scaled, names unchanged).
ohe_names = ct.named_transformers_['ohe'].get_feature_names_out()
new_features = [*ohe_names, *num_cols]

In [30]:
def _as_feature_frame(matrix, columns):
    """Return `matrix` as a dense DataFrame whose columns are labelled `columns`.

    Accepts a SciPy sparse matrix (anything exposing ``toarray``), a NumPy
    array, or a DataFrame produced by an earlier run of this cell. The
    original code called ``.toarray()`` unconditionally, so it failed when
    the cell was re-run (a DataFrame has no ``toarray``) or when the
    ColumnTransformer returned a dense array.
    """
    dense = matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)
    return pd.DataFrame(dense, columns=columns)


# Densify the transformer output and attach readable column names.
X_train_transformed = _as_feature_frame(X_train_transformed, new_features)
X_test_transformed = _as_feature_frame(X_test_transformed, new_features)

In [31]:
# Refit the default k-NN on the combined one-hot + standardized feature set.
# NOTE: this again overwrites the fitted state of the shared `knn` object.
pred_knn4 = knn.fit(X_train_transformed, y_train_full).predict(X_test_transformed)
# Probability of the second class in knn.classes_, used for ROC AUC.
pred_proba_knn4 = knn.predict_proba(X_test_transformed)[:, 1]

In [32]:
# (accuracy, ROC AUC) for the default k-NN trained on numeric + categorical features.
accuracy_score(y_test_full, pred_knn4), roc_auc_score(y_test_full, pred_proba_knn4)

(0.8673965936739659, 0.7834502518928966)

Метрики получились хуже, чем у модели на стандартизованных числовых признаках (accuracy 0.867 против 0.882, roc_auc 0.783 против 0.814)

#### Проведем подбор гиперпараметров с помощью GridSearch

In [33]:
# Same grid and CV settings as the numeric-only search, applied to the full feature set.
gs2 = GridSearchCV(
    estimator=knn,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
gs2.fit(X_train_transformed, y_train_full)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [34]:
# Mean cross-validated ROC AUC of the winning candidate and its hyperparameters (full feature set).
gs2.best_score_, gs2.best_params_

(0.8673293760941267, {'n_neighbors': 45, 'p': 1, 'weights': 'distance'})

In [35]:
# Winning configuration of the second search, refit by GridSearchCV on the data passed to fit().
best_knn_full = gs2.best_estimator_
pred_knn5 = best_knn_full.predict(X_test_transformed)
pred_proba_knn5 = best_knn_full.predict_proba(X_test_transformed)[:, 1]

In [36]:
# (accuracy, ROC AUC) on the held-out test split for the tuned full-feature model.
accuracy_score(y_test_full, pred_knn5), roc_auc_score(y_test_full, pred_proba_knn5)

(0.8560421735604218, 0.8758863610800314)

Метрики выросли, но остались хуже, чем у лучшей модели на числовых признаках

#### Построим сводную таблицу по всем обученным моделям

In [37]:
model = ['num_features', 'num_features_standart', 'num_features_gs', '+cat_features', '+cat_features_gs']
accuracy_score = [accuracy_score(y_test, pred_knn), accuracy_score(y_test, pred_knn2), accuracy_score(y_test, pred_knn3), accuracy_score(y_test_full, pred_knn4), accuracy_score(y_test_full, pred_knn5)]
roc_auc_score = [roc_auc_score(y_test, pred_proba_knn), roc_auc_score(y_test, pred_proba_knn2), roc_auc_score(y_test, pred_proba_knn3), roc_auc_score(y_test_full, pred_proba_knn4), roc_auc_score(y_test_full, pred_proba_knn5)]

In [38]:
res = pd.DataFrame({'model': model,
                    'accuracy' : accuracy_score,
                    'roc_auc' : roc_auc_score}).set_index('model')

In [39]:
# Display the comparison table of all trained models.
res

Unnamed: 0_level_0,accuracy,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1
num_features,0.851582,0.757317
num_features_standart,0.881995,0.814395
num_features_gs,0.880373,0.881039
+cat_features,0.867397,0.78345
+cat_features_gs,0.856042,0.875886


#### Модель на числовых признаках показывает результаты лучше, чем модель с добавлением категориальных признаков, к тому же она намного легче по требуемым ресурсам для вычислений. Продолжим работу с этой моделью

### Построим интерактивный дашборд с помощью библиотеки ExplainerDashboard для анализа работы нашей модели и объяснения предсказаний модели

In [40]:
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

In [41]:
from IPython.core.interactiveshell import InteractiveShell
# Make IPython display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [43]:
# Explain the tuned numeric-feature model on the first rows of the test split only;
# presumably the cap keeps the SHAP computation affordable (TODO confirm).
n_explained = 1000
explainer = ClassifierExplainer(
    gs.best_estimator_,
    X_testsc.head(n_explained),
    y_test.head(n_explained),
)

Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')


In [44]:
# Build the dashboard for rendering inside the notebook ('inline' mode).
# Construction triggers the SHAP / metric / permutation-importance computations logged below.
db = ExplainerDashboard(explainer, mode = 'inline')

Building ExplainerDashboard..
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...


  0%|          | 0/1000 [00:00<?, ?it/s]

Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating predictions...
Calculating pred_percentiles...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...


In [None]:
# Persist the explainer, including its calculated values, so it can be reloaded later
# with ClassifierExplainer.from_file instead of being recomputed.
# NOTE(review): this cell has no execution count, so it was not run in the saved session.
explainer.dump('num_features.dill')

In [45]:
# Serve the dashboard on port 8054; stop it with ExplainerDashboard.terminate(8054).
db.run(port = 8054)

Starting ExplainerDashboard inline (terminate it with ExplainerDashboard.terminate(8054))


### Проведем анализ работы модели с помощью функционала дашборда:

#### Важность признаков:
    Самым значимым признаком по методам Shapley и Permutation Importance является PageValues, другие признаки вносят гораздо меньший вклад в прогноз модели

#### Метрики модели:
    Модель имеет высокие показатели accuracy и roc_auc (0.87 и 0.88), что говорит о хорошем общем качестве предсказаний модели.
    При этом мы имеем сильный дисбаланс классов и показатель recall 0.38, который означает, что модель находит только 38% целевых объектов. Если основной задачей модели является нахождение целевых объектов, для улучшения данной метрики можно изменить параметры модели. Например, при установке порога классификации 0.27 модель будет выдавать recall 0.66 при precision 0.6.
    

#### Примеры предсказаний модели:
    Объект с индексом 534 имеет вероятность оказаться целевым - 43.1%. Высокий показатель PageValues увеличивает вероятность на 34% от базовых 13.6%, но малые показатели посещения Informational и ProductRelated страниц снижают ее.
    Объект с индексом 621 имеет вероятность оказаться целевым - 1.5%. Низкий PageValues и показатели посещения страниц сильно уменьшают вероятность.
    Объект с индексом 688 имеет вероятность оказаться целевым - 94.5%. Очень высокий показатель PageValues увеличивает вероятность на 62.75%, остальные признаки также добавляют вероятности.