<a href="https://colab.research.google.com/github/AlexeyTri/MLSeminars/blob/main/MLHSE/Seminar4_MLDS_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**План семинара**


Линейный классификатор в задаче бинарной классификации

Кодирование категориальных признаков

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/%D0%97%D0%B0%D0%BD%D1%8F%D1%82%D0%B8%D0%B5%204/bike_buyers_clean.csv')

In [5]:
data.dtypes

ID                   int64
Marital Status      object
Gender              object
Income               int64
Children             int64
Education           object
Occupation          object
Home Owner          object
Cars                 int64
Commute Distance    object
Region              object
Age                  int64
Purchased Bike      object
dtype: object

In [7]:
X = data.iloc[:, :-1]
X.drop(columns='ID', inplace=True)

In [9]:
y = data['Purchased Bike']

In [11]:
num_col = X.columns[X.dtypes == 'int64'].tolist()
cat_col = X.columns[X.dtypes == 'object']

In [12]:
binary_cols = cat_col[X[cat_col].nunique == 2].tolist()
ordinal_cols = ['Commute Distance', 'Education']
cat_cols = cat_col.difference(binary_cols + ordinal_cols).tolist()

In [15]:
X.describe()

Unnamed: 0,Income,Children,Cars,Age
count,1000.0,1000.0,1000.0,1000.0
mean,56140.0,1.908,1.452,44.19
std,31081.609779,1.626094,1.124705,11.353537
min,10000.0,0.0,0.0,25.0
25%,30000.0,0.0,1.0,35.0
50%,60000.0,2.0,1.0,43.0
75%,70000.0,3.0,2.0,52.0
max,170000.0,5.0,4.0,89.0


In [16]:
y = (y == 'Yes').astype('int')

In [17]:
y

0      0
1      0
2      0
3      1
4      1
      ..
995    1
996    1
997    1
998    0
999    1
Name: Purchased Bike, Length: 1000, dtype: int64

**Подготовка данных**


Кодирование категориальных признаков

In [18]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 KB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.0


In [19]:
from category_encoders.ordinal import OrdinalEncoder # LabelEncoder
from category_encoders.one_hot import OneHotEncoder # OneHotEncoding
from category_encoders.target_encoder import TargetEncoder # счетчики+сглаживание

In [20]:
X['Education'].unique()

array(['Bachelors', 'Partial College', 'High School',
       'Partial High School', 'Graduate Degree'], dtype=object)

In [21]:
# Ordinal: from categories to numbers

ord_enc = OrdinalEncoder()
ord_enc.fit_transform(X['Education'])

Unnamed: 0,Education
0,1
1,2
2,2
3,1
4,1
...,...
995,3
996,5
997,1
998,1


In [26]:
# One hot: from k categories to k dummy columns

one_hot_enc = OneHotEncoder()

one_hot_enc.fit_transform(X['Education'], drop=True)
# * fit -> определить количество новых столбцов (по кол-ву категорий)
# * transform -> создать новые столбцы
# * fit_transform = fit + transform

# Нужно ли удалять какую-то из колонок после такого кодирования ?

Unnamed: 0,Education_1,Education_2,Education_3,Education_4,Education_5
0,1,0,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
995,0,0,1,0,0
996,0,0,0,0,1
997,1,0,0,0,0
998,1,0,0,0,0


In [27]:
# target encoding: from k categories to posterior probabilites of y == 1 - P(y==1 | category == c1)

tgt_enc = TargetEncoder(smoothing=1)

# smoothing - это коэффициент сглаживания alpha, чем он больше, тем больше регуляризация

tgt_enc.fit_transform(X['Education'], y)

Unnamed: 0,Education
0,0.552288
1,0.449057
2,0.449057
3,0.552288
4,0.552288
...,...
995,0.441341
996,0.540230
997,0.552288
998,0.552288


In [28]:
# энкодер можно применять сразу на весь датафрейм

tgt_enc = TargetEncoder(cols=['Education', 'Gender', 'Region'])
tgt_enc.fit_transform(X, y)

Unnamed: 0,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,Married,0.486762,40000,1,0.552288,Skilled Manual,Yes,0,0-1 Miles,0.493333,42
1,Married,0.475442,30000,3,0.449057,Clerical,Yes,1,0-1 Miles,0.493333,43
2,Married,0.475442,80000,5,0.449057,Professional,No,2,2-5 Miles,0.493333,60
3,Single,0.475442,70000,0,0.552288,Professional,Yes,1,5-10 Miles,0.588542,41
4,Single,0.475442,30000,0,0.552288,Clerical,No,0,0-1 Miles,0.493333,36
...,...,...,...,...,...,...,...,...,...,...,...
995,Married,0.475442,60000,2,0.441341,Professional,Yes,2,2-5 Miles,0.433071,54
996,Single,0.475442,70000,4,0.540230,Professional,Yes,0,2-5 Miles,0.433071,35
997,Married,0.475442,60000,2,0.552288,Skilled Manual,Yes,0,0-1 Miles,0.433071,38
998,Single,0.475442,100000,3,0.552288,Management,No,3,1-2 Miles,0.433071,38


Помимо сглаживания, для борьбы с переобучением при таргет энкодинге в лекции предлагались и другие методы

Добавление случайного шума
Вычисление счетчиков на кросс-валидации
Expanding mean encoding
Первые две идеи реализованы в классе LeaveOneOut

значения считаются на основе кросс-валидации вида leave one out (то есть значение энкодинга для конкретного наблюдения будет считаться по всем наблюдениям, кроме этого)
параметр sigma отвечает за дисперсию случайного шума, который добавляется к значению энкодинга (чем больше sigma, тем больше регуляризация)

In [29]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

loo_enc = LeaveOneOutEncoder(sigma=3.)

loo_enc.fit_transform(X['Education'], y)

Unnamed: 0,Education
0,1.379784
1,0.263787
2,1.326609
3,3.067564
4,0.163891
...,...
995,0.068666
996,3.436731
997,1.609786
998,-0.395370


Масштабирование числовых признаков

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

scaler = StandardScaler() # x -> (x-mean) / std
scaler.fit_transform(X[['Income']])

# scaler.fit(Xtrain)

# scaler.transform(Xtrain)
# scaler.transform(Xtest)

# fit -> вычисляет параметры преобразования: mean, std (по TRAIN)
# transform -> преобразует столбец по формуле. (по TRAIN и по TEST)

Есть две проблемы:

класc StandardScaler не умеет работать только на части колонок датафрейма
классы sklearn возвращают numpy arrays, а не pandas dataframe, что не удобно

In [31]:
num_col

['Income', 'Children', 'Cars', 'Age']

In [33]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('scaler', StandardScaler(), num_col)], remainder='passthrough') # 'drop'

In [34]:
ct.fit_transform(X)

array([[-0.5195379574051056, -0.5586728696623785, -1.2916513760469168,
        ..., 'Yes', '0-1 Miles', 'Europe'],
       [-0.8414326026375131, 0.6718841119728166, -0.4020843126537234,
        ..., 'Yes', '0-1 Miles', 'Europe'],
       [0.7680406235245242, 1.9024410936080116, 0.48748275073947, ...,
        'No', '2-5 Miles', 'Europe'],
       ...,
       [0.12425133305970927, 0.05660562115521903, -1.2916513760469168,
        ..., 'Yes', '0-1 Miles', 'North America'],
       [1.4118299139893389, 0.6718841119728166, 1.3770498141326635, ...,
        'No', '1-2 Miles', 'North America'],
       [0.12425133305970927, 0.6718841119728166, 0.48748275073947, ...,
        'Yes', '10+ Miles', 'North America']], dtype=object)

In [35]:
# нет удобной реализации - напишем сами !

from sklearn.base import TransformerMixin

class CustomScaler(TransformerMixin):
    def __init__(self, cols, scaler=None):
        self.cols = cols
        self.scaler = scaler or StandardScaler()
        
    def fit(self, X, y=None):
        num_cols = X.copy()[self.cols]
        self.scaler.fit(num_cols)
        return self
    def transform(self, X, y=None):
        X_res = X.copy()
        num_cols_tr = self.scaler.transform(X_res[self.cols])
        for i, col in enumerate(self.cols):
            X_res[col] = num_cols_tr[:,i]
        return X_res

In [37]:
sc = CustomScaler(num_col)
X2 = sc.fit_transform(X)

In [38]:
X2

Unnamed: 0,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,Married,Female,-0.519538,-0.558673,Bachelors,Skilled Manual,Yes,-1.291651,0-1 Miles,Europe,-0.192988
1,Married,Male,-0.841433,0.671884,Partial College,Clerical,Yes,-0.402084,0-1 Miles,Europe,-0.104866
2,Married,Male,0.768041,1.902441,Partial College,Professional,No,0.487483,2-5 Miles,Europe,1.393214
3,Single,Male,0.446146,-1.173951,Bachelors,Professional,Yes,-0.402084,5-10 Miles,Pacific,-0.281110
4,Single,Male,-0.841433,-1.173951,Bachelors,Clerical,No,-1.291651,0-1 Miles,Europe,-0.721722
...,...,...,...,...,...,...,...,...,...,...,...
995,Married,Male,0.124251,0.056606,High School,Professional,Yes,0.487483,2-5 Miles,North America,0.864480
996,Single,Male,0.446146,1.287163,Graduate Degree,Professional,Yes,-1.291651,2-5 Miles,North America,-0.809844
997,Married,Male,0.124251,0.056606,Bachelors,Skilled Manual,Yes,-1.291651,0-1 Miles,North America,-0.545477
998,Single,Male,1.411830,0.671884,Bachelors,Management,No,1.377050,1-2 Miles,North America,-0.545477


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

p1 = Pipeline([
    ('ordinal_encoder_', OrdinalEncoder(cols=ordinal_cols + binary_cols + cat_cols)), # плохо!!!
    ('scaler_', CustomScaler(num_col)),
    ('model_', LogisticRegression())
    ])

p2 = Pipeline([
    ('one_hot_encoder_', OneHotEncoder(cols=ordinal_cols + binary_cols+cat_cols)),
    ('scaler_', CustomScaler(num_col)),
    ('model_', LogisticRegression())
    ])

p3 = Pipeline([
    ('target_encoder_', TargetEncoder(cols=ordinal_cols + binary_cols+cat_cols)),
    ('scaler_', CustomScaler(num_col)),
    ('model_', LogisticRegression())
])

p4 = Pipeline([
    ('ordinal_encoder_', OrdinalEncoder(cols=ordinal_cols)),
    ('one_hot_encoder_', OneHotEncoder(cols=binary_cols+cat_cols)),
    ('scaler_', CustomScaler(num_col)),
    ('model_', LogisticRegression())
    ])

p5 = Pipeline([
    ('ordinal_encoder_', OrdinalEncoder(cols=ordinal_cols)),
    ('one_hot_encoder_', OneHotEncoder(cols=binary_cols)),
    ('target_encoder_', TargetEncoder(cols=cat_cols)),
    ('scaler_', CustomScaler(num_col)),
    ('model_', LogisticRegression())
])

p6 = Pipeline([
    ('one_hot_encoder_', OneHotEncoder(cols=binary_cols)),
    ('target_encoder_', TargetEncoder(cols=cat_cols + ordinal_cols)),
    ('scaler_', CustomScaler(num_col)),
    ('model_', LogisticRegression())
])

In [41]:
# пример работы с пайплайном
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y)

p1.fit(X_train, y_train)

#print(p1)

y_pred = p1.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.632


Сравнение качества классификации при разных пайплайнах преобразования данных

In [42]:
from sklearn.model_selection import cross_validate, cross_val_score
import warnings

warnings.filterwarnings('ignore')

In [43]:
for i, pipe in enumerate([p1, p2, p3, p4, p5, p6]):
    cv_res = cross_validate(pipe,
                            X,
                            y,
                            cv=5,
                            scoring='accuracy'
                           )
    print(f"Pipeline {i + 1}: mean cv accuracy = {cv_res['test_score'].mean()}")

Pipeline 1: mean cv accuracy = 0.606
Pipeline 2: mean cv accuracy = 0.616
Pipeline 3: mean cv accuracy = 0.629
Pipeline 4: mean cv accuracy = 0.617
Pipeline 5: mean cv accuracy = 0.63
Pipeline 6: mean cv accuracy = 0.629


In [None]:
p3.fit(X_train, y_train)

pred = p3.predict(X_test)

accuracy_score(y_test, pred)

Подбор порога

In [45]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

confusion_matrix(y_test, pred), precision_score(y_test, pred), recall_score(y_test, pred)

(array([[98, 42],
        [45, 65]]), 0.6074766355140186, 0.5909090909090909)

А чего хотим?

Пусть хотим максимизировать полноту при accuracy >= 0.6

In [46]:
probs = p3.predict_proba(X_test)[:,1]

classes = probs > 0.25

In [47]:
confusion_matrix(y_test, classes), precision_score(y_test, classes), recall_score(y_test, classes), accuracy_score(y_test, classes)

(array([[  6, 134],
        [  0, 110]]), 0.45081967213114754, 1.0, 0.464)

In [48]:
max_recall = -1
best_thr = -1
acc = -1

for thr in np.arange(0, 1, 0.01):
    classes = probs > thr
    
    if recall_score(y_test, classes) > max_recall and accuracy_score(y_test, classes) > 0.6:
        max_recall = recall_score(y_test, classes)
        best_thr = thr
        acc = accuracy_score(y_test, classes)
        
max_recall, best_thr, acc

(0.8454545454545455, 0.4, 0.616)

Улучшаем качество модели

In [50]:
from sklearn.svm import SVC

p3_svm = Pipeline([
    ('target_encoder_', TargetEncoder(cols=ordinal_cols + binary_cols+cat_cols)),
    ('scaler_', CustomScaler(num_col)),
    ('model_', SVC(kernel='linear'))
])

p3_svm.fit(X_train, y_train)

pred = p3_svm.predict(X_test)

accuracy_score(y_test, pred)

0.652

In [52]:
for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    p3_svm = Pipeline([
    ('target_encoder_', TargetEncoder(cols=ordinal_cols + binary_cols+cat_cols)),
    ('scaler_', CustomScaler(num_col)),
    ('model_', SVC(kernel=kernel))
    ])

    p3_svm.fit(X_train, y_train)

    pred = p3_svm.predict(X_test)

    print(kernel, accuracy_score(y_test, pred))

linear 0.652
rbf 0.672
poly 0.684
sigmoid 0.408


In [53]:
for degree in np.arange(2,10):
    p3_svm = Pipeline([
    ('target_encoder_', TargetEncoder(cols=ordinal_cols + binary_cols+cat_cols)),
    ('scaler_', CustomScaler(num_col)),
    ('model_', SVC(kernel='poly', degree=degree))
    ])

    p3_svm.fit(X_train, y_train)

    pred = p3_svm.predict(X_test)

    print(degree, accuracy_score(y_test, pred))

2 0.656
3 0.684
4 0.66
5 0.704
6 0.684
7 0.692
8 0.684
9 0.696
