Расмотрим пример на датасете из репозитория UCI

Описание данных - https://archive.ics.uci.edu/ml/datasets/banknote+authentication#

In [53]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer
import itertools

import matplotlib.pyplot as plt

%matplotlib inline
data = pd.read_csv("crx.data", header = None, names = ['A'+str(i) for i in range(1, 17)])
data.head(3)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+


У нас есть 15 признаков и 1 целевая переменная (бинарная) - нужно определить одобрят (+) или отклонят заявку на кредит (-)

In [54]:
print(data.shape)

(690, 16)


In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      690 non-null    object 
 1   A2      690 non-null    object 
 2   A3      690 non-null    float64
 3   A4      690 non-null    object 
 4   A5      690 non-null    object 
 5   A6      690 non-null    object 
 6   A7      690 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     690 non-null    object 
 14  A15     690 non-null    int64  
 15  A16     690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


Всего 690 заявок

In [108]:
categorical_columns = ['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', 'A14']
continuous_columns = ['A3', 'A8', 'A11', 'A15']

Посмотрим на соотношение классов

In [109]:
data.iloc[:, -1].value_counts()

-    383
+    307
Name: A16, dtype: int64

In [110]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [111]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))

for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [112]:
final_transformers

[('A1',
  Pipeline(steps=[('selector', FeatureSelector(column='A1')),
                  ('ohe', OHEEncoder(key='A1'))])),
 ('A2',
  Pipeline(steps=[('selector', FeatureSelector(column='A2')),
                  ('ohe', OHEEncoder(key='A2'))])),
 ('A4',
  Pipeline(steps=[('selector', FeatureSelector(column='A4')),
                  ('ohe', OHEEncoder(key='A4'))])),
 ('A5',
  Pipeline(steps=[('selector', FeatureSelector(column='A5')),
                  ('ohe', OHEEncoder(key='A5'))])),
 ('A6',
  Pipeline(steps=[('selector', FeatureSelector(column='A6')),
                  ('ohe', OHEEncoder(key='A6'))])),
 ('A7',
  Pipeline(steps=[('selector', FeatureSelector(column='A7')),
                  ('ohe', OHEEncoder(key='A7'))])),
 ('A9',
  Pipeline(steps=[('selector', FeatureSelector(column='A9')),
                  ('ohe', OHEEncoder(key='A9'))])),
 ('A10',
  Pipeline(steps=[('selector', FeatureSelector(column='A10')),
                  ('ohe', OHEEncoder(key='A10'))])),
 ('A12',
  Pipeline(s

In [115]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [116]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

Разбиваем выборку на тренировочную и тестовую части и обучаем модель (в примере - градиентный бустинг)

In [117]:
from sklearn.model_selection import train_test_split

x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1].apply(lambda x: 1 if x=='+' else 0, 1)

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [118]:
pipeline.fit(x_train, y_train)
preds = pipeline.predict_proba(x_test)[:, 1]
y_hat = pipeline.predict(x_test)

Проверяем качество

In [176]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    return f1, roc, rec, prc
    
#evaluate_results(y_test, y_hat)

### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [321]:
mod_data = data.copy()

#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == '+')[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 30% of the positives marked

P = 0.6

pos_sample_len = int(np.ceil(P * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 185/307 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [322]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    505
 1    185
Name: class_test, dtype: int64


* We now have just 93 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

* Recall that col A16 still holds the actual label

In [323]:
mod_data.head(10)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,class_test
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+,-1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+,1
5,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+,-1
6,b,33.17,1.04,u,g,r,h,6.5,t,f,0,t,g,164,31285,+,-1
7,a,22.92,11.585,u,g,cc,v,0.04,t,f,0,f,g,80,1349,+,1
8,b,54.42,0.5,y,p,k,h,3.96,t,f,0,f,g,180,314,+,1
9,b,42.5,4.915,y,p,w,v,3.165,t,f,0,t,g,52,1442,+,1


Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results

[:-2] is the original class label for positive and negative data [:-1] is the new class for positive and unlabeled data

In [324]:
x_data = mod_data.iloc[:,:-2] # just the X 
y_labeled = mod_data.iloc[:,-1] # new class (just the P & U)
y_positive = mod_data.iloc[:,-2] # original class

### 1. random negative sampling

In [325]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(185, 17) (185, 17)


In [326]:
sample_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,class_test
476,b,23.58,0.835,u,g,i,h,0.085,f,f,0,t,g,220,5,-,-1
574,a,20.33,10.0,u,g,c,h,1.0,t,t,4,f,g,50,1465,+,1
57,b,44.33,0.5,u,g,i,h,5.0,t,f,0,t,g,320,0,+,1
354,b,36.67,2.0,u,g,i,v,0.25,f,f,0,t,g,221,0,-,-1
10,b,22.08,0.83,u,g,c,h,2.165,f,f,0,t,g,128,0,+,1


In [327]:
sample_test.iloc[:,-2] = sample_test.iloc[:,-2].apply(lambda x: 1 if x=='+' else 0, 1)
sample_train.iloc[:,-2] = sample_train.iloc[:,-2].apply(lambda x: 1 if x=='+' else 0, 1)

pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

pipeline.fit(sample_train.iloc[:,:-2], 
          sample_train.iloc[:,-2])

preds = pipeline.predict_proba(sample_test.iloc[:,:-2])[:, 1]
y_hat = pipeline.predict(sample_test.iloc[:,:-2])

In [328]:
metrix = evaluate_results(sample_test.iloc[:,-2], y_hat)

Classification results:
f1: 72.90%
roc: 85.03%
recall: 91.76%
precision: 60.47%


In [329]:
#results = pd.DataFrame(columns = ['P', 'f1', 'roc', 'recall', 'precision']).set_index('P')
#results.loc['no'] = [0.8814, 0.9006, 0.9123, 0.8525]

In [330]:
results.loc[P] = metrix

In [331]:
results

Unnamed: 0_level_0,f1,roc,recall,precision
P,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.8814,0.9006,0.9123,0.8525
0.1,0.776886,0.80187,0.934363,0.664835
0.2,0.790076,0.828135,0.932432,0.68543
0.3,0.802817,0.857825,0.944751,0.697959
0.4,0.828829,0.890388,0.945205,0.737968
0.5,0.764045,0.871181,0.953271,0.6375
0.6,0.728972,0.850313,0.917647,0.604651


Порог в 0,4 оказался оптимальным

### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

<b>Бонусный вопрос:</b>

Как вы думаете, какой из методов на практике является более предпочтительным: random negative sampling или 2-step approach?

Ваш ответ здесь: