Расмотрим пример на датасете из репозитория UCI

Описание данных - https://archive.ics.uci.edu/ml/datasets/TV+News+Channel+Commercial+Detection+Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_score, precision_recall_curve, confusion_matrix

results = pd.DataFrame(index=['classic', 'PU'], 
                       columns=['precision', 'recall', 'f1', 'roc_auc'])

In [2]:
data = pd.read_csv("BBC.csv", sep=';')
data.head(3)

Unnamed: 0,y,1,2,3,4,5,6,7,8,9,...,924,959,1002,1016,1028,1048,1112,1119,4124,4125
0,1,123,1.31644,1.516003,5.605905,5.34676,0.013233,0.010729,0.091743,0.050768,...,0.008475,0.036017,0.006356,0.008475,,0.002119,,,0.422334,0.663918
1,1,124,0.966079,0.54642,4.046537,3.190973,0.008338,0.01149,0.075504,0.065841,...,0.109244,0.117647,0.006303,,,0.008403,,,0.332664,0.766184
2,1,109,2.035407,0.571643,9.551406,5.803685,0.015189,0.014294,0.094209,0.044991,...,0.074519,0.0625,0.004808,,,0.009615,,,0.346674,0.225022


In [3]:
target = data.pop('y')
target.value_counts()

-1    9304
 1    8416
Name: y, dtype: int64

In [4]:
data = data.join(target.replace({-1:0})).fillna(0)
data.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,959,1002,1016,1028,1048,1112,1119,4124,4125,y
0,123,1.31644,1.516003,5.605905,5.34676,0.013233,0.010729,0.091743,0.050768,3808.067871,...,0.036017,0.006356,0.008475,0.0,0.002119,0.0,0.0,0.422334,0.663918,1
1,124,0.966079,0.54642,4.046537,3.190973,0.008338,0.01149,0.075504,0.065841,3466.266113,...,0.117647,0.006303,0.0,0.0,0.008403,0.0,0.0,0.332664,0.766184,1
2,109,2.035407,0.571643,9.551406,5.803685,0.015189,0.014294,0.094209,0.044991,3798.196533,...,0.0625,0.004808,0.0,0.0,0.009615,0.0,0.0,0.346674,0.225022,1


In [5]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [6]:
final_transformers = list()
    
for col in data.columns[:-1]:
    transformer = Pipeline([
                ('selector', NumberSelector(key=col)),
                ('scaler', StandardScaler())
            ])
    final_transformers.append((col, transformer))
    
feats = FeatureUnion(final_transformers)

In [7]:
from sklearn.model_selection import train_test_split

x_data = data.drop('y',axis=1)
y_data = data.y

X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=13)

In [8]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=13)),
])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('1',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='1')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('2',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='2')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('3',
                                                 Pipeline(steps=[('selector',
                                                               

In [9]:
y_probs = pipeline.predict_proba(X_test)[:,1]

Проверяем качество

In [10]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_probs, model):
    precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    print(f'Best Threshold={thresholds[ix]:.4f}\n')
    print('Classification results:')
    f1 = fscore[ix]
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_probs)
    print("roc: %.2f%%" % (roc * 100.0)) 
    prc = precision[ix]
    print("precision: %.2f%%" % (prc * 100.0)) 
    rec = recall[ix]
    print("recall: %.2f%%" % (rec * 100.0)) 
    
    results.loc[model,'precision':'roc_auc']= prc, rec, f1, roc

    
evaluate_results(y_test, y_probs, 'classic')

Best Threshold=0.4600

Classification results:
f1: 87.21%
roc: 94.49%
precision: 86.49%
recall: 87.94%


### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [11]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 2104/8416 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [12]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    15616
 1     2104
Name: class_test, dtype: int64


In [13]:
mod_data.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1002,1016,1028,1048,1112,1119,4124,4125,y,class_test
0,123,1.31644,1.516003,5.605905,5.34676,0.013233,0.010729,0.091743,0.050768,3808.067871,...,0.006356,0.008475,0.0,0.002119,0.0,0.0,0.422334,0.663918,1,1
1,124,0.966079,0.54642,4.046537,3.190973,0.008338,0.01149,0.075504,0.065841,3466.266113,...,0.006303,0.0,0.0,0.008403,0.0,0.0,0.332664,0.766184,1,-1
2,109,2.035407,0.571643,9.551406,5.803685,0.015189,0.014294,0.094209,0.044991,3798.196533,...,0.004808,0.0,0.0,0.009615,0.0,0.0,0.346674,0.225022,1,-1
3,86,3.206008,0.786326,10.092709,2.693058,0.013962,0.011039,0.092042,0.043756,3761.712402,...,0.012346,0.0,0.0,0.012346,0.003086,0.0,0.993323,0.840083,1,-1
4,76,3.135861,0.896346,10.348035,2.65101,0.020914,0.012061,0.108018,0.052617,3784.488037,...,0.003521,0.0,0.0,0.045775,0.007042,0.0,0.34152,0.71047,1,-1


In [14]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### random negative sampling

In [15]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(2104, 232) (2104, 232)


In [16]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state=13)),
])
pipeline.fit(sample_train.iloc[:,:-2], 
          sample_train.iloc[:,-2])
y_probs = pipeline.predict_proba(sample_test.iloc[:,:-2])[:,1]
evaluate_results(sample_test.iloc[:,-2], y_probs, 'PU')

Best Threshold=0.6800

Classification results:
f1: 82.47%
roc: 92.74%
precision: 80.03%
recall: 85.07%


In [17]:
results

Unnamed: 0,precision,recall,f1,roc_auc
classic,0.864906,0.879377,0.872082,0.944906
PU,0.800343,0.850684,0.824746,0.927388


В нашем случае модель лучше работает в "классическом" варианте. Вероятно, это связано с малым количеством данных в некоторых признаках, и потому не удается добиться реперезантативности, чтобы составить хороший семпл. На полном объеме тестовой выборки (на котором мы тестировали первую модель) метрики показывают лучший результат (по сути это вариант с 2step).

In [18]:
results.append(pd.Series(name='PU_full'))
y_probs = pipeline.predict_proba(X_test)[:,1]
evaluate_results(y_test, y_probs, 'PU_full')
results

  results.append(pd.Series(name='PU_full'))


Best Threshold=0.6500

Classification results:
f1: 89.14%
roc: 95.72%
precision: 86.63%
recall: 91.79%


Unnamed: 0,precision,recall,f1,roc_auc
classic,0.864906,0.879377,0.872082,0.944906
PU,0.800343,0.850684,0.824746,0.927388
PU_full,0.866324,0.917899,0.891366,0.95719


По сравнению с первой моделью уменьшилось количество ошибок 2 рода при несущественном росте количества ошибок 1 рода.

### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

<b>Бонусный вопрос:</b>

Как вы думаете, какой из методов на практике является более предпочтительным: random negative sampling или 2-step approach?

Ваш ответ здесь:

Как уже говорилось выше, 2step approach показывает лучший результат, т.к. мы лучше учимся определять 1 класс и учимся на сбалансированной выборке. Хотя всегда есть риски высокого числа ошибок 1 рода, и если это недопустимо в бизнесе - лучше остановиться на random negative.