## 1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

In [59]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import itertools

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

Данные отсюда: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [93]:
df = pd.read_csv("bank-full.csv", sep=';')
df.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


In [15]:
df['y'] = df['y'].map({'yes':1, 'no':0})

In [17]:
df['y'].value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['y']), df['y'], random_state=42)

In [19]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [22]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
continuous_columns = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

In [24]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    
    final_transformers.append((cont_col, cont_transformer))

In [25]:
feats = FeatureUnion(final_transformers)

In [62]:
from sklearn.ensemble import GradientBoostingClassifier

pipeline = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

## 2. Обучить любой классификатор (какой вам нравится)

In [63]:
# обучим наш пайплайн
pipeline.fit(X_train, y_train)

In [64]:
# наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.01284585, 0.0125609 , 0.10245304, 0.24441928, 0.02276925,
       0.04753074, 0.08306362, 0.02032499, 0.00778574, 0.10524364])

Также нам нужно от вероятностей перейти к меткам классов. Для этого нужно подобрать порог, после которого мы считаем, что объект можно отнести к классу 1 (если вероятность больше порога - размечаем объект как класс 1, если нет - класс 0)

In [65]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [66]:
metrics_df = pd.DataFrame(columns=['model', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC


In [67]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.2640912656116132, F-Score=0.579, Precision=0.520, Recall=0.655


In [68]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9018887118519425

In [69]:
metrics_df = metrics_df.append({
    'model': 'supervised',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,supervised,0.264091,0.579464,0.519648,0.654841,0.901889


## 3. Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть


In [70]:
mod_data = X_train.copy()
mod_data['Exited'] = y_train
mod_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Exited
41626,52,admin.,married,secondary,no,118,no,no,cellular,25,sep,105,1,-1,0,unknown,0
31347,28,student,single,secondary,no,459,no,no,cellular,16,mar,83,13,-1,0,unknown,0
22563,36,management,single,tertiary,no,156,no,no,cellular,22,aug,122,1,-1,0,unknown,0
37243,55,entrepreneur,married,tertiary,no,323,yes,yes,cellular,13,may,200,1,-1,0,unknown,0
32259,38,services,divorced,secondary,no,904,yes,no,cellular,16,apr,361,1,339,1,failure,0


In [71]:
# get the indices of the positives samples
pos_ind = mod_data[mod_data['Exited'] == 1].sample(frac=1, random_state=42).index

# leave just 25% of the positives marked
perc = 0.25
pos_sample_len = int(np.ceil(perc * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 984/3936 as positives and unlabeling the rest


In [72]:
# get the indices of the positives samples
pos_sample = mod_data[mod_data['Exited'] == 1].sample(frac=0.25, random_state=42).index

Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [73]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    32924
 1      984
Name: class_test, dtype: int64


In [74]:
mod_data.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Exited,class_test
41626,52,admin.,married,secondary,no,118,no,no,cellular,25,sep,105,1,-1,0,unknown,0,-1
31347,28,student,single,secondary,no,459,no,no,cellular,16,mar,83,13,-1,0,unknown,0,-1
22563,36,management,single,tertiary,no,156,no,no,cellular,22,aug,122,1,-1,0,unknown,0,-1
37243,55,entrepreneur,married,tertiary,no,323,yes,yes,cellular,13,may,200,1,-1,0,unknown,0,-1
32259,38,services,divorced,secondary,no,904,yes,no,cellular,16,apr,361,1,339,1,failure,0,-1


## 4. Применить random negative sampling для построения классификатора в новых условиях

In [75]:
data_N = mod_data[mod_data['class_test'] == -1]
data_P = mod_data[mod_data['class_test'] == 1]

neg_sample = data_N[:data_P.shape[0]]
pos_sample = data_P.copy()

print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(984, 18) (984, 18)


In [76]:
sample_train

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Exited,class_test
21517,48,unknown,married,unknown,no,3,no,no,cellular,19,aug,104,2,-1,0,unknown,0,-1
31327,49,technician,divorced,tertiary,no,1807,yes,no,cellular,12,mar,229,1,-1,0,unknown,1,1
40414,45,management,married,tertiary,no,1309,no,no,cellular,2,jul,367,1,-1,0,unknown,1,-1
45190,32,blue-collar,married,secondary,no,136,no,no,cellular,16,nov,206,1,188,3,success,1,1
12063,30,admin.,married,secondary,no,-1049,yes,no,unknown,20,jun,160,2,-1,0,unknown,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15220,42,blue-collar,divorced,primary,no,301,yes,no,cellular,17,jul,1175,2,-1,0,unknown,1,1
37365,51,blue-collar,divorced,primary,no,-19,yes,no,cellular,13,may,829,1,296,11,failure,1,1
10183,48,admin.,married,secondary,no,155,no,no,unknown,11,jun,222,3,-1,0,unknown,0,-1
42481,72,retired,married,primary,no,3856,no,no,cellular,10,dec,582,4,115,1,other,1,1


In [77]:
sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])


pipeline.fit(sample_train.drop(columns=['class_test', 'Exited']), 
             sample_train['class_test'])

In [78]:
# наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.03, 0.1 , 0.46, 0.83, 0.26, 0.36, 0.21, 0.11, 0.13, 0.55])

In [79]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.66, F-Score=0.550, Precision=0.460, Recall=0.685


In [80]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.883703588155114

In [81]:
metrics_df = metrics_df.append({
    'model': 'pu-learning',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

## 5. Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)

In [82]:
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,supervised,0.264091,0.579464,0.519648,0.654841,0.901889
1,pu-learning,0.66,0.550312,0.459821,0.685144,0.883704


## 6. *Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [91]:
from tqdm import tqdm

metrics_df = pd.DataFrame(columns=['frac', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])


for frac in tqdm(np.linspace(0.1, 1, 9)):
    mod_data = X_train.copy()
    mod_data['Exited'] = y_train
    mod_data.head()

    # get the indices of the positives samples
    pos_sample = mod_data[mod_data['Exited'] == 1].sample(frac=frac, random_state=42).index

    
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
    
    mod_data = mod_data.sample(frac=1, random_state=42)


    data_N = mod_data[mod_data['class_test'] == -1]
    data_P = mod_data[mod_data['class_test'] == 1]

    neg_sample = data_N[:data_P.shape[0]]
    pos_sample = data_P.copy()

    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

    sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

    pipeline = Pipeline([
        ('features', feats),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])

    pipeline.fit(sample_train.drop(columns=['class_test','Exited']), 
                 sample_train['class_test'])
    
    # наши прогнозы для тестовой выборки
    preds = pipeline.predict_proba(X_test)[:, 1]
    preds[:10]

    precision, recall, thresholds = precision_recall_curve(y_test, preds)

    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    roc_auc = roc_auc_score(y_test, preds)

    metrics_df = metrics_df.append({
        'frac': frac,
        'thresh': thresholds[ix],
        'F-Score': fscore[ix],
        'Precision': precision[ix],
        'Recall': recall[ix],
        'ROC AUC': roc_auc
    }, ignore_index=True)

metrics_df

100%|██████████| 9/9 [00:09<00:00,  1.03s/it]


Unnamed: 0,frac,thresh,F-Score,Precision,Recall,ROC AUC
0,0.1,0.98,,0.0,0.0,0.867447
1,0.2125,0.59,0.530634,0.414827,0.736142,0.879703
2,0.325,0.65,0.536486,0.443319,0.679231,0.882168
3,0.4375,0.64,0.534796,0.438863,0.684405,0.884325
4,0.55,0.62,0.540613,0.424947,0.742794,0.886681
5,0.6625,0.68,0.541973,0.450539,0.67997,0.888136
6,0.775,0.7,0.547937,0.464745,0.667406,0.890477
7,0.8875,0.65,0.549695,0.440196,0.731707,0.893476
8,1.0,0.7,0.554586,0.467546,0.681449,0.894584
