Рассмотрим пример на датасете из репозитория UCI

Описание данных - https://archive.ics.uci.edu/ml/datasets/in-vehicle+coupon+recommendation

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the coupon-recommendation dataset from the local CSV and preview three rows.
data = pd.read_csv(filepath_or_buffer="in-vehicle-coupon-recommendation.csv")
data.head(n=3)

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1


У нас есть 25 признаков и 1 целевая переменная (бинарная) Y - нужно определить, примет ли водитель купон или нет

In [3]:
# Row and column counts of the raw table.
print(data.shape)

(12684, 26)


Посмотрим на соотношение классов

In [4]:
# Class balance of the target, which is stored in the last column.
data[data.columns[-1]].value_counts()

1    7210
0    5474
Name: Y, dtype: int64

In [5]:
# Per-column dtypes and non-null counts; columns with fewer non-null rows than
# the frame length contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  car                   108 non-null    object
 15  Bar                   12577 non-null

In [6]:
# Store temperature as text so that the later one-hot encoding treats it as a
# categorical feature instead of a numeric one.
data['temperature'] = data.temperature.astype(str)

In [7]:
# Print the value distribution of every column and remember which columns
# contain missing values.
cols_with_mis = []
for column in data.columns:
    n_missing = data[column].isna().sum()
    if n_missing:
        print(f"\n-- HAVE {n_missing} MISSINGS --\n", '-' * 17)
        cols_with_mis.append(column)
    print(data[column].value_counts(), '\n')

No Urgent Place    6283
Home               3237
Work               3164
Name: destination, dtype: int64 

Alone        7305
Friend(s)    3298
Partner      1075
Kid(s)       1006
Name: passanger, dtype: int64 

Sunny    10069
Snowy     1405
Rainy     1210
Name: weather, dtype: int64 

80    6528
55    3840
30    2316
Name: temperature, dtype: int64 

6PM     3230
7AM     3164
10AM    2275
2PM     2009
10PM    2006
Name: time, dtype: int64 

Coffee House             3996
Restaurant(<20)          2786
Carry out & Take away    2393
Bar                      2017
Restaurant(20-50)        1492
Name: coupon, dtype: int64 

1d    7091
2h    5593
Name: expiration, dtype: int64 

Female    6511
Male      6173
Name: gender, dtype: int64 

21         2653
26         2559
31         2039
50plus     1788
36         1319
41         1093
46          686
below21     547
Name: age, dtype: int64 

Married partner      5100
Single               4752
Unmarried partner    2186
Divorced              516
Widow

In [8]:
# 'car' is almost entirely missing (108 non-null rows out of 12684).
# NOTE(review): 'toCoupon_GEQ5min' is presumably dropped because it is constant — confirm.
data.drop(columns=['car', 'toCoupon_GEQ5min'], inplace=True)

In [9]:
# Impute the remaining columns that have gaps with their most frequent category.
# The result is assigned back to the frame on purpose: calling
# fillna(inplace=True) on data[col] is a chained in-place update, which emits a
# FutureWarning in pandas 2.x and silently leaves `data` unchanged once
# Copy-on-Write is enabled.
for col in ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']:
    data[col] = data[col].fillna(data[col].mode()[0])

In [10]:
# One-hot encode the string-valued columns so the model receives a purely
# numeric feature matrix.
data = pd.get_dummies(data)

Разбиваем выборку на тренировочную и тестовую части и обучаем модель (в примере - градиентный бустинг)

In [11]:
# Separate the features from the target and hold out 20% of the rows for testing.
# train_test_split is already imported in the first cell of the notebook.
y_data = data['Y']
x_data = data.drop(columns='Y')

X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=7
)

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline: gradient boosting trained on the fully labeled training split
# (fit returns the estimator itself, so construction and fitting are chained).
model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_predict = model.predict(X_test)

Проверяем качество

In [13]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict, y_score=None):
    """Print and return F1, ROC AUC, recall and precision for a binary task.

    Parameters
    ----------
    y_test : array-like
        Ground-truth binary labels.
    y_predict : array-like
        Hard class predictions; used for F1, recall and precision.
    y_score : array-like, optional
        Continuous scores for the positive class (for example
        ``model.predict_proba(X)[:, 1]``). When given, ROC AUC is computed
        from these scores. When omitted, ROC AUC is computed from
        ``y_predict`` exactly as before, which evaluates only a single
        operating point of the classifier.

    Returns
    -------
    tuple
        ``(f1, roc, rec, prc)`` as fractions in [0, 1]; the printed values are
        the same numbers expressed in percent.
    """
    # Fall back to the hard predictions so existing two-argument calls keep
    # producing the same numbers.
    roc_input = y_predict if y_score is None else y_score

    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0))
    roc = roc_auc_score(y_test, roc_input)
    print("roc: %.2f%%" % (roc * 100.0))
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0))
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0))

    return (f1, roc, rec, prc)

    
# Score the baseline predictions; ev keeps the (f1, roc, rec, prc) tuple.
ev = evaluate_results(y_test, y_predict)

Classification results:
f1: 76.30%
roc: 69.72%
recall: 80.94%
precision: 72.17%


### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [14]:
mod_data = data.copy()
# Positional indices of the truly positive rows.
pos_ind = np.where(mod_data['Y'].values == 1)[0]
# Shuffle them with a dedicated, seeded generator so the same positives stay
# labeled on every Restart & Run All.
np.random.RandomState(42).shuffle(pos_ind)
# Keep 33% of the positives labeled; the remaining positives will be treated
# as unlabeled together with all negatives.
pos_sample_len = int(np.ceil(0.33 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 2380/7210 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [15]:
# Start with every row unlabeled (-1), then mark the retained positives as 1.
mod_data['class_test'] = -1
# pos_sample holds row positions (it comes from np.where), so index by
# position; label-based .loc is only equivalent while the index is the
# default 0..n-1 range.
mod_data.iloc[pos_sample, mod_data.columns.get_loc('class_test')] = 1
print('target variable:\n', mod_data['class_test'].value_counts())

target variable:
 -1    10304
 1     2380
Name: class_test, dtype: int64


* We now have just 2380 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

* Recall that the 'Y' column still holds the actual label

In [16]:
# Preview the encoded frame: 'Y' is the true label, 'class_test' the PU label.
mod_data.head(10)

Unnamed: 0,has_children,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,destination_Home,destination_No Urgent Place,destination_Work,passanger_Alone,...,RestaurantLessThan20_4~8,RestaurantLessThan20_gt8,RestaurantLessThan20_less1,RestaurantLessThan20_never,Restaurant20To50_1~3,Restaurant20To50_4~8,Restaurant20To50_gt8,Restaurant20To50_less1,Restaurant20To50_never,class_test
0,1,0,0,0,1,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,1
1,1,0,0,0,1,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,-1
2,1,1,0,0,1,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,-1
3,1,1,0,0,1,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,-1
4,1,1,0,0,1,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,-1
5,1,1,0,0,1,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,-1
6,1,1,0,0,1,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,-1
7,1,1,0,0,1,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,-1
8,1,1,0,0,1,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
9,1,1,0,0,1,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,-1


In [17]:
# Feature matrix with both label columns removed.
x_data = mod_data.drop(columns=['class_test', 'Y']).to_numpy()
# PU labels: 1 for labeled positives, -1 for unlabeled rows.
y_labeled = mod_data['class_test'].to_numpy()
# Original (true) class labels.
y_positive = mod_data['Y'].to_numpy()

### 1. random negative sampling

In [18]:
# Shuffle the rows with a fixed seed so the sampled "negatives", and therefore
# the reported metrics, are reproducible from a fresh kernel.
mod_data = mod_data.sample(frac=1, random_state=42)

# Build each mask once instead of re-filtering the frame for every slice.
pos_sample = mod_data[mod_data['class_test'] == 1]
unlabeled = mod_data[mod_data['class_test'] == -1]

# Treat as many unlabeled rows as there are labeled positives as negatives;
# the remaining unlabeled rows form the evaluation set.
neg_sample = unlabeled[:len(pos_sample)]
sample_test = unlabeled[len(pos_sample):]
print(neg_sample.shape, pos_sample.shape)

# Balanced training set, shuffled so the two classes are interleaved.
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(2380, 112) (2380, 112)


In [19]:
# Refit the classifier on the balanced set of labeled positives and sampled
# unlabeled rows, using the PU label as the training target.
train_features = sample_train.drop(columns=['class_test', 'Y']).values
test_features = sample_test.drop(columns=['class_test', 'Y']).values
model.fit(train_features, sample_train['class_test'].values)

# Map the "unlabeled" class (-1) to 0 so predictions are comparable with the
# true 0/1 labels in 'Y'.
raw_predict = model.predict(test_features)
y_predict = np.where(raw_predict == -1, 0, raw_predict)
evaluate_results(sample_test['Y'].values, y_predict)

Classification results:
f1: 63.93%
roc: 66.71%
recall: 63.07%
precision: 64.81%


(0.6392832529290144, 0.6671163166961221, 0.630677182485722, 0.6481274455002795)