### Урок 6. Задача lookalike (Positive Unlabeled Learning)#

### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

import xgboost as xgb

In [2]:
# dataset https://archive.ics.uci.edu/ml/datasets/Early+stage+diabetes+risk+prediction+dataset.
    
data = pd.read_csv("diabetes_data_upload.csv")
data.head(3)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive


In [3]:
for col in data.columns.drop('Age'):
    map_dict = None
    
    if(col == 'Gender'):
        map_dict = {'Male':1, 'Female':0}        
    elif(col == 'class'):
        map_dict = {'Positive':1, 'Negative':0}
    else:
        map_dict = {'Yes':1, 'No':0}
        
    data[col] = data[col].map(map_dict).astype(int)
    
data.head(3)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1


In [4]:
data.iloc[:, -1].value_counts()

1    320
0    200
Name: class, dtype: int64

In [5]:
x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

In [6]:
model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [7]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    
    return f1, roc, rec, prc

metrics = pd.DataFrame(columns=['variant', 'f1', 'roc', 'recall', 'precision'])

f1, roc, rec, prc = evaluate_results(y_test, y_predict)

metrics.loc[0] = 'original', f1, roc, rec, prc

Classification results:
f1: 97.84%
roc: 97.89%
recall: 95.77%
precision: 100.00%


In [8]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 80/320 as positives and unlabeling the rest


In [9]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    440
 1     80
Name: class_test, dtype: int64


In [10]:
mod_data.head(3)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class,class_test
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,-1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1,1


In [11]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [12]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(80, 18) (80, 18)


In [13]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

f1, roc, rec, prc = evaluate_results(sample_test.iloc[:,-2].values, y_predict)

metrics.loc[1] = 'PU learning', f1, roc, rec, prc

Classification results:
f1: 92.42%
roc: 91.44%
recall: 94.33%
precision: 90.59%
Classification results:
f1: 92.42%
roc: 91.44%
recall: 94.33%
precision: 90.59%


In [14]:
metrics

Unnamed: 0,variant,f1,roc,recall,precision
0,original,0.978417,0.978873,0.957746,1.0
1,PU learning,0.924242,0.914421,0.943299,0.905941
