In [232]:
import pandas as pd
import numpy as np


In [233]:
df = pd.read_excel('Cryotherapy.xlsx')

In [234]:
df.head(5)

Unnamed: 0,sex,age,Time,Number_of_Warts,Type,Area,Result_of_Treatment
0,1,35,12.0,5,1,100,0
1,1,29,7.0,5,1,96,1
2,1,50,8.0,1,3,132,0
3,1,32,11.75,7,3,750,0
4,1,67,9.25,1,1,42,0


In [235]:
df.Result_of_Treatment.value_counts()

1    48
0    42
Name: Result_of_Treatment, dtype: int64

In [236]:
df.isna().sum()

sex                    0
age                    0
Time                   0
Number_of_Warts        0
Type                   0
Area                   0
Result_of_Treatment    0
dtype: int64

In [237]:
from sklearn.model_selection import train_test_split

x_data = df.iloc[:,:-1]
y_data = df.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [238]:
#!pip install xgboost

In [239]:
import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)





Проверяем качество

In [240]:
y_predict

array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0])

In [241]:
models_result = {"model_name": [], "f1_score": [], "roc_auc_score": [], "recall_score": [] , "precision_score": [] }
 

In [242]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict, log_dict, model_name):
#     log_dict = {"model_name": [], "f1_score": [], "roc_auc_score": [], "recall_score": [] , "precision_score": [] }
    log_dict["model_name"].append(model_name)
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    log_dict["f1_score"].append(f1)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    log_dict["roc_auc_score"].append(roc)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    log_dict["recall_score"].append(rec)
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    log_dict["precision_score"].append(prc)
    print("precision: %.2f%%" % (prc * 100.0)) 
#     log_dicter.update(log_dict)
    
    


In [243]:
evaluate_results(y_test, y_predict, models_result, "XGBClassifier_norm")

Classification results:
f1: 90.91%
roc: 91.67%
recall: 83.33%
precision: 100.00%


### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [244]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 12/48 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [245]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    78
 1    12
Name: class_test, dtype: int64


 * We now have just 12 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

* Recall that col "Result_of_Treatment" still holds the actual label

In [246]:
mod_data.head(10)

Unnamed: 0,sex,age,Time,Number_of_Warts,Type,Area,Result_of_Treatment,class_test
0,1,35,12.0,5,1,100,0,-1
1,1,29,7.0,5,1,96,1,-1
2,1,50,8.0,1,3,132,0,-1
3,1,32,11.75,7,3,750,0,-1
4,1,67,9.25,1,1,42,0,-1
5,1,41,8.0,2,2,20,1,-1
6,1,36,11.0,2,1,8,0,-1
7,1,59,3.5,3,3,20,0,-1
8,1,20,4.5,12,1,6,1,-1
9,2,34,11.25,3,3,150,0,-1


Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results

[:-2] is the original class label for positive and negative data [:-1] is the new class for positive and unlabeled data

In [247]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### 1. random negative sampling

In [248]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1.5, replace=True)

(12, 8) (12, 8)


In [249]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict, models_result, "XGBClassifier_random")

Classification results:
f1: 76.92%
roc: 77.78%
recall: 83.33%
precision: 71.43%




In [250]:
models_result

{'model_name': ['XGBClassifier_norm', 'XGBClassifier_random'],
 'f1_score': [0.9090909090909091, 0.7692307692307692],
 'roc_auc_score': [0.9166666666666667, 0.7777777777777779],
 'recall_score': [0.8333333333333334, 0.8333333333333334],
 'precision_score': [1.0, 0.7142857142857143]}

In [251]:
pd.DataFrame(models_result)

Unnamed: 0,model_name,f1_score,roc_auc_score,recall_score,precision_score
0,XGBClassifier_norm,0.909091,0.916667,0.833333,1.0
1,XGBClassifier_random,0.769231,0.777778,0.833333,0.714286


In [252]:
pd.DataFrame(models_result)

Unnamed: 0,model_name,f1_score,roc_auc_score,recall_score,precision_score
0,XGBClassifier_norm,0.909091,0.916667,0.833333,1.0
1,XGBClassifier_random,0.769231,0.777778,0.833333,0.714286


### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

<b>Бонусный вопрос:</b>

Как вы думаете, какой из методов на практике является более предпочтительным: random negative sampling или 2-step approach?

Ваш ответ здесь: