In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score, log_loss

from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('Skin_NonSkin.txt',
                 sep='\t',
                 header=None,
                 names=['attr_1','attr_2', 'attr_3', 'target'])
df['target'].replace({2: 0}, inplace=True)
df.head(2)

Unnamed: 0,attr_1,attr_2,attr_3,target
0,74,85,123,1
1,73,84,122,1


In [3]:
df['target'].value_counts()

0    194198
1     50859
Name: target, dtype: int64

In [4]:
X = df.loc[:, ~df.columns.isin(['target'])]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

**Обучить любой классификатор и посчитать метрики качества (roc auc, pr/rec/f1, logloss)**

In [5]:
metrics = {
    'method': [],
    'f1_score': [],
    'roc': [],
    'recall':[],
    'precision':[]
}

In [6]:
model = XGBClassifier()

model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [7]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    log_loss_score = log_loss(y_test, y_predict)
    print("log loss:", (log_loss_score * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 99.92%
roc: 99.96%
recall: 99.95%
precision: 99.89%
log loss: 1.1275385984868032


In [8]:
metrics['method'].append('original')
metrics['f1_score'].append(f1_score(y_test, y_predict))
metrics['roc'].append(roc_auc_score(y_test, y_predict))
metrics['recall'].append(recall_score(y_test, y_predict, average='binary'))
metrics['precision'].append(precision_score(y_test, y_predict, average='binary'))

**Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть**

In [9]:
mod_df = df.copy()
# получаем индексы positive samples
pos_ind = np.where(mod_df['target'].values == 1)[0]
# перемешиваем
np.random.shuffle(pos_ind)
# оставляем 25% positives размеченными
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 12715/50859 as positives and unlabeling the rest


**Применить random negative sampling для построения классификатора в новых условиях**

In [10]:
mod_df['class_test'] = -1
mod_df.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_df['class_test'].value_counts())

target variable:
 -1    232342
 1     12715
Name: class_test, dtype: int64


In [11]:
mod_df.head(2)

Unnamed: 0,attr_1,attr_2,attr_3,target,class_test
0,74,85,123,1,1
1,73,84,122,1,1


In [12]:
X = mod_df.loc[:, ~mod_df.columns.isin(['target', 'class_test'])].values # just the X 
y_labeled = mod_df['class_test'].values # new class (just the P & U)
y_positive = mod_df['target'].values # original class

In [13]:
mod_df = mod_df.sample(frac=1)
neg_sample = mod_df[mod_df['class_test']==-1][:len(mod_df[mod_df['class_test']==1])]
sample_test = mod_df[mod_df['class_test']==-1][len(mod_df[mod_df['class_test']==1]):]
pos_sample = mod_df[mod_df['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(12715, 5) (12715, 5)


In [14]:
model = XGBClassifier()

model.fit(sample_train.loc[:, ~sample_train.columns.isin(['target', 'class_test'])].values, 
          sample_train['target'].values)
y_predict = model.predict(sample_test.loc[:, ~sample_test.columns.isin(['target', 'class_test'])].values)
evaluate_results(sample_test['target'].values, y_predict)

Classification results:
f1: 99.53%
roc: 99.90%
recall: 99.99%
precision: 99.07%
log loss: 5.378451916471339


In [15]:
metrics['method'].append('25% negative sampling')
metrics['f1_score'].append(f1_score(sample_test['target'].values, y_predict))
metrics['roc'].append(roc_auc_score(sample_test['target'].values, y_predict))
metrics['recall'].append(recall_score(sample_test['target'].values, y_predict, average='binary'))
metrics['precision'].append(precision_score(sample_test['target'].values, y_predict, average='binary'))

**Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)**

In [16]:
mod_df = df.copy()
# получаем индексы positive samples
pos_ind = np.where(mod_df['target'].values == 1)[0]
# перемешиваем
np.random.shuffle(pos_ind)
# оставляем 0.2% positives размеченными
pos_sample_len = int(np.ceil(0.002 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

mod_df['class_test'] = -1
mod_df.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_df['class_test'].value_counts())

X = mod_df.loc[:, ~mod_df.columns.isin(['target', 'class_test'])].values # just the X 
y_labeled = mod_df['class_test'].values # new class (just the P & U)
y_positive = mod_df['target'].values # original class

mod_df = mod_df.sample(frac=1)
neg_sample = mod_df[mod_df['class_test']==-1][:len(mod_df[mod_df['class_test']==1])]
sample_test = mod_df[mod_df['class_test']==-1][len(mod_df[mod_df['class_test']==1]):]
pos_sample = mod_df[mod_df['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

model = XGBClassifier()

model.fit(sample_train.loc[:, ~sample_train.columns.isin(['target', 'class_test'])].values, 
          sample_train['target'].values)
y_predict = model.predict(sample_test.loc[:, ~sample_test.columns.isin(['target', 'class_test'])].values)
evaluate_results(sample_test['target'].values, y_predict)

Using 102/50859 as positives and unlabeling the rest
target variable:
 -1    244955
 1       102
Name: class_test, dtype: int64
(102, 5) (102, 5)
Classification results:
f1: 82.56%
roc: 94.30%
recall: 99.44%
precision: 70.58%
log loss: 300.71694229543874


In [17]:
metrics['method'].append('0.2% negative sampling')
metrics['f1_score'].append(f1_score(sample_test['target'].values, y_predict))
metrics['roc'].append(roc_auc_score(sample_test['target'].values, y_predict))
metrics['recall'].append(recall_score(sample_test['target'].values, y_predict, average='binary'))
metrics['precision'].append(precision_score(sample_test['target'].values, y_predict, average='binary'))

**Cравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)**

In [18]:
metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,method,f1_score,roc,recall,precision
0,original,0.999214,0.999613,0.999509,0.99892
1,25% negative sampling,0.995278,0.999035,0.999917,0.990682
2,0.2% negative sampling,0.825597,0.943029,0.994423,0.705775
