### Приступая к работе

Сначала обучу модель обычным способом

In [213]:
import pandas as pd
import numpy as np

In [214]:
df = pd.read_csv("pima-indians-diabetes.csv", header=None)
df.columns = [
    'Pregnant Times',
    'Glucose Tolerance Test',
    'Diastolic Blood Pressure',
    'Triceps Skin Fold Thickness',
    'Serum Insulin',
    'Body Mass Index',
    'Diabetes Pedigree Function',
    'Age',
    'Class'
]
df.head(3)

Unnamed: 0,Pregnant Times,Glucose Tolerance Test,Diastolic Blood Pressure,Triceps Skin Fold Thickness,Serum Insulin,Body Mass Index,Diabetes Pedigree Function,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [215]:
from sklearn.model_selection import train_test_split

In [216]:
X = data.drop('Class', axis=1)
y = data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)

In [217]:
from sklearn.ensemble import GradientBoostingClassifier

In [218]:
model = GradientBoostingClassifier(random_state=21)
model.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=21, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [219]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("F1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("ROC-AUC: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict)
    print("Recall: %.2f%%" % (rec * 100.0)) 
    pr = precision_score(y_test, y_predict)
    print("Precision: %.2f%%" % (pr * 100.0)) 

In [220]:
pred = model.predict(X_test)
evaluate_results(y_test, pred)

Classification results:
F1: 61.74%
ROC-AUC: 70.88%
Recall: 52.87%
Precision: 74.19%


### PU learning

Теперь применю обучение на основе положительных и неразмеченных данных (PU learning)

In [221]:
def pu(df, P = 0.25):
    pu_df = df.copy()
    pos_ind = np.where(pu_df.iloc[:,-1].values == 1)[0]

    rand_state = np.random.RandomState(21)
    rand_state.shuffle(pos_ind)
    
    pos_sample_len = int(np.ceil(len(pos_ind) * P))
    pos_sample = pos_ind[:pos_sample_len]

    pu_df['PU Class'] = -1
    pu_df.loc[pos_sample, 'PU Class'] = 1
    
    return pu_df

In [222]:
def random_negative_sample(df):
    data = df.copy()
    data = data.sample(frac=1, random_state=21)
    
    neg_sample = data[data['PU Class']==-1][:len(data[data['PU Class']==1])]
    pos_sample = data[data['PU Class']==1]
    
    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    sample_train.drop('PU Class', axis=1, inplace=True)
    
    sample_test = data[data['PU Class']==-1][len(data[data['PU Class']==1]):]
    sample_test.drop('PU Class', axis=1, inplace=True)
    
    return (sample_train, sample_test)

In [223]:
pu_df = pu(df)
pu_df['PU Class'].value_counts()

-1    701
 1     67
Name: PU Class, dtype: int64

In [224]:
train, test = random_negative_sample(pu_df)

In [225]:
model = GradientBoostingClassifier(random_state=21)
model.fit(train.drop('Class', axis=1), train['Class'])

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=21, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [231]:
pred = model.predict(test.drop('Class', axis=1))
evaluate_results(test['Class'], pred)

Classification results:
F1: 55.13%
ROC-AUC: 68.30%
Recall: 81.01%
Precision: 41.79%


Оценю качество моделей, обученных на данных с разной долей P

In [232]:
def train_evaluate(P):
    train, test = random_negative_sample(pu(df, P))
    
    model = GradientBoostingClassifier(random_state=21)
    model.fit(train.drop('Class', axis=1), train['Class'])
    
    pred = model.predict(test.drop('Class', axis=1))
    evaluate_results(test['Class'], pred)

In [233]:
train_evaluate(P=0.2)

Classification results:
F1: 56.42%
ROC-AUC: 67.60%
Recall: 84.34%
Precision: 42.39%


In [234]:
train_evaluate(P=0.25)

Classification results:
F1: 55.24%
ROC-AUC: 68.41%
Recall: 81.01%
Precision: 41.91%


In [235]:
train_evaluate(P=0.5)

Classification results:
F1: 46.86%
ROC-AUC: 69.67%
Recall: 81.19%
Precision: 32.93%


Конец!