### Урок 6. Задача lookalike (Positive Unlabeled Learning)

### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [376]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import xgboost as xgb

import matplotlib.pyplot as plt

%matplotlib inline


#### 1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

Сформируем датасет

In [141]:
df = pd.read_csv('heart.dat', ' ')
df.index.name = None

In [142]:
df.columns = ['age', 'sex', 'chest_pain_type', 'pressure', 'cholestoral', 'sugar', 'cardiographic', 'heart_rate', 'angina', 'oldpeak', 'ST_segment', 'flourosopy', 'thal', 'heart_disease']

In [143]:
heart_disease_to_binary = {1: 0, 2: 1}

df['heart_disease'] = df['heart_disease'].replace(heart_disease_to_binary)

In [144]:
df

Unnamed: 0,age,sex,chest_pain_type,pressure,cholestoral,sugar,cardiographic,heart_rate,angina,oldpeak,ST_segment,flourosopy,thal,heart_disease
0,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,0
1,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,1
2,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,0
3,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,0
4,65.0,1.0,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0
265,44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0
266,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0
267,57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0


In [145]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              269 non-null    float64
 1   sex              269 non-null    float64
 2   chest_pain_type  269 non-null    float64
 3   pressure         269 non-null    float64
 4   cholestoral      269 non-null    float64
 5   sugar            269 non-null    float64
 6   cardiographic    269 non-null    float64
 7   heart_rate       269 non-null    float64
 8   angina           269 non-null    float64
 9   oldpeak          269 non-null    float64
 10  ST_segment       269 non-null    float64
 11  flourosopy       269 non-null    float64
 12  thal             269 non-null    float64
 13  heart_disease    269 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 29.5 KB


In [146]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('heart_disease', 1), df['heart_disease'], test_size=0.2, random_state=0)

#### 2. Сделать feature engineering

К полям:

cardiographic, chest_pain_type, thal применим OHE-кодирование\
age, pressure, cholestoral, heart_rate, oldpeak, flourosopy, ST_segment - standardScaler\
sex, sugar, angina - оставим пока как есть

In [147]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]


from sklearn.preprocessing import StandardScaler


continuos_cols = ['age', 'pressure', 'cholestoral', 'heart_rate', 'oldpeak', 'flourosopy', 'ST_segment']
cat_cols = ['cardiographic', 'chest_pain_type', 'thal']
base_cols = ['sex', 'sugar', 'angina']

continuos_transformers = []
cat_transformers = []
base_transformers = []

for cont_col in continuos_cols:
    transfomer =  Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    continuos_transformers.append((cont_col, transfomer))
    
for cat_col in cat_cols:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    cat_transformers.append((cat_col, cat_transformer))
    
for base_col in base_cols:
    base_transformer = Pipeline([
                ('selector', NumberSelector(key=base_col))
            ])
    base_transformers.append((base_col, base_transformer))

In [148]:
final_transformers = list()

for cat_col in cat_cols:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuos_cols:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('scale', StandardScaler())
               
            ])


    final_transformers.append((cont_col, cont_transformer))

In [149]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

#### 3. Обучить любой классификатор (какой вам нравится)

In [150]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

In [109]:
# pipeline = Pipeline([
#     ('features',feats),
#     ('classifier', xgb.XGBClassifier(random_state = 42)),
# ])

In [151]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('cardiographic',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key='cardiographic')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='cardiographic'))])),
                                                ('chest_pain_type',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key='chest_pain_type')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='chest_pain_type'))])),
                                                ('thal',
                                                 Pipeline(st

In [111]:
# y_predict = pipeline.predict(X_test)

In [152]:
preds = pipeline.predict_proba(X_test)[:, 1]
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, preds)
fscore_rf = (2*precision_rf * recall_rf) / (precision_rf + recall_rf)
# locate the index of the largest f score
ix = np.argmax(fscore_rf)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds_rf[ix], 
                                                                        fscore_rf[ix],
                                                                        precision_rf[ix],
                                                                        recall_rf[ix]))
Random_Forest_Classifier = thresholds_rf[ix], fscore_rf[ix], precision_rf[ix], recall_rf[ix]


Best Threshold=0.350000, F-Score=0.842, Precision=0.828, Recall=0.857


In [113]:
# from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

# def evaluate_results(y_test, y_predict):
#     print('Classification results:')
#     f1 = f1_score(y_test, y_predict)
#     print("f1: %.2f%%" % (f1 * 100.0)) 
#     rec = recall_score(y_test, y_predict, average='binary')
#     print("recall: %.2f%%" % (rec * 100.0)) 
#     prc = precision_score(y_test, y_predict, average='binary')
#     print("precision: %.2f%%" % (prc * 100.0)) 
    

    
# evaluate_results(y_test, y_predict)

Classification results:
f1: 76.60%
recall: 64.29%
precision: 94.74%


#### 4. Далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [222]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 30/119 as positives and unlabeling the rest


In [223]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    239
 1     30
Name: class_test, dtype: int64


In [224]:
mod_data.tail(10)

Unnamed: 0,age,sex,chest_pain_type,pressure,cholestoral,sugar,cardiographic,heart_rate,angina,oldpeak,ST_segment,flourosopy,thal,heart_disease,class_test
259,58.0,0.0,3.0,120.0,340.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0,3.0,0,-1
260,60.0,1.0,4.0,130.0,206.0,0.0,2.0,132.0,1.0,2.4,2.0,2.0,7.0,1,1
261,58.0,1.0,2.0,120.0,284.0,0.0,2.0,160.0,0.0,1.8,2.0,0.0,3.0,1,-1
262,49.0,1.0,2.0,130.0,266.0,0.0,0.0,171.0,0.0,0.6,1.0,0.0,3.0,0,-1
263,48.0,1.0,2.0,110.0,229.0,0.0,0.0,168.0,0.0,1.0,3.0,0.0,7.0,1,1
264,52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0,-1
265,44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0,-1
266,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0,-1
267,57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0,-1
268,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1,-1


In [225]:
x_data = mod_data.iloc[:,:-2].values # только X 
y_labeled = mod_data.iloc[:,-1].values # класс для PUL ( P & U)
y_positive = mod_data.iloc[:,-2].values # истиная разметка

In [226]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(30, 15) (30, 15)


In [227]:
sample_train.head()

Unnamed: 0,age,sex,chest_pain_type,pressure,cholestoral,sugar,cardiographic,heart_rate,angina,oldpeak,ST_segment,flourosopy,thal,heart_disease,class_test
223,35.0,0.0,4.0,138.0,183.0,0.0,0.0,182.0,0.0,1.4,1.0,0.0,3.0,0,-1
208,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,-1
43,59.0,1.0,3.0,126.0,218.0,1.0,0.0,134.0,0.0,2.2,2.0,1.0,6.0,1,1
248,60.0,1.0,4.0,145.0,282.0,0.0,2.0,142.0,1.0,2.8,2.0,2.0,7.0,1,1
92,54.0,1.0,4.0,110.0,206.0,0.0,2.0,108.0,1.0,0.0,2.0,1.0,3.0,1,1


#### 5. Применить random negative sampling для построения классификатора в новых условиях

In [228]:
# model = xgb.XGBClassifier(use_label_encoder=False)
model = RandomForestClassifier()
sample_train.loc[sample_train['class_test']==-1,'class_test']=0
model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-1].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 71.14%
recall: 67.09%
precision: 75.71%


In [229]:
model = xgb.XGBClassifier(use_label_encoder=False)
# model = RandomForestClassifier()
sample_train.loc[sample_train['class_test']==-1,'class_test']=0
model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-1].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 72.48%
recall: 68.35%
precision: 77.14%




#### 6. Сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик), поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [231]:
df1 = pd.read_csv('tab.csv', ';')
df1

Unnamed: 0,Model,F-score(F1),Precision,Recall
0,RF,842,828,857
1,XGB,766,947,643
2,"RF(0,1)",629,607,654
3,"RF(0,25)",698,663,738
4,"RF(0,5)",711,8,64
5,"XGB(0,1)",517,462,587
6,"XGB(0,25)",639,608,675
7,"XGB(0,5)",644,725,58


#### Бонусное задание:

In [364]:
dtf = pd.read_csv('train_PUL.csv', index_col=0)
dtf.head()

Unnamed: 0,id,taxactionSystem,regdt,workerCount,fssdccount,pfrdcCount,fnsdcCount,hasCloudCryptCertificate,OrgCreationDate,documentsCount,cnt_users,target
0,101969,"УСН, Доходы 6%",1994-03-26 00:00:00,5.0,0.0,0.0,0.0,0,2014-12-04,2.0,0.0,1.0
1,108477,ОСНО,2012-12-05 00:00:00,1.0,0.0,0.0,0.0,0,2014-09-22,5.0,0.0,-1.0
2,101476,"УСН, Доходы 6%",2009-11-15 00:00:00,1.0,3.0,3.0,2.0,0,2016-04-05,0.0,0.0,-1.0
3,102579,"УСН, Доходы за вычетом расходов 15%",2015-08-09 00:00:00,1.0,0.0,0.0,0.0,0,2015-09-24,0.0,0.0,-1.0
4,103110,"УСН, Доходы за вычетом расходов 15%",2015-03-29 00:00:00,0.0,0.0,0.0,0.0,0,2015-04-03,4.0,0.0,-1.0


In [365]:
dtf['taxactionSystem'].value_counts()

ОСНО                                           2752
УСН, Доходы 6%                                 2004
УСН, Доходы за вычетом расходов 15%            1295
ЕНВД                                            681
УСН, Доходы за вычетом расходов 7%              230
УСН, Доходы 6% +ЕНВД                            136
УСН, Доходы за вычетом расходов 10%             134
УСН, Доходы за вычетом расходов 5%              133
УСН, Доходы 3%                                  107
УСН, Доходы за вычетом расходов 15% +ЕНВД        66
УСН, Доходы за вычетом расходов 7% +ЕНВД         20
УСН, Доходы за вычетом расходов 12,5%            17
УСН, Доходы за вычетом расходов 5% +ЕНВД         13
УСН, Доходы 3% +ЕНВД                             12
УСН, Доходы за вычетом расходов 6%                8
УСН, Доходы за вычетом расходов 12%               6
УСН, Доходы за вычетом расходов 10% +ЕНВД         4
УСН, Доходы за вычетом расходов 7,5%              2
УСН, Доходы 4%                                    2
УСН, Доходы 

In [366]:
one_hot = pd.get_dummies(dtf['taxactionSystem'])
# Drop column as it is now encoded
dtf = dtf.drop('taxactionSystem',axis = 1)
# Join the encoded df
dtf = dtf.join(one_hot)
dtf 

Unnamed: 0,id,regdt,workerCount,fssdccount,pfrdcCount,fnsdcCount,hasCloudCryptCertificate,OrgCreationDate,documentsCount,cnt_users,...,"УСН, Доходы за вычетом расходов 13%","УСН, Доходы за вычетом расходов 15%","УСН, Доходы за вычетом расходов 15% +ЕНВД","УСН, Доходы за вычетом расходов 5%","УСН, Доходы за вычетом расходов 5% +ЕНВД","УСН, Доходы за вычетом расходов 6%","УСН, Доходы за вычетом расходов 7%","УСН, Доходы за вычетом расходов 7% +ЕНВД","УСН, Доходы за вычетом расходов 7,5%","УСН, Доходы за вычетом расходов 9%"
0,101969,1994-03-26 00:00:00,5.0,0.0,0.0,0.0,0,2014-12-04,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,108477,2012-12-05 00:00:00,1.0,0.0,0.0,0.0,0,2014-09-22,5.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,101476,2009-11-15 00:00:00,1.0,3.0,3.0,2.0,0,2016-04-05,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,102579,2015-08-09 00:00:00,1.0,0.0,0.0,0.0,0,2015-09-24,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
4,103110,2015-03-29 00:00:00,0.0,0.0,0.0,0.0,0,2015-04-03,4.0,0.0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7621,102780,2009-01-15 00:00:00,0.0,0.0,0.0,2.0,1,2015-01-19,5.0,0.0,...,0,1,0,0,0,0,0,0,0,0
7622,104557,2012-11-20 00:00:00,0.0,1.0,2.0,3.0,1,2015-01-07,7.0,1.0,...,0,0,0,0,0,0,0,0,0,0
7623,107161,2005-04-19 00:00:00,2.0,0.0,0.0,0.0,0,2000-01-01,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7624,104526,2008-10-07 00:00:00,0.0,0.0,0.0,0.0,0,2014-11-20,6.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [373]:
cols = list(dtf.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('target')) #Remove b from list
dtf = dtf[cols+['target']] #Create new dataframe with columns in the order you want

In [368]:
# dtf['OrgCreationDate'] = pd.to_datetime(dtf['OrgCreationDate'])
# dtf['regdt'] = pd.to_datetime(dtf['regdt'], format = '%Y-%m-%d', errors = 'coerce')

In [369]:
dtf['OrgCreationDate'] = (pd.to_datetime(dtf['OrgCreationDate']) - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
dtf['regdt'] = (pd.to_datetime(dtf['regdt'], format = '%Y-%m-%d', errors = 'coerce') - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s') 

In [370]:
dtf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7626 entries, 0 to 7625
Data columns (total 34 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   id                                           7626 non-null   int64  
 1   regdt                                        7598 non-null   float64
 2   workerCount                                  7626 non-null   float64
 3   fssdccount                                   7626 non-null   float64
 4   pfrdcCount                                   7626 non-null   float64
 5   fnsdcCount                                   7626 non-null   float64
 6   hasCloudCryptCertificate                     7626 non-null   int64  
 7   OrgCreationDate                              7626 non-null   int64  
 8   documentsCount                               7626 non-null   float64
 9   cnt_users                                    7626 non-null   float64
 10  

In [388]:
dtf['target'].value_counts()

 1.0    3942
-1.0    3684
Name: target, dtype: int64

In [396]:
dtf = dtf.dropna()

In [397]:
X_train, X_test, y_train, y_test = train_test_split(dtf, dtf['target'], random_state=0)

In [400]:
clf = LogisticRegression(random_state=0)

In [401]:
clf.fit(X_train,  y_train)

LogisticRegression(random_state=0)

In [405]:
preds = clf.predict(X_test)
preds[:10]

array([-1., -1.,  1.,  1., -1., -1., -1., -1., -1., -1.])

In [409]:
rez_df = pd.DataFrame(data = preds, columns=['target_pul'])

rez_df['target_decision_pul'] = 0

rez_df.head()

Unnamed: 0,target_pul,target_decision_pul
0,-1.0,0
1,-1.0,0
2,1.0,0
3,1.0,0
4,-1.0,0


In [410]:
rez_df.to_csv('df_name_decision.csv')

К сожалению оченть мало времени (EDA - так себе получился....) 