In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.pipeline import FeatureUnion

In [4]:
# Location of the semicolon-separated training CSV. The default is the original
# machine-specific path; set the TRAIN_CASE2_CSV environment variable to point at
# the file on another machine without editing the notebook.
DATA_PATH = os.environ.get(
    "TRAIN_CASE2_CSV",
    "C:/Users/li_le/geekbrains_study/ML_business/train_case2.csv",
)
data = pd.read_csv(DATA_PATH, sep=';')
data.head(5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [5]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [6]:
# (rows, columns) of the loaded frame.
data.shape

(70000, 13)

In [7]:
# Target vector: the `cardio` column (values 0/1 in the preview below).
Y = data['cardio']
Y.head()

0    0
1    1
2    1
3    1
4    0
Name: cardio, dtype: int64

In [8]:
# Frequency of each distinct `cholesterol` value.
data['cholesterol'].value_counts()

1    52385
2     9549
3     8066
Name: cholesterol, dtype: int64

In [9]:
# Frequency of each distinct `gluc` value.
data['gluc'].value_counts()

1    59479
3     5331
2     5190
Name: gluc, dtype: int64

In [10]:
# Frequency of each distinct `gender` value.
data['gender'].value_counts()


1    45530
2    24470
Name: gender, dtype: int64

In [11]:
# Feature matrix: every column of `data` except the target `cardio` and the `id` column.
X = data.drop(columns=['cardio', 'id'])
X.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18393,2,168,62.0,110,80,1,1,0,0,1
1,20228,1,156,85.0,140,90,3,1,0,0,1
2,18857,1,165,64.0,130,70,3,1,0,0,0
3,17623,2,169,82.0,150,100,1,1,0,0,1
4,17474,1,156,56.0,100,60,1,1,0,0,0


In [12]:
# temp = X['age']/365
# temp.hist()

In [13]:
# def standard_scale(x):
#     res = (x - x.mean(axis=0)) / x.std(axis=0)
#     return res

In [14]:
# X[['age', 'height', 'weight', 'ap_hi', 'ap_lo']] = standard_scale(X[['age', 'height', 'weight', 'ap_hi', 'ap_lo']])
# X.head()

In [15]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=13)

In [18]:
# Class balance of the training target.
y_train.value_counts()

1    24528
0    24472
Name: cardio, dtype: int64

In [19]:
# Class balance of the hold-out target.
y_test.value_counts()

0    10549
1    10451
Name: cardio, dtype: int64

In [20]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that passes through only the configured column(s) of its input."""

    def __init__(self, column):
        # `column` is used verbatim as the indexer in `X[column]`
        # (the notebook passes lists of column labels).
        self.column = column

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return `X` indexed by the configured column(s)."""
        selected = X[self.column]
        return selected
    

In [21]:
# Continuous features: select the five numeric columns, then standardize them.
cont_features = Pipeline(
    steps=[
        ('selector', FeatureSelector(column=['age', 'height', 'weight', 'ap_hi', 'ap_lo'])),
        ('standard', StandardScaler()),
    ]
)

In [22]:
# cont_features.fit_transform(x_train)

In [23]:
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the `key` columns with a column layout frozen at fit time.

    `fit` records the dummy-column names produced by `pd.get_dummies` on the
    fitting data. `transform` always returns exactly those columns, in that
    order, so downstream steps see a stable feature layout.

    Parameters
    ----------
    key : list of column labels
        Columns to encode; also used as the prefixes of the generated dummy columns.
    """

    def __init__(self, key):
        self.key = key
        # Dummy-column names learned in `fit`; empty until the encoder is fitted.
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy-column layout produced by the fitting data."""
        self.columns = list(pd.get_dummies(X, prefix=self.key, columns=self.key).columns)
        return self

    def transform(self, X):
        """Return the one-hot encoding of `X` aligned to the fitted columns.

        Categories seen during `fit` but absent from `X` become all-zero columns
        (previously this case raised KeyError); categories that were not seen
        during `fit` are dropped.
        """
        encoded = pd.get_dummies(X, prefix=self.key, columns=self.key)
        # reindex adds missing fitted columns (filled with 0), drops unknown ones
        # and enforces the fitted column order in a single step.
        return encoded.reindex(columns=self.columns, fill_value=0)

In [24]:
# Categorical features: select the three coded columns, then one-hot encode them.
categorical_features = Pipeline(
    steps=[
        ('selector', FeatureSelector(column=['gender', 'cholesterol', 'gluc'])),
        ('ohe', OHEEncoder(key=['gender', 'cholesterol', 'gluc'])),
    ]
)

In [25]:
# FeatureSelector(column=['gender', 'cholesterol', 'gluc']).fit_transform(X)

In [26]:
# pd.get_dummies(X[['gender', 'cholesterol', 'gluc']], columns=['gender', 'cholesterol', 'gluc'], prefix=['gender', 'cholesterol', 'gluc'])

In [27]:
# categorical_features.fit_transform(x_train)

In [28]:
# Full feature matrix: scaled continuous columns, one-hot encoded categorical
# columns, and the `smoke` / `alco` / `active` columns passed through unchanged.
feature_processing = FeatureUnion(
    transformer_list=[
        ('cont_features', cont_features),
        ('categorical_features', categorical_features),
        ('other', FeatureSelector(column=['smoke', 'alco', 'active'])),
    ]
)

In [29]:
# Three candidate models behind the same preprocessing step.
# AdaBoost and RandomForest are stochastic, so they get a fixed random_state
# (the same value used for the train/test split) to make the reported scores
# reproducible across kernel restarts.
classifier1 = Pipeline([('feature_processing', feature_processing),
                        ('clf', LogisticRegression())])

classifier2 = Pipeline([('feature_processing', feature_processing),
                        ('clf', AdaBoostClassifier(random_state=13))])

classifier3 = Pipeline([('feature_processing', feature_processing),
                        ('clf', RandomForestClassifier(random_state=13))])

In [30]:
# Parallel lists: names[i] is the label under which models[i] is reported.
models = [classifier1, classifier2, classifier3]
names = ['LogisticRegression', 'AdaBoost', 'RandomForest']

In [31]:
# One row per model, one column per metric. Initialised with float zeros so that
# the float scores written later via `.loc` match the column dtype, and sized
# from `names` so the table follows the model list automatically.
results = pd.DataFrame(0.0,
                       index=names,
                       columns=['roc_auc', 'precision', 'recall', 'f_score'])

In [32]:
# For every pipeline: 3-fold cross-validated ROC AUC on the training split, then
# precision / recall / F-score on the hold-out split at the threshold that
# maximises the F-score.
# NOTE(review): the threshold is picked on the hold-out split itself, so the
# reported precision / recall / F-score are optimistic.
for name, model in zip(names, models):
    cv_scores = cross_val_score(model, x_train, y_train, cv=3, scoring='roc_auc')
    cv_score = np.mean(cv_scores)

    model.fit(x_train, y_train)
    y_score = model.predict_proba(x_test)[:, 1]  # second predict_proba column (class 1)

    # Only the precision and recall arrays are needed; the thresholds are not used.
    precision, recall = precision_recall_curve(y_test.values, y_score)[:2]

    # F-score at every curve point. Where precision == recall == 0 the formula is
    # 0/0; define it as 0 there, because np.argmax would otherwise pick the NaN.
    pr_sum = precision + recall
    fscore = np.divide(2 * precision * recall, pr_sum,
                       out=np.zeros_like(pr_sum, dtype=float),
                       where=pr_sum > 0)
    ix = np.argmax(fscore)

    results.loc[name, 'roc_auc'] = cv_score
    results.loc[name, 'precision'] = precision[ix]
    results.loc[name, 'recall'] = recall[ix]
    results.loc[name, 'f_score'] = fscore[ix]

In [33]:
# Summary table: cross-validated ROC AUC plus hold-out precision / recall / F-score per model.
results

Unnamed: 0,roc_auc,precision,recall,f_score
LogisticRegression,0.782131,0.661961,0.807578,0.727555
AdaBoost,0.796318,0.6681,0.817434,0.735261
RandomForest,0.775208,0.641574,0.823653,0.7213


Из проверенных моделей лучшие результаты по ROC AUC, precision и F-score — у AdaBoost (хотя и довольно скромные); по recall чуть впереди RandomForest.

(опциональный вопрос) какая метрика (precision_recall_curve или roc_auc_curve) больше подходит в случае сильного дисбаланса классов? (когда объектов одного из классов намного больше чем другого).
P.S. В вопросе проще разобраться, если вспомнить оси на графике ROC-кривой (FPR по горизонтали, TPR по вертикали) и рассмотреть такой пример:

Имеется 100000 объектов, из которых только 100 - класс "1" (99900 - класс "0", соответственно). Допустим, у нас две модели:

первая помечает 100 объектов как класс 1, но TP = 90
вторая помечает 1000 объектов как класс 1, но TP такой же - 90
Какая модель лучше и почему? И что позволяет легче сделать вывод - roc_auc_curve или precision_recall_curve?

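В приведённом примере у первой модели precision = 90/100 = 0.9, а FPR = 10/99900 ≈ 10^-4; у второй — precision = 90/1000 = 0.09, а FPR = 910/99900 ≈ 9*10^-3. Recall (он же TPR) у обеих моделей одинаков: 90/100 = 0.9. То есть precision просел на порядок, а FPR вырос почти на два порядка, и в относительном выражении кажется, что roc_auc_curve чувствительнее. Однако на графике видны абсолютные значения: FPR изменился всего на ≈0.009, обе точки лежат практически у левого края ROC-кривой, и ROC AUC почти не меняется, тогда как precision упал с 0.9 до 0.09. Поэтому при сильном дисбалансе классов разницу между моделями легче увидеть на precision_recall_curve. Первая модель лучше: при том же recall у неё гораздо меньше ложных срабатываний (10 против 910).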