In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, roc_auc_score

In [3]:
# The data file uses ';' as its field delimiter. The delimiter is passed by
# keyword because pandas 2.0+ accepts only the path positionally in read_csv
# and raises TypeError for a positional separator.
df = pd.read_csv('train_case2.csv', sep=';')

In [4]:
# Inspect column dtypes and non-null counts before any preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [5]:
# Preview the first rows of the raw frame (head() shows five rows by default).
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [6]:
# One-hot encode the 'gender' and 'cholesterol' columns.
# The dummy dtype is pinned explicitly: pandas 2.0 changed the default from
# uint8 to bool, which would turn the 0/1 indicator columns into True/False
# and drop them from the numeric describe() summary shown further down.
df = pd.get_dummies(df, columns=['gender', 'cholesterol'], dtype=np.uint8)
df.head(5)

Unnamed: 0,id,age,height,weight,ap_hi,ap_lo,gluc,smoke,alco,active,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3
0,0,18393,168,62.0,110,80,1,0,0,1,0,0,1,1,0,0
1,1,20228,156,85.0,140,90,1,0,0,1,1,1,0,0,0,1
2,2,18857,165,64.0,130,70,1,0,0,0,1,1,0,0,0,1
3,3,17623,169,82.0,150,100,1,0,0,1,1,0,1,1,0,0
4,4,17474,156,56.0,100,60,1,0,0,0,0,1,0,1,0,0


In [7]:
# Standardise the continuous columns; the indicator/ordinal columns are left as-is.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed in a later cell, so test-set statistics leak into
# the features used for training. Consider fitting on the training part only
# (e.g. inside a Pipeline) — this would require reordering the cells.
scaler = StandardScaler()
cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
scaled = scaler.fit_transform(df[cols])
# Overwrites the original columns in place with their scaled versions.
df[cols] = scaled
# Summary statistics to sanity-check the scaling result.
df.describe()

Unnamed: 0,id,age,height,weight,ap_hi,ap_lo,gluc,smoke,alco,active,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,5.272227e-16,1.450116e-15,-2.905105e-16,7.623108000000001e-17,1.7459050000000003e-17,1.226457,0.088129,0.053771,0.803729,0.4997,0.650429,0.349571,0.748357,0.136414,0.115229
std,28851.302323,1.000007,1.000007,1.000007,1.000007,1.000007,0.57227,0.283484,0.225568,0.397179,0.500003,0.476838,0.476838,0.43396,0.34323,0.3193
min,0.0,-3.514407,-13.32014,-4.460075,-1.810381,-0.8841161,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25006.75,-0.7315341,-0.652763,-0.639477,-0.05725127,-0.0882385,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50001.5,0.09489744,0.07804703,-0.1532192,-0.05725127,-0.0882385,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
75%,74889.25,0.7531244,0.6870554,0.5414349,0.07261016,-0.03517999,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
max,99999.0,1.720199,10.43119,8.738353,103.1826,57.85165,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
def get_metrics(y_test, y_score, b=1):
    """Find the threshold that maximises the F-beta score and report metrics at it.

    Parameters
    ----------
    y_test : array-like
        Ground-truth binary labels (pandas Series, list or ndarray).
    y_score : array-like
        Scores for the positive class, one per element of ``y_test``.
    b : float, default 1
        Beta of the F-beta score; ``b=1`` gives the usual F1.

    Returns
    -------
    dict
        Keys ``'threshold'``, ``'fscore'``, ``'precision'``, ``'recall'``
        (all taken at the F-beta-optimal threshold) and ``'roc_auc'``.
    """
    # np.asarray accepts a Series as well as plain lists/arrays, unlike `.values`.
    y_true = np.asarray(y_test)

    precision, recall, thresholds = precision_recall_curve(y_true, y_score)

    # precision_recall_curve returns one more precision/recall point than
    # thresholds: the final (precision=1, recall=0) point has no threshold.
    # Drop it so that every candidate index is valid for `thresholds`.
    precision = precision[:-1]
    recall = recall[:-1]

    numerator = (1 + b**2) * precision * recall
    denominator = b**2 * precision + recall
    # Where precision and recall are both 0 the ratio is 0/0. Define the score
    # as 0 there: a NaN would otherwise be picked by np.argmax and turn every
    # reported metric into NaN.
    fscore = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )
    ix = np.argmax(fscore)

    return {
        'threshold': thresholds[ix],
        'fscore': fscore[ix],
        'precision': precision[ix],
        'recall': recall[ix],
        'roc_auc': roc_auc_score(y_true, y_score),
    }

def run_classifier_with_cv(clf, X_train, X_test, y_train, y_test, cv, scoring):
    """Cross-validate a classifier, then fit it and score it on the hold-out set.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.
    cv : passed through to ``cross_val_score`` as ``cv``.
    scoring : passed through to ``cross_val_score`` as ``scoring``.

    Returns
    -------
    dict
        ``'cv_mean'`` and ``'cv_std'`` of the cross-validation scores,
        followed by everything returned by ``get_metrics`` for the hold-out set.

    Note: ``clf`` is fitted on the training data as a side effect.
    """
    fold_scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring=scoring)
    summary = {
        'cv_mean': np.mean(fold_scores),
        'cv_std': np.std(fold_scores),
    }

    clf.fit(X_train, y_train)
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = clf.predict_proba(X_test)[:, 1]

    return {**summary, **get_metrics(y_test, positive_scores)}

In [9]:
# Split into train and hold-out parts. 'id' is dropped along with the target
# so it is not used as a feature; the split size is left at the library
# default and the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['id', 'cardio']),
    df['cardio'],
    random_state=0,
)

In [10]:
# Candidate models, keyed by the label printed above each result.
clfs = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(max_depth=7, random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=10),
}

# Evaluate every candidate with the same CV setup and the same hold-out set.
for name, clf in clfs.items():
    print(name)
    print(run_classifier_with_cv(clf, X_train, X_test, y_train, y_test,
                                 cv=16, scoring='roc_auc'))

LogisticRegression
{'cv_mean': 0.7867088338068362, 'cv_std': 0.008521472702692753, 'threshold': 0.3864257110218679, 'fscore': 0.7302558956347216, 'precision': 0.6468444444444444, 'recall': 0.8383640552995392, 'roc_auc': 0.7840057551438394}
RandomForestClassifier
{'cv_mean': 0.7989775933676759, 'cv_std': 0.0068016338292253556, 'threshold': 0.3976489520646465, 'fscore': 0.7389668874172186, 'precision': 0.6840608141245709, 'recall': 0.8034562211981566, 'roc_auc': 0.7994705097860957}
KNeighborsClassifier
{'cv_mean': 0.712426811671652, 'cv_std': 0.0058571280347927334, 'threshold': 0.3, 'fscore': 0.6961884900386033, 'precision': 0.5661398571119289, 'recall': 0.9038018433179723, 'roc_auc': 0.7121452866861029}


Вывод: в данном случае лучше всего себя показала модель на основе RandomForestClassifier (наибольшие значения ROC AUC и F-score на тестовой выборке); однако результаты могут измениться, если подобрать оптимальные гиперпараметры для каждой из моделей.