In [79]:
import pandas as pd 
import numpy as np
#from numpy.core.umath_tests import inner1d
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.model_selection import train_test_split, cross_val_score,  RepeatedStratifiedKFold,RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score,classification_report,precision_score,recall_score
from sklearn.metrics import classification_report, roc_curve, confusion_matrix
from collections import Counter

# modelos
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import  ExtraTreesClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import sys

In [80]:
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [81]:
data.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [82]:
# data1 : without id
data = data.drop(['id'], axis=1)


In [None]:
# data.isnull().sum()

In [None]:
# data.work_type.value_counts()

In [86]:
label = [1,2,3,4,5]
data["age_mean"] = pd.cut(data["age"], 5, labels = label)
data["age_mean"].value_counts()
bmi_group = data.groupby("age_mean")["bmi"].mean() #将年龄阶段分为五组，分别计算平均的bmi
bmi_group

age_mean
1    20.787661
2    28.601332
3    31.425088
4    31.580618
5    29.415118
Name: bmi, dtype: float64

In [87]:
def bmi_val(cols): #给 bmi 按年龄分配平均值
    bmi = cols[0]
    age_mean = cols[1]
    
    if pd.isnull(bmi):
        if age_mean == 1:
            return 20.7
        elif age_mean == 2:
            return 28.6
        elif age_mean == 3:
            return 31.4
        elif age_mean == 4:
            return 31.6
        elif age_mean == 5:
            return 29.4
    else:
        return bmi

In [88]:
data["bmi"] = data[["bmi","age_mean"]].apply(bmi_val, axis=1)

In [89]:
data.isnull().sum() 

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
age_mean             0
dtype: int64

In [90]:
data.gender = data.gender.apply(
    lambda x: 0 
    if ((x == 'Female') or (x == 'Other')) 
    else 1)

#将other 改成 Female

In [91]:
data.gender.value_counts()

0    2995
1    2115
Name: gender, dtype: int64

In [93]:
data['age'] = data['age'].apply(lambda x: round(x))
data['age']

0       67
1       61
2       80
3       49
4       79
        ..
5105    80
5106    81
5107    35
5108    51
5109    44
Name: age, Length: 5110, dtype: int64

In [95]:
ohe = OneHotEncoder(drop = 'first', sparse=False, handle_unknown = 'error')
data_t = pd.DataFrame(ohe.fit_transform(data.select_dtypes('object')))

data_t.columns = ohe.get_feature_names()
data_t.head(2)
data_t.shape

(5110, 9)

In [97]:
data_n = data.drop(data.select_dtypes('object'), axis=1)

data = pd.concat([data_t, data_n], axis=1)
data.head(5)

Unnamed: 0,x0_Yes,x1_Never_worked,x1_Private,x1_Self-employed,x1_children,x2_Urban,x3_formerly smoked,x3_never smoked,x3_smokes,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,age_mean
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1,67,0,1,228.69,36.6,1,5
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,61,0,0,202.21,31.6,1,4
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,80,0,1,105.92,32.5,1,5
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,49,0,0,171.23,34.4,1,3
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,79,1,0,174.12,24.0,1,5


In [98]:
k_values = [1, 2, 3, 4, 5, 6, 7]
X = data.drop(['stroke'], axis=1)
y = data.stroke

In [99]:
# print(X.head(5))
# print(y.head(2))

Unnamed: 0,x0_Yes,x1_Never_worked,x1_Private,x1_Self-employed,x1_children,x2_Urban,x3_formerly smoked,x3_never smoked,x3_smokes,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,age_mean
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1,67,0,1,228.69,36.6,5
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,61,0,0,202.21,31.6,4
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,80,0,1,105.92,32.5,5
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,49,0,0,171.23,34.4,3
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,79,1,0,174.12,24.0,5


In [101]:
k_values = [1, 2, 3, 4, 5, 6, 7]
X = data.drop(['stroke'], axis=1)
y = data.stroke

k_scores = []
for k in k_values:
    # define pipeline
    rf = RandomForestClassifier()
    over  = SMOTE(sampling_strategy = 0.1, k_neighbors = k)
    steps = [('over', over), ('model', rf)]
    
    pipeline = Pipeline(steps=steps)
    
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    
    scores = cross_val_score(rf, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = np.mean(scores)
    k_scores.append((k, score))
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.809
> k=2, Mean ROC AUC: 0.808
> k=3, Mean ROC AUC: 0.810
> k=4, Mean ROC AUC: 0.809
> k=5, Mean ROC AUC: 0.802
> k=6, Mean ROC AUC: 0.809
> k=7, Mean ROC AUC: 0.807


In [102]:
sorted(k_scores, key=lambda tup: tup[1])

[(5, 0.802101185697828),
 (7, 0.8074595131864695),
 (2, 0.8082619478174653),
 (4, 0.8085190121485086),
 (6, 0.809063855017844),
 (1, 0.8092805937925148),
 (3, 0.8104500673195821)]

In [103]:
best_k = sorted(k_scores, key=lambda tup: tup[1])[-1][0]
best_k

3

In [104]:
over  = SMOTE(k_neighbors=best_k)

steps = [('over', over)]
pipeline = Pipeline(steps=steps)
X, y = pipeline.fit_resample(X, y)
counter = Counter(y)
print(counter)

Counter({1: 4861, 0: 4861})


  return f(*args, **kwargs)


In [None]:
# print(y.value_counts())

In [152]:
 X.isnull().sum()

x0_Yes                  0
x1_Never_worked         0
x1_Private              0
x1_Self-employed        0
x1_children             0
x2_Urban                0
x3_formerly smoked      0
x3_never smoked         0
x3_smokes               0
gender                  0
age                     0
hypertension            0
heart_disease           0
avg_glucose_level       0
bmi                     0
age_mean              496
dtype: int64

In [160]:
X = X.drop(['age_mean'], axis=1)  # 删除 age_mean 列
# X.isnull().sum()


In [164]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify = y, random_state=42)

In [165]:
X_train.columns

Index(['x0_Yes', 'x1_Never_worked', 'x1_Private', 'x1_Self-employed',
       'x1_children', 'x2_Urban', 'x3_formerly smoked', 'x3_never smoked',
       'x3_smokes', 'gender', 'age', 'hypertension', 'heart_disease',
       'avg_glucose_level', 'bmi'],
      dtype='object')

In [166]:
rf = RandomForestClassifier()

n_estimators = [100,500,700,1000]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

grid =         {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

#search_nc = GridSearchCV(estimator = pipeline, param_grid = grid, cv = 5, verbose=1, n_jobs=-1)
search = RandomizedSearchCV(estimator = rf, param_distributions = grid, cv = 5, verbose=1, n_jobs=-1)

In [167]:
search.fit(X_train, y_train)
preds = search.predict(X_test)
print(classification_report(y_test, preds))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       973
           1       0.99      0.97      0.98       972

    accuracy                           0.98      1945
   macro avg       0.98      0.98      0.98      1945
weighted avg       0.98      0.98      0.98      1945

