In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from IPython.display import display
#import sweetviz as sv
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split, cross_val_score,  RepeatedStratifiedKFold,RandomizedSearchCV, GridSearchCV


from sklearn.metrics import classification_report, roc_curve, confusion_matrix

from collections import Counter

from sklearn.impute import KNNImputer

# modelos
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, \
VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df = df.drop(['id'], axis=1)#去掉无意义的ID属性
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [3]:
labels = [1,2,3,4,5]
df["age_mean"] = pd.cut(df["age"], 5, labels=labels)
df["age_mean"].value_counts()
grp_bmi = df.groupby("age_mean")["bmi"].mean()                 #将年龄阶段分为五组，分别计算平均的bmi
grp_bmi

age_mean
1    20.787661
2    28.601332
3    31.425088
4    31.580618
5    29.415118
Name: bmi, dtype: float64

In [4]:
def bmi_val(cols): #给 bmi 按年龄分配平均值   
    bmi = cols[0]
    age_mean = cols[1]
    
    if pd.isnull(bmi):
        if age_mean == 1:
            return 20.7
        elif age_mean == 2:
            return 28.6
        elif age_mean == 3:
            return 31.4
        elif age_mean == 4:
            return 31.6
        elif age_mean == 5:
            return 29.4
    else:
        return bmi
df["bmi"] = df[["bmi","age_mean"]].apply(bmi_val, axis=1)


In [5]:
df["bmi"] 

0       36.6
1       31.6
2       32.5
3       34.4
4       24.0
        ... 
5105    29.4
5106    40.0
5107    30.6
5108    25.6
5109    26.2
Name: bmi, Length: 5110, dtype: float64

In [6]:
df.drop("age_mean",axis=1, inplace=True)#删掉之前建的age_mean属性
df.isnull().sum() 

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [7]:
df.gender.unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [8]:
df.gender.value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [9]:
df.gender = df.gender.apply(lambda x: 0 if ((x == 'Female') or (x == 'Other')) else 1)#将other 改成 Female

In [10]:
df.gender.value_counts()

0    2995
1    2115
Name: gender, dtype: int64

In [11]:
ohe = OneHotEncoder(drop = 'first', sparse=False, handle_unknown = 'error')#使用onehot 将object类型类型都变成数字
df_t = pd.DataFrame(ohe.fit_transform(df.select_dtypes('object')))

df_t.columns = ohe.get_feature_names()
df_t.head(2)
df_t.shape

(5110, 9)

In [12]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,0,61.0,0,0,Yes,Self-employed,Rural,202.21,31.6,never smoked,1
2,1,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,0,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,0,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,Yes,Private,Urban,83.75,29.4,never smoked,0
5106,0,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,0,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,1,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [13]:
df_n = df.drop(df.select_dtypes('object'), axis=1)

df = pd.concat([df_t, df_n], axis=1)
df.head(2)


Unnamed: 0,x0_Yes,x1_Never_worked,x1_Private,x1_Self-employed,x1_children,x2_Urban,x3_formerly smoked,x3_never smoked,x3_smokes,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1,67.0,0,1,228.69,36.6,1
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,61.0,0,0,202.21,31.6,1


In [14]:
df.dtypes
from sklearn.model_selection import train_test_split
X = df.drop(columns=['stroke'])
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


In [15]:
X_train

Unnamed: 0,x0_Yes,x1_Never_worked,x1_Private,x1_Self-employed,x1_children,x2_Urban,x3_formerly smoked,x3_never smoked,x3_smokes,gender,age,hypertension,heart_disease,avg_glucose_level,bmi
845,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0,48.0,0,0,69.21,33.1
3744,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,15.0,0,0,122.25,21.0
4183,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,67.0,0,0,110.42,24.9
3409,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1,44.0,0,0,65.41,24.8
284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,14.0,0,0,82.34,31.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1434,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0,45.0,0,0,92.86,35.1
461,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,16.0,0,0,113.47,19.5
1052,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,61.0,0,0,78.65,36.2
1757,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,31.0,0,0,74.05,26.0


In [17]:
classifier = LogisticRegression() 
classifier.fit(X_train, y_train) 
score = classifier.score(X_test, y_test)
print("Accuracy :",score)


Accuracy : 0.9510763209393346


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
