In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import joblib

In [2]:
dataset=pd.read_csv('healthcare-dataset-stroke-data.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [3]:
dataset.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [4]:
imputation = SimpleImputer(missing_values=np.nan, strategy='mean')
imputation.fit(dataset.iloc[:,[9]])
dataset.iloc[:,[9]] = imputation.transform(dataset.iloc[:,[9]])
dataset.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [5]:
encoder = LabelEncoder()
dataset['gender'] = encoder.fit_transform(dataset['gender'])
dataset['ever_married'] = encoder.fit_transform(dataset['ever_married'])
dataset['work_type'] = encoder.fit_transform(dataset['work_type'])
dataset['Residence_type'] = encoder.fit_transform(dataset['Residence_type'])
dataset['smoking_status'] = encoder.fit_transform(dataset['smoking_status'])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   int32  
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   int32  
 6   work_type          5110 non-null   int32  
 7   Residence_type     5110 non-null   int32  
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   int32  
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int32(5), int64(4)
memory usage: 379.4 KB


In [6]:
dataset.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5105    False
5106    False
5107    False
5108    False
5109    False
Length: 5110, dtype: bool

In [7]:
X = dataset[['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']]
Y = dataset['stroke']
for col in X.columns:
    corr, pval = spearmanr(X[col], Y)
    print(f"Variable {col} : Coefficient de corrélation de rang de Spearman = {corr:.3f}, p-value = {pval:.3f}")

Variable id : Coefficient de corrélation de rang de Spearman = 0.006, p-value = 0.642
Variable gender : Coefficient de corrélation de rang de Spearman = 0.009, p-value = 0.521
Variable age : Coefficient de corrélation de rang de Spearman = 0.250, p-value = 0.000
Variable hypertension : Coefficient de corrélation de rang de Spearman = 0.128, p-value = 0.000
Variable heart_disease : Coefficient de corrélation de rang de Spearman = 0.135, p-value = 0.000
Variable ever_married : Coefficient de corrélation de rang de Spearman = 0.108, p-value = 0.000
Variable work_type : Coefficient de corrélation de rang de Spearman = -0.025, p-value = 0.078
Variable Residence_type : Coefficient de corrélation de rang de Spearman = 0.015, p-value = 0.269
Variable avg_glucose_level : Coefficient de corrélation de rang de Spearman = 0.083, p-value = 0.000
Variable bmi : Coefficient de corrélation de rang de Spearman = 0.055, p-value = 0.000
Variable smoking_status : Coefficient de corrélation de rang de Spea

In [8]:
dataset = dataset.drop(['id'], axis=1)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   int32  
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   int32  
 5   work_type          5110 non-null   int32  
 6   Residence_type     5110 non-null   int32  
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   int32  
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int32(5), int64(3)
memory usage: 339.5 KB


In [9]:
X = dataset.iloc[: , :-1].values.astype(np.float32)
Y = dataset.iloc[: , -1].values.astype(np.float32)
Xtrain , Xtest , Ytrain , Ytest = train_test_split(X , Y , test_size=0.2, random_state=0)

In [10]:
model = LogisticRegression(solver='liblinear', class_weight='balanced')
model.fit(Xtrain, Ytrain)
predict = model.predict(Xtest)
model.score(Xtest, Ytest)

0.7622309197651663

In [11]:
cm = confusion_matrix(Ytest, predict)
cm

array([[742, 226],
       [ 17,  37]], dtype=int64)

In [12]:
data_test = [0, 50, 1, 1, 1, 1, 0, 100, 25, 0]
model.predict([data_test])

array([0.], dtype=float32)

In [13]:
joblib.dump(model, 'stroke_predict')

['stroke_predict']