In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the stroke dataset from the CSV file into `df`.
df = pd.read_csv("Brain_Stroke_Data/full_data.csv")
# Show the loaded frame as the cell output.
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [4]:
#dataset information: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [8]:
# Numeric columns used for a quick look at the raw feature rows.
feature_columns = ["avg_glucose_level", "age", "hypertension", "heart_disease", "bmi"]

# One tuple per patient for the features, and the matching stroke labels.
X = df[feature_columns].apply(tuple, axis=1).tolist()
y = df["stroke"].tolist()

# Preview the first ten labels and feature tuples.
print(f"Labels: {y[:10]}")
print(f"Data: {X[:10]}")

Labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Data: [(228.69, 67.0, 0.0, 1.0, 36.6), (105.92, 80.0, 0.0, 1.0, 32.5), (171.23, 49.0, 0.0, 0.0, 34.4), (174.12, 79.0, 1.0, 0.0, 24.0), (186.21, 81.0, 0.0, 0.0, 29.0), (70.09, 74.0, 1.0, 1.0, 27.4), (94.39, 69.0, 0.0, 0.0, 22.8), (58.57, 78.0, 0.0, 0.0, 24.2), (80.43, 81.0, 1.0, 0.0, 29.7), (120.46, 61.0, 0.0, 1.0, 36.8)]


In [10]:
#summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0


In [11]:
#summary of the object-dtype columns only: count, unique, top and freq
df.select_dtypes(include=['object']).describe()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,4981,4981,4981,4981,4981
unique,2,2,4,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2907,3280,2860,2532,1838


In [19]:
#for every object-dtype column, show how the stroke patients (stroke == 1) are distributed
col=df.select_dtypes(include=['object']).columns.tolist()
for i in col:
    #value counts of the column within each stroke group
    by_stroke = df.groupby(['stroke'])[i]
    #keep only the stroke == 1 group: absolute counts and shares
    count = by_stroke.value_counts()[1]
    percent = by_stroke.value_counts(normalize=True)[1]

    summary = pd.DataFrame({"Patients":count, "Percent":percent*100})
    summary = summary.sort_values("Percent", ascending=False)

    #caption with the variable name, percent formatting, and the column maxima in bold
    styled = summary.style.set_caption('Variable: {}'.format(i))
    styled = styled.format({"Percent": "{:,.1f}%"})
    styled = styled.highlight_max(props='font-weight:bold; color:Black', axis=0)
    display(styled)


Unnamed: 0_level_0,Patients,Percent
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,140,56.5%
Male,108,43.5%


Unnamed: 0_level_0,Patients,Percent
ever_married,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,219,88.3%
No,29,11.7%


Unnamed: 0_level_0,Patients,Percent
work_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Private,148,59.7%
Self-employed,65,26.2%
Govt_job,33,13.3%
children,2,0.8%


Unnamed: 0_level_0,Patients,Percent
Residence_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Urban,135,54.4%
Rural,113,45.6%


Unnamed: 0_level_0,Patients,Percent
smoking_status,Unnamed: 1_level_1,Unnamed: 2_level_1
never smoked,89,35.9%
formerly smoked,70,28.2%
Unknown,47,19.0%
smokes,42,16.9%


In [23]:
#drop rows whose gender is 'Other'
other_gender_rows = df.index[df['gender'] == 'Other']
df.drop(index=other_gender_rows, inplace=True)
#drop rows whose work_type is 'Never_worked'
never_worked_rows = df.index[df['work_type'] == 'Never_worked']
df.drop(index=never_worked_rows, inplace=True)

#text columns that hold a small set of labels
cols = ['gender', 'ever_married', 'work_type','Residence_type','smoking_status']
#store them as pandas categoricals
df[cols] = df[cols].astype('category')
#age becomes an integer column (the fractional part of an age is discarded by the cast)
df['age'] = df['age'].astype(int)

#check the resulting column types
print(df.dtypes)


gender               category
age                     int64
hypertension            int64
heart_disease           int64
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                  int64
dtype: object


In [25]:
import numpy as np # linear algebra

In [27]:

#work on a copy so that df keeps its numeric 0/1 encoding
cdf = df.copy()

#column -> (value meaning "present", label when present, label otherwise)
readable_labels = {
    'hypertension': (1, 'Hypertension', 'No hypertension'),
    'heart_disease': (1, 'Heart disease', 'No heart disease'),
    'stroke': (1, 'Stroke', 'No stroke'),
    'ever_married': ('Yes', 'Ever married', 'Never'),
}
#replace each flag column by its human-readable label; any other value gets the "otherwise" label
for flag_column, (present_value, present_label, absent_label) in readable_labels.items():
    cdf[flag_column] = np.where(cdf[flag_column] == present_value, present_label, absent_label)


cdf.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67,No hypertension,Heart disease,Ever married,Private,Urban,228.69,36.6,formerly smoked,Stroke
1,Male,80,No hypertension,Heart disease,Ever married,Private,Rural,105.92,32.5,never smoked,Stroke
2,Female,49,No hypertension,No heart disease,Ever married,Private,Urban,171.23,34.4,smokes,Stroke
3,Female,79,Hypertension,No heart disease,Ever married,Self-employed,Rural,174.12,24.0,never smoked,Stroke
4,Male,81,No hypertension,No heart disease,Ever married,Private,Urban,186.21,29.0,formerly smoked,Stroke


In [29]:
#fraction of missing values in each column
df.isnull().mean()


gender               0.0
age                  0.0
hypertension         0.0
heart_disease        0.0
ever_married         0.0
work_type            0.0
Residence_type       0.0
avg_glucose_level    0.0
bmi                  0.0
smoking_status       0.0
stroke               0.0
dtype: float64

In [31]:
#fill null values
# Replace missing BMI values with the column mean.
# The result is assigned back to the column instead of calling fillna(..., inplace=True)
# on df['bmi']: that chained in-place form operates on an intermediate Series, raises a
# FutureWarning on pandas 2.x and leaves df unchanged when Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
# Fraction of missing values per column after filling.
df.isnull().mean()

gender               0.0
age                  0.0
hypertension         0.0
heart_disease        0.0
ever_married         0.0
work_type            0.0
Residence_type       0.0
avg_glucose_level    0.0
bmi                  0.0
smoking_status       0.0
stroke               0.0
dtype: float64

In [33]:
from sklearn import preprocessing #label encoding


In [36]:

# Two-valued categorical columns are label-encoded to integer codes.
columns_obj = ["gender", "ever_married" ,"Residence_type"]
encoding = preprocessing.LabelEncoder()
for col in columns_obj:
    # fit_transform refits the encoder for every column, so codes are per-column.
    df[col] = encoding.fit_transform(df[col])

#convert in 0 and 1 the rest of columns
# dtype=int is passed explicitly: pandas >= 2.0 defaults to boolean indicator columns,
# which would show True/False instead of the 0/1 values this step is meant to produce.
df = pd.get_dummies(df, dtype=int)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,67,0,1,1,1,228.69,36.6,1,0,1,0,0,0,1,0,0
1,1,80,0,1,1,0,105.92,32.5,1,0,1,0,0,0,0,1,0
2,0,49,0,0,1,1,171.23,34.4,1,0,1,0,0,0,0,0,1
3,0,79,1,0,1,0,174.12,24.0,1,0,0,1,0,0,0,1,0
4,1,81,0,0,1,1,186.21,29.0,1,0,1,0,0,0,1,0,0


In [39]:
from imblearn.over_sampling import SMOTE #oversample data

In [40]:
#separate target and features
y = df['stroke']
X = df.drop(columns=['stroke'])

#oversample the data with SMOTE (fixed seed for reproducibility)
# NOTE(review): resampling is applied to the full data set, before any train/validation/test
# split performed later in the notebook, so synthetic rows can land in the evaluation splits —
# consider splitting first and resampling only the training part.
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

#class proportions before and after resampling
before = df['stroke'].value_counts(normalize=True)
after = y.value_counts(normalize=True)
#row counts before and after resampling
print('Rows before smote:' + ' {}'.format(len(df)))
print('Rows after smote:' + ' {}'.format(len(X)))

Rows before smote: 4981
Rows after smote: 9466


In [41]:
# Split into training (50%), validation (25%) and test (25%) sets.
# Both splits are shuffled, stratified on the target and use the same seed.
split_options = dict(test_size=0.5, shuffle=True, random_state=42)

# First split: half of the rows for training, the other half held out.
X_train, X2, y_train, y2 = train_test_split(X, y, stratify=y, **split_options)

# Second split: the held-out half is divided evenly into validation and test.
X_val, X_test, y_val, y_test = train_test_split(X2, y2, stratify=y2, **split_options)

In [42]:
#round every feature split to two decimals
X_train = X_train.round(2)
X_test = X_test.round(2)
X_val = X_val.round(2)
#Z-score outliers
def Zscore_outlier(df):
    """Print the values of `df` whose absolute z-score exceeds 3, in ascending order.

    The z-score of a value is (value - np.mean(df)) / np.std(df).

    Parameters
    ----------
    df : iterable of numbers
        The values to inspect (called below with a single DataFrame column).

    Returns
    -------
    None
        The sorted list of flagged values is printed, not returned.
    """
    center = np.mean(df)
    spread = np.std(df)
    flagged = sorted(value for value in df if np.abs((value - center) / spread) > 3)
    print(flagged)
#list the z-score outliers of the two continuous measurements in the training split
for outlier_column in ('bmi', 'avg_glucose_level'):
    Zscore_outlier(X_train[outlier_column])

[47.1, 47.3, 47.3, 47.4, 47.43, 47.5, 47.8, 47.9, 48.0, 48.2, 48.48, 48.5, 48.5, 48.8, 48.8, 48.9]
[]


In [43]:
#change outliers
# Overwrite BMI values of 53.4 or more in the training split with 49
# (the validation and test splits are not modified here).
# NOTE(review): the describe() output earlier in the notebook reports a maximum BMI of 48.9
# and the z-score check lists nothing above 48.9, so this condition appears never to match —
# confirm whether 53.4 is the intended threshold.
X_train.loc[X_train.bmi >= 53.4, 'bmi'] = 49

In [48]:
from sklearn.ensemble import AdaBoostClassifier #ML model

In [52]:
from sklearn.model_selection import cross_val_score #cross validation

In [54]:
from sklearn.model_selection import KFold #cross validation by kfold

In [58]:
#Adaboost
# GridSearchCV is used below but no earlier cell imports it, so a fresh-kernel
# "Restart & Run All" stopped here with a NameError; import it where it is needed.
from sklearn.model_selection import GridSearchCV

ab_model = AdaBoostClassifier(random_state=42)
#parameters
# Grid: number of estimators, learning rates 0.97 .. 1.04 in steps of 0.01, and boosting algorithm.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases — confirm it is still
# accepted by the installed version.
ab_param = {'n_estimators': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 20],
            'learning_rate': [(0.97 + x / 100) for x in range(0, 8)],'algorithm': ['SAMME', 'SAMME.R']}

# 5-fold grid search on the training split, selecting by ROC AUC, using all CPU cores.
grid_ab = GridSearchCV(ab_model, ab_param, scoring = 'roc_auc' ,cv=5,n_jobs=-1)

search_ab = grid_ab.fit(X_train, y_train)

# Estimator refitted on the whole training split with the best parameter combination.
best_ab = search_ab.best_estimator_
#get score
# NOTE(review): cross_val_score clones best_ab and refits it on folds of the validation split,
# so these accuracies describe the tuned configuration retrained on validation data,
# not the model fitted on X_train.
cross_ab =  cross_val_score(
    best_ab,
    X_val, 
    y_val,
    n_jobs=-1,
    scoring='accuracy',
)
#dataframe 
# One row per cross-validation fold, tagged with the metric and model name.
ab_accu = pd.DataFrame(data={'Score': cross_ab, 'Metric': 'Accuracy', 'Model': 'AdaBoost'})

print('AdaBoost')
print('Best AUC: ' + str(round(grid_ab.best_score_,2)))
print('Mean validation set accuracy: ' + str(round(cross_ab.mean()*100, 2)) +"%")
print('Standard deviation: ' + str(round(cross_ab.std()*100, 2)))

AdaBoost
Best AUC: 0.97
Mean validation set accuracy: 89.1%
Standard deviation: 2.94
