In [2]:
# importing needed libraries

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# reading CSV file to DataFrame
df = pd.read_csv('brain_stroke.csv')

---
## Desriptive analysis on the dataset

In [4]:
# first 5 rows of dataset
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [5]:
# last 5 rows of dataset
df.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0
4980,Female,80.0,1,0,Yes,Private,Urban,83.75,29.1,never smoked,0


In [6]:
# info about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [7]:
# size of dataset
df.size

54791

In [8]:
# shape of dataset
df.shape

(4981, 11)

In [9]:
# short statistic info about dataset
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0


In [10]:
# data types of dataset
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [11]:
# count of NA values
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

----
## Simple Analysis

In [12]:
df.iloc[4]

gender                          Male
age                             81.0
hypertension                       0
heart_disease                      0
ever_married                     Yes
work_type                    Private
Residence_type                 Urban
avg_glucose_level             186.21
bmi                             29.0
smoking_status       formerly smoked
stroke                             1
Name: 4, dtype: object

In [13]:
# e.g. 5 rows of obese people according to bmi
df.loc[df.bmi>24.9].head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
5,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1


---
## Data Manipulation

In [14]:
# adding new column: categorizing bmi

def bmi_categ(bmi):
    if bmi <= 18.5:
        return 'Underweight'
    elif 18.5 <= bmi <= 24.9:
        return 'Normal'
    elif 25 <= bmi <= 29.9:
        return 'Overweight'
    else:
        return 'Obesity'
    
df['bmi_category'] = df['bmi'].apply(bmi_categ)

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,bmi_category
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,Obesity
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,Obesity
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,Obesity
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,Normal
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1,Overweight


In [15]:
# removing ages less than 1
df = df[df['age'] >= 1]
df.reset_index(drop=True, inplace = True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,bmi_category
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,Obesity
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,Obesity
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,Obesity
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,Normal
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1,Overweight


----
## Group Statistics according to: `age`, `hypertension`, `avg_glucose_level`, `heart_disease`, `bmi` and `stroke`

In [16]:
df.groupby(by='gender')['age'].mean()

gender
Female    44.152863
Male      43.243320
Name: age, dtype: float64

In [17]:
df.groupby(by='gender')['hypertension'].mean()

gender
Female    0.091286
Male      0.104980
Name: hypertension, dtype: float64

In [18]:
df.groupby(by='gender')['avg_glucose_level'].mean()

gender
Female    103.895014
Male      109.048501
Name: avg_glucose_level, dtype: float64

In [19]:
df.groupby(by='gender')['heart_disease'].mean()

gender
Female    0.038728
Male      0.079590
Name: heart_disease, dtype: float64

In [20]:
df.groupby(by='gender')['bmi'].mean()

gender
Female    28.623409
Male      28.523486
Name: bmi, dtype: float64

In [21]:
df.groupby(by='gender')['stroke'].mean()

gender
Female    0.048409
Male      0.052734
Name: stroke, dtype: float64

In [22]:
# Or with a line of code
df.groupby(by =['gender'])[['age', 'hypertension', 
                            'avg_glucose_level', 'heart_disease', 'bmi', 'stroke']].agg(func=['mean', 'max', 'min'])

Unnamed: 0_level_0,age,age,age,hypertension,hypertension,hypertension,avg_glucose_level,avg_glucose_level,avg_glucose_level,heart_disease,heart_disease,heart_disease,bmi,bmi,bmi,stroke,stroke,stroke
Unnamed: 0_level_1,mean,max,min,mean,max,min,mean,max,min,mean,max,min,mean,max,min,mean,max,min
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Female,44.152863,82.0,1.0,0.091286,1,0,103.895014,267.76,55.12,0.038728,1,0,28.623409,48.9,14.0,0.048409,1,0
Male,43.24332,82.0,1.0,0.10498,1,0,109.048501,271.74,55.22,0.07959,1,0,28.523486,48.8,14.2,0.052734,1,0


---
# Model Training - for fun 🤠

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the data
# Assuming df is your DataFrame

# Define the feature columns and the target
X = df.drop('stroke', axis=1)
y = df['stroke']

# Preprocessing for numerical and categorical data
numeric_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# We create the preprocessing pipelines for both numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))])

# We use ColumnTransformer to apply the transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Create the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9433198380566802
Confusion Matrix:
 [[932   0]
 [ 56   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       932
           1       0.00      0.00      0.00        56

    accuracy                           0.94       988
   macro avg       0.47      0.50      0.49       988
weighted avg       0.89      0.94      0.92       988



In [32]:
# Someone's data

new_data = pd.DataFrame({
    'gender': ['Male'],
    'age': [19],
    'hypertension': [0],
    'heart_disease': [0],
    'ever_married': ['No'],
    'work_type': ['Private'],
    'Residence_type': ['Urban'],
    'avg_glucose_level': [23],
    'bmi': [23.5],
    'smoking_status': ['never smoked']
})

# Predict the probability of stroke
new_prediction = model.predict(new_data)
new_prediction_proba = model.predict_proba(new_data)

print("Prediction (0: No Stroke, 1: Stroke):", new_prediction[0])
print("Prediction Probability (No Stroke, Stroke):", new_prediction_proba[0])


Prediction (0: No Stroke, 1: Stroke): 0
Prediction Probability (No Stroke, Stroke): [0.9972654 0.0027346]
