In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('stroke-dataset.csv')

In [3]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [6]:
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [7]:
df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [8]:
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [9]:
df = df.dropna(subset=['bmi'])

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4909 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4909 non-null   int64  
 1   gender             4909 non-null   object 
 2   age                4909 non-null   float64
 3   hypertension       4909 non-null   int64  
 4   heart_disease      4909 non-null   int64  
 5   ever_married       4909 non-null   object 
 6   work_type          4909 non-null   object 
 7   Residence_type     4909 non-null   object 
 8   avg_glucose_level  4909 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     4909 non-null   object 
 11  stroke             4909 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 498.6+ KB


In [11]:
df['smoking_status'].value_counts().get('Unknown', 0)

1483

In [12]:
mode_value = df['smoking_status'].mode()[0]  # Get the most frequent category
df.loc[:, 'smoking_status'] = df['smoking_status'].replace('Unknown', mode_value)
print("Replaced Unknown with:", mode_value)

Replaced Unknown with: never smoked


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'smoking_status'] = df['smoking_status'].replace('Unknown', mode_value)


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
input_cols = list(train_df.columns)[1:-1]
target_col = 'stroke'

In [16]:
import numpy as np

In [17]:
numeric_cols = ['age','hypertension','heart_disease','avg_glucose_level','bmi']
categorical_cols = train_df.select_dtypes('object').columns.tolist()

In [18]:
categorical_cols

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
enc_brain = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
enc_brain.fit(train_df[categorical_cols])

In [21]:
train_one_hot = enc_brain.transform(train_df[categorical_cols])
val_one_hot = enc_brain.transform(val_df[categorical_cols])

In [22]:
encoded_cols = list(enc_brain.get_feature_names_out(categorical_cols))

In [23]:
train_df[encoded_cols] = train_one_hot
val_df[encoded_cols] = val_one_hot

In [24]:
X_train = train_df[numeric_cols + encoded_cols]
train_targets = train_df[target_col]

X_val = val_df[numeric_cols + encoded_cols]
val_targets = val_df[target_col]

In [25]:
X_train

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
3565,40.0,0,0,65.77,31.2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
898,59.0,0,0,81.64,32.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2707,57.0,0,0,217.40,36.6,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4198,81.0,0,0,71.18,23.9,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2746,65.0,0,0,95.88,28.5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4613,19.0,0,0,89.30,22.1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
511,51.0,0,0,82.93,29.7,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3247,53.0,0,0,90.65,22.1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3946,11.0,0,0,93.51,20.8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_resampled, train_targets_resampled = smote.fit_resample(X_train, train_targets)

# Initialize and train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 1, 1: 10})  # Increase weight for class 1
rf_model.fit(X_train_resampled, train_targets_resampled)


In [28]:
train_preds = rf_model.predict(X_train)

In [29]:
from sklearn.metrics import accuracy_score

accuracy_score(train_targets, train_preds)

1.0

In [30]:
val_preds = rf_model.predict(X_val)

In [31]:
accuracy_score(val_targets, val_preds)

0.9389002036659878

In [32]:
import pickle

In [33]:
with open('model_brain1.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)
    
# Save the encoder
with open('encoder_brain.pkl', 'wb') as encoder_file:
    pickle.dump(enc_brain, encoder_file)

In [40]:
sample_data = {
    'gender': ['Female'],
    'age': [71.0],
    'hypertension': [0],
    'heart_disease': [1],
    'ever_married': ['yes'],
    'work_type': ['Private'],
    'Residence_type': ['Urban'],
    'avg_glucose_level': [214.23],
    'bmi': [28.4],
    'Oldpeak': ['never smoked'],
    'smoking_status': [1]
}


sample_df = pd.DataFrame(sample_data)

# Step 1: Preprocess the input data

# Define the columns for numeric and categorical values
numeric_cols = ['age','hypertension','heart_disease','avg_glucose_level','bmi']
categorical_cols = train_df.select_dtypes('object').columns.tolist()

In [41]:
sample_df[categorical_cols] = sample_df[categorical_cols].fillna("unknown")

# Ensure all values are strings
sample_df[categorical_cols] = sample_df[categorical_cols].astype(str)

In [42]:
# Step 2: Encode categorical features using OneHotEncoder
encoded_categorical = enc_brain.transform(sample_df[categorical_cols])

# Get encoded feature names
encoded_feature_names = list(enc_brain.get_feature_names_out(categorical_cols))
# Convert encoded features into a DataFrame with column names
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoded_feature_names, index=sample_df.index)


# Combine numeric and encoded categorical columns into a final DataFrame
sample_preprocessed_df = pd.concat([sample_df[numeric_cols], encoded_categorical_df], axis=1)


In [43]:
probabilities = rf_model.predict_proba(sample_preprocessed_df)
    
risk_probability = probabilities[0][1] * 100  # Convert to percentage

output = f"{risk_probability:.2f}% chance of being at risk of brain disease"

print(output)

43.00% chance of being at risk of brain disease


In [44]:
prediction = rf_model.predict(sample_preprocessed_df)
output = "at risk of brain disease" if prediction[0] == 1 else "not at risk of brain disease"
output

'not at risk of brain disease'

In [45]:
print(train_targets.value_counts())

0    3771
1     156
Name: stroke, dtype: int64
