In [1]:
import pandas as pd

data = pd.read_csv('strokeData.csv')

#check for null 
#data.isnull().sum()


In [379]:
"""fill missing values for bmi using median as more accurate, as mean skewed
by extreme data and mode can be skewed by outliers
"""
data['bmi'].fillna(data['bmi'].median(), inplace=True)
data.head()

#count how many stroke and non stroke for class balance
strokes = data['stroke'].value_counts()
#print(strokes)

'''
as the class imbalance is huge SMOTE is needed to artificially balance it out
'''

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bmi'].fillna(data['bmi'].median(), inplace=True)


'\nas the class imbalance is huge SMOTE is needed to artificially balance it out\n'

In [380]:
#drop non essential data that would be used to predict stroke, use new data frame
dataFiltered = data.copy()
#stroke dropped as we are predicting and id is identifier and no related
dataFiltered.drop(['id'],axis=1,inplace=True)
'''encode the data as categorical data is not allowed in machine learning
hot encoding as no order or hierarchy
'''
dataFiltered = pd.get_dummies(dataFiltered, columns=['gender','ever_married','work_type','Residence_type','smoking_status'])

#correlation test to see what to drop
#dataFiltered.corr()


In [381]:
#target and input data
predictors = dataFiltered.drop(columns=['stroke'])
target = dataFiltered['stroke']

from sklearn.model_selection import train_test_split
from sklearn.linear_model  import LinearRegression
from imblearn.over_sampling import SMOTE

# Apply SMOTE before train-test split
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(predictors, target)


#split data into test and train data
X_train,X_test,y_train,y_test = train_test_split(,target,test_size=0.3,random_state=42)




In [382]:
#using hyper parameters - gridSearchcv for best resutls
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators' : [100,200,300,400,500],
    'max_depth': [None,10,20,30],
}


#fit model using gridsearch
#cv = cross validation, spliting data into training and validation sets multiple times in diff ways
#verbose - level of feedback
grid = GridSearchCV(estimator=rf, param_grid=param_grid,cv=5,scoring='accuracy', n_jobs=-1,verbose=1)

grid.fit(X_train_resampled,y_train_resampled)

print(grid.best_params_)
best_params = grid.best_params_


Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'max_depth': 20, 'n_estimators': 500}


In [383]:
best_score = grid.best_score_
print("Best Cross-Validation Accuracy:", best_score)



Best Cross-Validation Accuracy: 0.9735171545132548


In [384]:
#implementing a better forest model using those best parameters
optimizedRf = RandomForestClassifier(**best_params,class_weight='balanced',random_state=42)

#training model
optimizedRf.fit(X_train_resampled,y_train_resampled)

#prediciting model
y_prediction = optimizedRf.predict(X_test)

#using confusion matrix to inprove readability of model evaluation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion Matrix
cm = confusion_matrix(y_test, y_prediction)
print("Confusion Matrix:\n", cm)



Confusion Matrix:
 [[1435    9]
 [  87    2]]
