In [18]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

In [19]:
# Load the data into a Pandas DataFrame
stroke_data = pd.read_csv("Resources/brain_stroke_data.csv")

# Display sample data
stroke_data.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
5,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
6,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
7,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
8,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
9,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


In [20]:
#Get the datatypes
stroke_data.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [21]:
# Generate summary statistics
stroke_data.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0


Variables that are measurable show great disparity if it can be seen in the graph below

In [22]:
# Plot your data to see what's in your DataFrame
import hvplot.pandas
stroke_data.hvplot.scatter(
    width = 800,
    height = 400,
    rot = 90
)

---

### Prepare the Data

In [23]:
#Get columns names
#df_stroke_data.set_index('stroke',inplace=True)
stroke_data.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [24]:
stroke_data['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children'], dtype=object)

In [25]:
stroke_data['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [26]:
stroke_data['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [27]:
#Create all dummies...
stroke_test=pd.get_dummies(stroke_data, columns=['gender','ever_married','Residence_type',
                                                 'work_type','Residence_type','smoking_status'])
stroke_test

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0
1,80.0,0,1,105.92,32.5,1,0,1,0,1,...,0,1,0,0,1,0,0,0,1,0
2,49.0,0,0,171.23,34.4,1,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
3,79.0,1,0,174.12,24.0,1,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
4,81.0,0,0,186.21,29.0,1,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,41.0,0,0,70.15,29.8,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
4977,40.0,0,0,191.15,31.1,0,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
4978,45.0,1,0,95.02,31.8,0,1,0,0,1,...,1,0,0,0,1,0,0,0,0,1
4979,40.0,0,0,83.94,30.0,0,0,1,0,1,...,0,1,0,0,1,0,0,0,0,1


In [28]:
# Get the correlation matrix taking into account the dummy variables (no continuos, no normal)
stroke_test.corr('spearman')

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
age,1.0,0.282577,0.271159,0.142399,0.37688,0.251262,0.018037,-0.018037,-0.661365,0.661365,...,0.120193,0.082855,0.328031,-0.590862,-0.016859,0.016859,-0.354612,0.235161,0.107486,0.059699
hypertension,0.282577,1.0,0.111974,0.110959,0.16885,0.131965,-0.021485,0.021485,-0.164534,0.164534,...,0.016378,-0.004177,0.110468,-0.128924,0.004755,-0.004755,-0.139901,0.056797,0.065267,0.030749
heart_disease,0.271159,0.111974,1.0,0.109575,0.076483,0.13461,-0.086476,0.086476,-0.114765,0.114765,...,0.001166,-0.0016,0.087474,-0.092974,-0.002125,0.002125,-0.06671,0.067541,-0.022727,0.044011
avg_glucose_level,0.142399,0.110959,0.109575,1.0,0.121999,0.082897,-0.049636,0.049636,-0.094302,0.094302,...,-0.001363,0.016052,0.028764,-0.052839,0.009397,-0.009397,-0.05237,0.033597,0.013637,0.012976
bmi,0.37688,0.16885,0.076483,0.121999,1.0,0.062258,-0.011811,0.011811,-0.386216,0.386216,...,0.088791,0.204886,0.09523,-0.486002,-0.00772,0.00772,-0.294465,0.126766,0.10398,0.101608
stroke,0.251262,0.131965,0.13461,0.082897,0.062258,1.0,-0.00887,0.00887,-0.108398,0.108398,...,0.002574,0.010459,0.062643,-0.085075,-0.016494,0.016494,-0.055699,0.06532,-0.004806,0.008561
gender_Female,0.018037,-0.021485,-0.086476,-0.049636,-0.011811,-0.00887,1.0,-1.0,-0.028971,0.028971,...,0.017176,0.028706,0.029635,-0.090275,-0.004301,0.004301,-0.059858,-0.045109,0.102387,-0.013349
gender_Male,-0.018037,0.021485,0.086476,0.049636,0.011811,0.00887,-1.0,1.0,0.028971,-0.028971,...,-0.017176,-0.028706,-0.029635,0.090275,0.004301,-0.004301,0.059858,0.045109,-0.102387,0.013349
ever_married_No,-0.661365,-0.164534,-0.114765,-0.094302,-0.386216,-0.108398,-0.028971,0.028971,1.0,-1.0,...,-0.133655,-0.146139,-0.191668,0.548851,0.008191,-0.008191,0.335689,-0.172039,-0.10412,-0.106234
ever_married_Yes,0.661365,0.164534,0.114765,0.094302,0.386216,0.108398,0.028971,-0.028971,-1.0,1.0,...,0.133655,0.146139,0.191668,-0.548851,-0.008191,0.008191,-0.335689,0.172039,0.10412,0.106234


In [29]:
#In this dataset gender Residence_type and ever_married columns are binary. In order to avoid multicollineallity
# we delete one of the binary columns
stroke_df=pd.get_dummies(stroke_data, columns=['gender','ever_married','Residence_type'], drop_first=True)


In [30]:
stroke_df=pd.get_dummies(stroke_df,columns=['work_type','smoking_status'])

In [31]:
stroke_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,ever_married_Yes,Residence_type_Urban,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,1,1,1,0,1,0,0,0,1,0,0
1,80.0,0,1,105.92,32.5,1,1,1,0,0,1,0,0,0,0,1,0
2,49.0,0,0,171.23,34.4,1,0,1,1,0,1,0,0,0,0,0,1
3,79.0,1,0,174.12,24.0,1,0,1,0,0,0,1,0,0,0,1,0
4,81.0,0,0,186.21,29.0,1,1,1,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,41.0,0,0,70.15,29.8,0,1,0,0,0,1,0,0,0,1,0,0
4977,40.0,0,0,191.15,31.1,0,1,1,1,0,1,0,0,0,0,0,1
4978,45.0,1,0,95.02,31.8,0,0,1,0,1,0,0,0,0,0,0,1
4979,40.0,0,0,83.94,30.0,0,1,1,0,0,1,0,0,0,0,0,1


In [32]:
# Get the correlation matrix taking into account the dummy variables
stroke_df.corr('spearman')

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,ever_married_Yes,Residence_type_Urban,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
age,1.0,0.282577,0.271159,0.142399,0.37688,0.251262,-0.018037,0.661365,0.016859,0.120193,0.082855,0.328031,-0.590862,-0.354612,0.235161,0.107486,0.059699
hypertension,0.282577,1.0,0.111974,0.110959,0.16885,0.131965,0.021485,0.164534,-0.004755,0.016378,-0.004177,0.110468,-0.128924,-0.139901,0.056797,0.065267,0.030749
heart_disease,0.271159,0.111974,1.0,0.109575,0.076483,0.13461,0.086476,0.114765,0.002125,0.001166,-0.0016,0.087474,-0.092974,-0.06671,0.067541,-0.022727,0.044011
avg_glucose_level,0.142399,0.110959,0.109575,1.0,0.121999,0.082897,0.049636,0.094302,-0.009397,-0.001363,0.016052,0.028764,-0.052839,-0.05237,0.033597,0.013637,0.012976
bmi,0.37688,0.16885,0.076483,0.121999,1.0,0.062258,0.011811,0.386216,0.00772,0.088791,0.204886,0.09523,-0.486002,-0.294465,0.126766,0.10398,0.101608
stroke,0.251262,0.131965,0.13461,0.082897,0.062258,1.0,0.00887,0.108398,0.016494,0.002574,0.010459,0.062643,-0.085075,-0.055699,0.06532,-0.004806,0.008561
gender_Male,-0.018037,0.021485,0.086476,0.049636,0.011811,0.00887,1.0,-0.028971,-0.004301,-0.017176,-0.028706,-0.029635,0.090275,0.059858,0.045109,-0.102387,0.013349
ever_married_Yes,0.661365,0.164534,0.114765,0.094302,0.386216,0.108398,-0.028971,1.0,0.008191,0.133655,0.146139,0.191668,-0.548851,-0.335689,0.172039,0.10412,0.106234
Residence_type_Urban,0.016859,-0.004755,0.002125,-0.009397,0.00772,0.016494,-0.004301,0.008191,1.0,0.013925,-0.016104,0.013427,-0.004825,-0.003937,0.009825,-0.026892,0.03049
work_type_Govt_job,0.120193,0.016378,0.001166,-0.001363,0.088791,0.002574,-0.017176,0.133655,0.013925,1.0,-0.447467,-0.169061,-0.152306,-0.096437,0.029833,0.045091,0.030804


In [33]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data (not the dummies) from the CSV file
stroke_columns = ['age','avg_glucose_level', 'bmi']
stroke_data_scaled = StandardScaler().fit_transform(stroke_df[stroke_columns])

In [34]:
#Dataframe of the dummies
mktdata_dummies=stroke_df.drop(columns=['age','avg_glucose_level', 'bmi'])
mktdata_dummies.head()

Unnamed: 0,hypertension,heart_disease,stroke,gender_Male,ever_married_Yes,Residence_type_Urban,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,1,1,1,1,1,0,1,0,0,0,1,0,0
1,0,1,1,1,1,0,0,1,0,0,0,0,1,0
2,0,0,1,0,1,1,0,1,0,0,0,0,0,1
3,1,0,1,0,1,0,0,0,1,0,0,0,1,0
4,0,0,1,1,1,1,0,1,0,0,0,1,0,0


In [35]:
# Create a DataFrame with the scaled data
columns='age','avg_glucose_level', 'bmi'
mktdata_scaled_df = pd.DataFrame(stroke_data_scaled,columns=stroke_columns)
# Display sample data
mktdata_scaled_df.head()

Unnamed: 0,age,avg_glucose_level,bmi
0,1.040584,2.723411,1.193238
1,1.61427,-0.000523,0.58939
2,0.24625,1.448529,0.869222
3,1.570141,1.51265,-0.662492
4,1.6584,1.780895,0.073909


In [36]:
#Join the two parts of the new database
analysis_df=pd.merge(mktdata_scaled_df,mktdata_dummies, left_index=True,right_index=True) 
analysis_df

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke,gender_Male,ever_married_Yes,Residence_type_Urban,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.040584,2.723411,1.193238,0,1,1,1,1,1,0,1,0,0,0,1,0,0
1,1.614270,-0.000523,0.589390,0,1,1,1,1,0,0,1,0,0,0,0,1,0
2,0.246250,1.448529,0.869222,0,0,1,0,1,1,0,1,0,0,0,0,0,1
3,1.570141,1.512650,-0.662492,1,0,1,0,1,0,0,0,1,0,0,0,1,0
4,1.658400,1.780895,0.073909,0,0,1,1,1,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,-0.106788,-0.794162,0.191733,0,0,0,1,0,0,0,1,0,0,0,1,0,0
4977,-0.150917,1.890500,0.383197,0,0,0,1,1,1,0,1,0,0,0,0,0,1
4978,0.069731,-0.242364,0.486294,1,0,0,0,1,0,1,0,0,0,0,0,0,1
4979,-0.150917,-0.488199,0.221189,0,0,0,1,1,0,0,1,0,0,0,0,0,1


In [37]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = analysis_df["stroke"]

# Separate the X variable, the features
X = analysis_df.copy()
X.drop("stroke", axis=1, inplace=True)

In [38]:
#Review id the data is balanced
y.value_counts()

0    4733
1     248
Name: stroke, dtype: int64

In [39]:
#Split the data into training and testing
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function and astratify as the data is unbalanced
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, shuffle = True) 

In [42]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='liblinear', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)
# Make a prediction using the testing data
predictions = classifier.predict(X_test)

In [43]:
# Print the balanced_accuracy score of the model
#from sklearn.metrics import accuracy_score

balanced_score = balanced_accuracy_score(y_test, predictions)

# Display the accuracy score for the test dataset.
print(f"The Testing score is :  {balanced_score}")

The Testing score is :  0.5073529411764706


In [44]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
# Print the classification report for the model
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score :  {round(balanced_score,4)}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1178,0
Actual 1,67,1


Accuracy Score :  0.5074
Classification Report
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1178
           1       1.00      0.01      0.03        68

    accuracy                           0.95      1246
   macro avg       0.97      0.51      0.50      1246
weighted avg       0.95      0.95      0.92      1246



# Oversample in order to avoid imbalances of the data

In [48]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_res, y_res = ros.fit_resample(X, y)

In [49]:
# Count the distinct values of the resampled labels data
y_res.value_counts()

1    4733
0    4733
Name: stroke, dtype: int64

In [50]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier_res = LogisticRegression(solver='lbfgs', max_iter = 300, random_state=1)

# Fit the model using the resampled training data
classifier_res.fit(X_res, y_res)

# Make a prediction using the testing data
prediction_res = classifier_res.predict(X_test)

In [51]:
# Print the balanced_accuracy score of the model 
balanced_score = balanced_accuracy_score(y_test, prediction_res)

# Display the accuracy score for the test dataset.
print(f"The Testing score is :  {balanced_score}")

The Testing score is :  0.7823079996005193


In [53]:
# Generate a confusion matrix for the model
cm1 = confusion_matrix(y_test, prediction_res)
cm_df1 = pd.DataFrame(
    cm1, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
# Print the classification report for the model
print("Confusion Matrix")
display(cm_df1)
print(f"Accuracy Score :  {round(balanced_score,4)}")
print("Classification Report")
print(classification_report(y_test, prediction_res))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,873,305
Actual 1,12,56


Accuracy Score :  0.7823
Classification Report
              precision    recall  f1-score   support

           0       0.99      0.74      0.85      1178
           1       0.16      0.82      0.26        68

    accuracy                           0.75      1246
   macro avg       0.57      0.78      0.55      1246
weighted avg       0.94      0.75      0.81      1246

