In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("clean_data/stroke_dataset.csv")

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [3]:
print(df.shape)
print(df['work_type'].value_counts())

(4237, 11)
Private          2810
Self-employed     775
Govt_job          630
Never_worked       22
Name: work_type, dtype: int64


In [4]:
df.stroke.value_counts()

0    4029
1     208
Name: stroke, dtype: int64

In [5]:
#Condense working_types
df.replace(["Private", "Govt_job", "Never_worked"], "employer_employed", inplace=True)
print(df['work_type'].value_counts())

employer_employed    3462
Self-employed         775
Name: work_type, dtype: int64


In [6]:
df.head()
df['ever_married'].value_counts()

Yes    3204
No     1033
Name: ever_married, dtype: int64

In [7]:
# Change 'ever_married' into binary
df['ever_married'].replace("No", 0, inplace=True)
df['ever_married'].replace("Yes", 1, inplace=True)
df['ever_married'].value_counts()

1    3204
0    1033
Name: ever_married, dtype: int64

In [8]:
machine_df = pd.get_dummies(df, columns=["gender","Residence_type","hypertension", "heart_disease", "ever_married",
                                "work_type", "smoking_status"
                                ])
machine_df.head()

Unnamed: 0,age,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,Residence_type_Rural,Residence_type_Urban,hypertension_0,hypertension_1,heart_disease_0,heart_disease_1,ever_married_0,ever_married_1,work_type_Self-employed,work_type_employer_employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,228.69,36.6,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,0
1,80.0,105.92,32.5,1,0,1,1,0,1,0,0,1,0,1,0,1,0,0,1,0
2,49.0,171.23,34.4,1,1,0,0,1,1,0,1,0,0,1,0,1,0,0,0,1
3,79.0,174.12,24.0,1,1,0,1,0,0,1,1,0,0,1,1,0,0,0,1,0
4,81.0,186.21,29.0,1,0,1,0,1,1,0,1,0,0,1,0,1,0,1,0,0


In [9]:
# Import Machine Learning Dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Import SMOTE
from imblearn.over_sampling import SMOTE

In [10]:
#Creating FEATURES & LABEL
X = machine_df.drop(["stroke"], axis=1)
y = machine_df["stroke"]

## Training the data

In [15]:
# Train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15, random_state=50)

# Scaler for features
X_scaler = StandardScaler().fit(X_train)

# Scale Features
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit & score the logisticRegress Classifier
clf = LogisticRegression(solver='liblinear', penalty='l2', C=0.00005)
clf = clf.fit(X=X_train_scaled, y=y_train)
score = clf.score(X_test_scaled,y_test)

print(score)

0.9512578616352201


## Training with SMOTE

In [13]:
# Train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15, random_state=50)

#SMOTE to handle class imbalance
smote = SMOTE(sampling_strategy=0.2, k_neighbors=2)
X_train_SMOTE, y_train_SMOTE = smote.fit_resample(X_train, y_train.ravel())
y_train_SMOTE = y_train_SMOTE.reshape(-1,1)

# Create Scaler for features
X_SMOTE_scaler = StandardScaler().fit(X_train_SMOTE)

# Scale & transform Features
X_train_SMOTE_scaled = X_SMOTE_scaler.transform(X_train_SMOTE)
X_test_scaled = X_SMOTE_scaler.transform(X_test)

# Fit & score the logisticRegress Classifier
clf = LogisticRegression(solver='liblinear', penalty='l2', C=0.00005)
clf = clf.fit(X=X_train_SMOTE_scaled, y=y_train_SMOTE)
score = clf.score(X_test_scaled,y_test)
 
print(score)

0.889937106918239


  return f(*args, **kwargs)


In [14]:
clf.coef_

array([[ 0.02732137,  0.0158857 , -0.00044471, -0.01332862, -0.00630753,
        -0.01067092, -0.00765884, -0.02246517,  0.00172569, -0.02029854,
         0.00110895, -0.01318343,  0.00403532, -0.00604634, -0.01284248,
        -0.01012376, -0.00501943, -0.00986486, -0.00703905]])

In [16]:
# # CLASSIFIER - KNeighborsClassifier
# model = KNeighborsClassifier()
# model.fit(X_train, y_train.values.reshape(-1))
# y_predict = model.predict(X_test)

# model = KNeighborsClassifier.fit(X_train, y_train.values.ravel())

TypeError: fit() missing 1 required positional argument: 'y'

In [None]:
# # sklearn accuracy_score
# print(accuracy_score(y_test,y_predict))
# pd.crosstab(y_test,y_predict)

In [None]:
# print(1201 / (1201+3))
# print(67 / (67+1))

In [None]:
# SMOTE
# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train,y_train)

In [None]:
# from collections import Counter
# print("Before SMOTE: ", Counter(y_train))
# print("After SMOTE: ", Counter(y_train_smote))

In [None]:
# model.fit(X_train_smote, y_train_smote.values.reshape(-1))
# y_predict = model.predict(X_test)
# print(accuracy_score(y_test,y_predict))
# pd.crosstab(y_test,y_predict)

In [None]:
# print(965 / (965+239))
# print(38 / (38+30))

In [None]:
# # Classifier - LogisticRegression
# classifier = LogisticRegression(solver='liblinear', penalty='l2', C=.00005)
# classifier = classifier.fit(X=X_train_smote)