In [1]:
import re
import string
import scipy
import pickle
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import *
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from astropy.table import Table , Column
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('BrainStroke.csv') 

In [3]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [4]:
data.columns.name="index"
print("Train Dataset:")
print(data)

Train Dataset:
index  gender   age  hypertension  heart_disease ever_married      work_type  \
0        Male  67.0             0              1          Yes        Private   
1        Male  80.0             0              1          Yes        Private   
2      Female  49.0             0              0          Yes        Private   
3      Female  79.0             1              0          Yes  Self-employed   
4        Male  81.0             0              0          Yes        Private   
...       ...   ...           ...            ...          ...            ...   
4976     Male  41.0             0              0           No        Private   
4977     Male  40.0             0              0          Yes        Private   
4978   Female  45.0             1              0          Yes       Govt_job   
4979     Male  40.0             0              0          Yes        Private   
4980   Female  80.0             1              0          Yes        Private   

index Residence_type  av

In [5]:
print("Train Data Set Columns:")
trainDatadf=pd.DataFrame(data)
trainDataIndex=trainDatadf.columns
print(trainDataIndex)

print("\n")
print("Number of instances in Train Dataset")
print("Train Instances: %s"% (len(trainDatadf.index)))

Train Data Set Columns:
Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object', name='index')


Number of instances in Train Dataset
Train Instances: 4981


# pre process data

In [6]:
preprocessed_dataset = data.fillna('0')

print("Train dataset before pre-processing:")
print("=========================================\n")
print(data)

print("\n\n\nTrain dataset after pre-processing:")
print("=========================================\n")
print(preprocessed_dataset)

Train dataset before pre-processing:

index  gender   age  hypertension  heart_disease ever_married      work_type  \
0        Male  67.0             0              1          Yes        Private   
1        Male  80.0             0              1          Yes        Private   
2      Female  49.0             0              0          Yes        Private   
3      Female  79.0             1              0          Yes  Self-employed   
4        Male  81.0             0              0          Yes        Private   
...       ...   ...           ...            ...          ...            ...   
4976     Male  41.0             0              0           No        Private   
4977     Male  40.0             0              0          Yes        Private   
4978   Female  45.0             1              0          Yes       Govt_job   
4979     Male  40.0             0              0          Yes        Private   
4980   Female  80.0             1              0          Yes        Private   

i

# # Label Encoding for Train/Test Data

In [7]:
print("Gender attribute encoding in Train Dataset: \n")

preprocessed_dataset["encode_gender"]=LabelEncoder().fit_transform(preprocessed_dataset["gender"])
print(preprocessed_dataset[["gender","encode_gender"]])

Gender attribute encoding in Train Dataset: 

index  gender  encode_gender
0        Male              1
1        Male              1
2      Female              0
3      Female              0
4        Male              1
...       ...            ...
4976     Male              1
4977     Male              1
4978   Female              0
4979     Male              1
4980   Female              0

[4981 rows x 2 columns]


In [8]:
print("ever_married attribute encoding in train Dataset: \n")

preprocessed_dataset["encode_ever_married"]=LabelEncoder().fit_transform(preprocessed_dataset["ever_married"])
print(preprocessed_dataset[["ever_married","encode_ever_married"]])

ever_married attribute encoding in train Dataset: 

index ever_married  encode_ever_married
0              Yes                    1
1              Yes                    1
2              Yes                    1
3              Yes                    1
4              Yes                    1
...            ...                  ...
4976            No                    0
4977           Yes                    1
4978           Yes                    1
4979           Yes                    1
4980           Yes                    1

[4981 rows x 2 columns]


In [9]:
print("work_type attribute encoding in train Dataset: \n")

preprocessed_dataset["encode_work_type"]=LabelEncoder().fit_transform(preprocessed_dataset["work_type"])
print(preprocessed_dataset[["work_type","encode_work_type"]])

work_type attribute encoding in train Dataset: 

index      work_type  encode_work_type
0            Private                 1
1            Private                 1
2            Private                 1
3      Self-employed                 2
4            Private                 1
...              ...               ...
4976         Private                 1
4977         Private                 1
4978        Govt_job                 0
4979         Private                 1
4980         Private                 1

[4981 rows x 2 columns]


In [10]:
print("Residence_type attribute encoding in train Dataset: \n")

preprocessed_dataset["encode_Residence_type"]=LabelEncoder().fit_transform(preprocessed_dataset["Residence_type"])
print(preprocessed_dataset[["Residence_type","encode_Residence_type"]])

Residence_type attribute encoding in train Dataset: 

index Residence_type  encode_Residence_type
0              Urban                      1
1              Rural                      0
2              Urban                      1
3              Rural                      0
4              Urban                      1
...              ...                    ...
4976           Rural                      0
4977           Urban                      1
4978           Rural                      0
4979           Rural                      0
4980           Urban                      1

[4981 rows x 2 columns]


In [11]:
print("smoking_status attribute encoding in train Dataset: \n")

preprocessed_dataset["encode_smoking_status"]=LabelEncoder().fit_transform(preprocessed_dataset["smoking_status"])
print(preprocessed_dataset[["smoking_status","encode_smoking_status"]])

smoking_status attribute encoding in train Dataset: 

index   smoking_status  encode_smoking_status
0      formerly smoked                      1
1         never smoked                      2
2               smokes                      3
3         never smoked                      2
4      formerly smoked                      1
...                ...                    ...
4976   formerly smoked                      1
4977            smokes                      3
4978            smokes                      3
4979            smokes                      3
4980      never smoked                      2

[4981 rows x 2 columns]


In [12]:
print("Orignal Dataset: \n")
print(data)

print("\n")

print(" Dataset after Label Encoding: \n")

labelTestDatadf= preprocessed_dataset[["encode_gender","age","hypertension","heart_disease","encode_ever_married","encode_work_type","encode_Residence_type","avg_glucose_level","bmi","encode_smoking_status","stroke"]]
fDatadf= labelTestDatadf.rename(columns={"encode_gender":"gender", "encode_ever_married":"ever_married","encode_work_type":"work_type","encode_Residence_type":"Residence_type","encode_smoking_status":"smoking_status"})
print(fDatadf)

Orignal Dataset: 

index  gender   age  hypertension  heart_disease ever_married      work_type  \
0        Male  67.0             0              1          Yes        Private   
1        Male  80.0             0              1          Yes        Private   
2      Female  49.0             0              0          Yes        Private   
3      Female  79.0             1              0          Yes  Self-employed   
4        Male  81.0             0              0          Yes        Private   
...       ...   ...           ...            ...          ...            ...   
4976     Male  41.0             0              0           No        Private   
4977     Male  40.0             0              0          Yes        Private   
4978   Female  45.0             1              0          Yes       Govt_job   
4979     Male  40.0             0              0          Yes        Private   
4980   Female  80.0             1              0          Yes        Private   

index Residence_type

# Train ML Algorithms using Data

In [13]:
x=fDatadf.iloc[:, :-1].values
y=fDatadf.iloc[:, -1:].values

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Train ML Algorithms using Logistic Regression

In [32]:
logisticRegression=LogisticRegression()
logisticRegression.fit(x_train,y_train)
LR = logisticRegression.predict(x_test)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Evaluate ML Algorithms using Test Data


In [33]:
accuracy_score=accuracy_score(y_test, LR)
print("\n accuracy_score is: %s"%accuracy_score)
recall_score=recall_score(y_test, LR)
print("\n recall_score is: %s"%recall_score)
precision_score=precision_score(y_test, LR)
print("\n precision_score is: %s"%precision_score)
f1_score=f1_score(y_test, LR)
print("\n f1_score is: %s"%f1_score)

TypeError: 'numpy.float64' object is not callable

# Train ML Algorithms using Random Forest Classifier

In [26]:
randomForest=RandomForestClassifier()
randomForest.fit(x_train,y_train)
RF = randomForest.predict(x_test)

  randomForest.fit(x_train,y_train)


## Evaluate ML Algorithms using Test Data

In [27]:
accuracy_score=accuracy_score(y_test, RF)
print("\n accuracy_score is: %s"%accuracy_score)
recall_score=recall_score(y_test, RF)
print("\n recall_score is: %s"%recall_score)
precision_score=precision_score(y_test, RF)
print("\n precision_score is: %s"%precision_score)
f1_score=f1_score(y_test, RF)
print("\n f1_score is: %s"%f1_score)


 accuracy_score is: 0.9468405215646941


TypeError: 'numpy.float64' object is not callable

# Train ML Algorithms using linearSvc


In [28]:
linearSvc=LinearSVC()
#print(randomForest)
linearSvc.fit(x_train,y_train)
LS = linearSvc.predict(x_test)

  return f(*args, **kwargs)


## Evaluate ML Algorithms using Test Data

In [29]:
accuracy_score=accuracy_score(y_test, LS)
print("\n accuracy_score is: %s"%accuracy_score)
recall_score=recall_score(y_test, LS)
print("\n recall_score is: %s"%recall_score)
precision_score=precision_score(y_test, LS)
print("\n precision_score is: %s"%precision_score)
f1_score=f1_score(y_test, LS)
print("\n f1_score is: %s"%f1_score)

TypeError: 'numpy.float64' object is not callable

# Train ML Algorithms using BernoulliNB


In [30]:
bernoulliNB=BernoulliNB()
#print(randomForest)
bernoulliNB.fit(x_train,y_train)
NB = bernoulliNB.predict(x_test)


  return f(*args, **kwargs)


 ## Evaluate ML Algorithms using Test Data

In [31]:
accuracy_score=accuracy_score(y_test, NB)
print("\n accuracy_score is: %s"%accuracy_score)
recall_score=recall_score(y_test, NB)
print("\n recall_score is: %s"%recall_score)
precision_score=precision_score(y_test, NB)
print("\n precision_score is: %s"%precision_score)
f1_score=f1_score(y_test, NB)
print("\n f1_score is: %s"%f1_score)

TypeError: 'numpy.float64' object is not callable

# Save the Trained Model as Pickle File

In [None]:
filename='bestModel.sav'
pickle.dump(randomForest,open(filename,'wb'))

# Load the Trained Model


In [None]:
loadBM=pickle.load(open(filename,'rb'))

# Take Input from User

In [None]:
Gender=input("Please enter your gender:")
age=input("Please enter your age here :")
hypertension=input("Pleas eenter your hypertension(0/1):")
heart_disease=input("Do you have heart disease (1/0):")
ever_married=input("Do you ever married (Yes/No):")
work_type=input("Please enter your work type here (Private/Self-employed/Govt_job):")
Residence_type=input("Please enter your Residence type (Urban/Rural):")
avg_glucose_level=input("Please enter your avg glucose level:")
bmi=input("Please enter your bmi(20-50):")
smoking_status=input("Do you smoke (formerly_smoked/never smoked/smokes/Unknown):")

# Convert User Input into Feature Vector


In [None]:
print("user input in actual DataFrame: \n")
newdf=pd.DataFrame({"Gender":Gender,"Age":age,"Hypertension":hypertension,"Heart_disease":heart_disease,"Ever_married":ever_married,"Work_type":work_type,"Residence_type":Residence_type,"Avg_glucose_level":avg_glucose_level,"Bmi":bmi,"Smoking_status":smoking_status},index=[0])
print(newdf)

In [None]:
print("User input in Encoded DataFrame: \n")

if  Gender == "male" or Gender == "Male":
    Gender=1
elif  Gender=="female" or Gender=="Female":
      Gender=0

if  ever_married == "no" or ever_married == "No":
    ever_married=0
elif  ever_married=="yes" or ever_married=="Yes":
      ever_married=1

    
if work_type=="private" or work_type=="Private":
    work_type=1
elif work_type=="self-employed" or work_type=="Self-employed":
    work_type=2
elif work_type=="govt_job" or work_type=="Govt_job":
    work_type=0
    
if Residence_type=="urban" or Residence_type=="Urban":
    Residence_type=1
elif Residence_type=="rural" or Residence_type=="Rural":
    Residence_type=0

if smoking_status=="formerly_smoked" or smoking_status=="Formerly_smoked":
    smoking_status=1
elif smoking_status=="never_smoked" or smoking_status=="never_smoked":
    smoking_status=2
elif smoking_status=="smokes" or smoking_status=="smokes":
    smoking_status=3
elif smoking_status=="Unknown" or smoking_status=="Unknown":
    smoking_status=0
encodeNewdf=pd.DataFrame({"Gender":Gender,"Age":age,"Hypertension":hypertension,"Heart_disease":heart_disease,"Ever_married":ever_married,"Work_type":work_type,"Residence_type":Residence_type,"Avg_glucose_level":avg_glucose_level,"Bmi":bmi,"Smoking_status":smoking_status},index=[0])

print(encodeNewdf)

In [None]:
print("User input in actual DataFrame: \n")
print(newdf)
print("\n \n \nUser input in encoded DataFrame \n")
print(encodeNewdf)

# Apply Trained Model on Feature Vector of Unseen Data and Output Prediction to User

In [None]:
outputPredictionBM= loadBM.predict(encodeNewdf)

if outputPredictionBM==0:
    stroke= "Person has no stroke"
    
elif outputPredictionBM==1:
    stroke="Person has stroke"

print("\n Prediction: %s"%stroke)