<a href="https://colab.research.google.com/github/BijayChodhoury/Machine-Learning/blob/main/Brain%20Stroke%20Prediction/Brain_Stroke_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Loading dataset
brain = pd.read_csv('/content/brain_stroke.csv')

# **Attribute Information**

1) gender: "Male", "Female" or "Other"

2) age: age of the patient

3) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

4) heart disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease 5) ever-married: "No" or "Yes"

6) worktype: "children", "Govtjov", "Neverworked", "Private" or "Self-employed" 

7) Residencetype: "Rural" or "Urban"

8) avgglucoselevel: average glucose level in blood

9) bmi: body mass index

10) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

11) stroke: 1 if the patient had a stroke or 0 if not

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [3]:
brain.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [4]:
brain.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [5]:
brain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [6]:
brain.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0


## Printing value counts of some of the categorical columns

In [7]:
print(brain['gender'].value_counts())
print(brain['ever_married'].value_counts())
print(brain['work_type'].value_counts())
print(brain['Residence_type'].value_counts())
print(brain['smoking_status'].value_counts())

Female    2907
Male      2074
Name: gender, dtype: int64
Yes    3280
No     1701
Name: ever_married, dtype: int64
Private          2860
Self-employed     804
children          673
Govt_job          644
Name: work_type, dtype: int64
Urban    2532
Rural    2449
Name: Residence_type, dtype: int64
never smoked       1838
Unknown            1500
formerly smoked     867
smokes              776
Name: smoking_status, dtype: int64


## Converting Categorical data to numerical data

In [8]:
brain.replace({
    
    'gender':{'Male':1, 'Female':0},
    'ever_married':{'Yes':1, 'No':0},
    'work_type':{'Private':0, 'Self-employed':1, 'children':2, 'Govt_job':3},
    'Residence_type':{'Urban':1, 'Rural':0},
    'smoking_status':{'never smoked':0, 'smokes':1, 'formerly smoked':2, 'Unknown':3}
    
}, inplace=True)

In [9]:
brain.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,0,1,228.69,36.6,2,1
1,1,80.0,0,1,1,0,0,105.92,32.5,0,1
2,0,49.0,0,0,1,0,1,171.23,34.4,1,1
3,0,79.0,1,0,1,1,0,174.12,24.0,0,1
4,1,81.0,0,0,1,0,1,186.21,29.0,2,1


## Separating Data and Labels

In [10]:
x = brain.drop(['stroke'], axis=1)
y = brain['stroke']
print(x, y)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.0             0              1             1          0   
1          1  80.0             0              1             1          0   
2          0  49.0             0              0             1          0   
3          0  79.0             1              0             1          1   
4          1  81.0             0              0             1          0   
...      ...   ...           ...            ...           ...        ...   
4976       1  41.0             0              0             0          0   
4977       1  40.0             0              0             1          0   
4978       0  45.0             1              0             1          3   
4979       1  40.0             0              0             1          0   
4980       0  80.0             1              0             1          0   

      Residence_type  avg_glucose_level   bmi  smoking_status  
0                  1   

## Standardization of the data

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

## Separating Training and Test data

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, stratify=y, test_size=0.2)

## Training Model (RandomForest)

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Model training
modelRF = RandomForestClassifier()
modelRF.fit(x_train, y_train)

# Model evaluation
print("Accuracy on test data: ", accuracy_score(modelRF.predict(x_test), y_test))

Accuracy on test data:  0.950852557673019


## Making a Predictive Sytem

In [16]:
def Brain_Stroke_Predict(*patient_data):
  brain_as_array = np.asarray(patient_data)
  brain_as_reshaped = brain_as_array.reshape(1, -1)

  # Converting Categorical data to Numerical data
  brain_as_reshaped[brain_as_reshaped == 'Male'] = 1
  brain_as_reshaped[brain_as_reshaped == 'Female'] = 0

  brain_as_reshaped[brain_as_reshaped == 'Yes'] = 1
  brain_as_reshaped[brain_as_reshaped == 'No'] = 0

  brain_as_reshaped[brain_as_reshaped == 'Urban'] = 1
  brain_as_reshaped[brain_as_reshaped == 'Rural'] = 0

  brain_as_reshaped[brain_as_reshaped == 'Self-employed'] = 1
  brain_as_reshaped[brain_as_reshaped == 'Private'] = 0
  brain_as_reshaped[brain_as_reshaped == 'children'] = 2
  brain_as_reshaped[brain_as_reshaped == 'Govt_job'] = 3

  brain_as_reshaped[brain_as_reshaped == 'smokes'] = 1
  brain_as_reshaped[brain_as_reshaped == 'never smoked'] = 0
  brain_as_reshaped[brain_as_reshaped == 'formerly smoked'] = 2
  brain_as_reshaped[brain_as_reshaped == 'Unknown'] = 3

  # Standardize
  std_brain = scaler.transform(brain_as_reshaped)
  #print(std_brain)

  # Prediction
  prediction = modelRF.predict(std_brain)
  #print(prediction)
  if(prediction[0] == 0):
    print("The Patient did not had a Stroke")
  else:
    print("The Patient had a Stroke")

In [22]:
# Testing the Predictive System
lst = []
for i in range(0, 10):
  ele = input()
  lst.append(ele)

# Female,81,0,0,Yes,Self-employed,Rural,81.95,16.9,never smoked

## Calling the Method and passing the list
Brain_Stroke_Predict(lst)

Female
81
0
0
Yes
Self-employed
Rural
81.95
16.9
never smoked
The Patient had a Stroke


  "X does not have valid feature names, but"
