In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn as ib
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [30]:
df = pd.read_csv("dataset.csv")

In [31]:
# The 'id' column is dropped since the attribute holds no significant importance to the problem at hand
df = df.drop(['id'],axis=1)

In [32]:
# Checking the values in the gender column
df['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
Female,2994
Male,2115
Other,1


In [34]:
# Removing the 'other' gender instance inorder to reduce the dimension
df['gender'] = df['gender'].replace('Other','Male')
# plotting a pie chart to see the gender count distribution
df['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
Female,2994
Male,2116


In [35]:
  # Value count in the stroke attribute
df['stroke'].value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,4861
1,249


In [37]:
print("% of people who actualy got a stroke : ",(df['stroke'].value_counts()[1]/df['stroke'].value_counts().sum()).round(3)*100)

% of people who actualy got a stroke :  4.9


In [38]:
df['work_type'].value_counts()

Unnamed: 0_level_0,count
work_type,Unnamed: 1_level_1
Private,2925
Self-employed,819
children,687
Govt_job,657
Never_worked,22


In [40]:
df['smoking_status'].value_counts()

Unnamed: 0_level_0,count
smoking_status,Unnamed: 1_level_1
never smoked,1892
Unknown,1544
formerly smoked,885
smokes,789


In [42]:
df['Residence_type'].value_counts()

Unnamed: 0_level_0,count
Residence_type,Unnamed: 1_level_1
Urban,2596
Rural,2514


In [43]:
# Number of BMI - NULL values
df['bmi'].isnull().sum()

201

In [44]:
# Finding the count of outliers based on those instances which are out of iqr
Q1 = df['bmi'].quantile(0.25)
Q3 = df['bmi'].quantile(0.75)
# Finding IQR
IQR = Q3 - Q1
da=(df['bmi'] < (Q1 - 1.5 * IQR)) | (df['bmi'] > (Q3 + 1.5 * IQR))
da.value_counts()

Unnamed: 0_level_0,count
bmi,Unnamed: 1_level_1
False,5000
True,110


In [46]:
print("Percentage of NULL values in bmi : ",df['bmi'].isna().sum()/len(df['bmi'])*100)

Percentage of NULL values in bmi :  3.9334637964774952


In [48]:
print(" Percentage of instances who got stroke : ",df['stroke'].sum()/len(df)*100)

 Percentage of instances who got stroke :  4.87279843444227


In [49]:
# Analysing whether to drop NA values in Bmi column
df_na=df.loc[df['bmi'].isnull()]
print("Nan BMI values where people have stroke:",df_na['stroke'].sum())
print("overall BMI values where people have stroke:",df['stroke'].sum())

Nan BMI values where people have stroke: 40
overall BMI values where people have stroke: 249


In [50]:
# Imputing the missing N/A values using the median of bmi column
print("median of bmi",df['bmi'].median())
df['bmi']=df['bmi'].fillna(df['bmi'].median())

median of bmi 28.1


In [51]:
# Finding the count of outliers based on those instances which are out of iqr
Q1 = df['avg_glucose_level'].quantile(0.25)
Q3 = df['avg_glucose_level'].quantile(0.75)
IQR = Q3 - Q1
da=(df['avg_glucose_level'] < (Q1 - 1.5 * IQR)) | (df['avg_glucose_level'] > (Q3 + 1.5 * IQR))
da.value_counts()

Unnamed: 0_level_0,count
avg_glucose_level,Unnamed: 1_level_1
False,4483
True,627


In [52]:
# Value count of heart disease attribute
df['heart_disease'].value_counts()

Unnamed: 0_level_0,count
heart_disease,Unnamed: 1_level_1
0,4834
1,276


In [55]:
# Value count of evver married attribute
df['ever_married'].value_counts()

Unnamed: 0_level_0,count
ever_married,Unnamed: 1_level_1
Yes,3353
No,1757


In [56]:
# Converting numeric-binary value attributes to string
df[['hypertension', 'heart_disease', 'stroke']] = df[['hypertension', 'heart_disease', 'stroke']].astype(str)
# Generating dummy attributes - one hot encoding format
df = pd.get_dummies(df, drop_first= True)


In [57]:
# The data frame after performing dummy attributes
df.head()

Unnamed: 0,age,avg_glucose_level,bmi,gender_Male,hypertension_1,heart_disease_1,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke_1
0,67.0,228.69,36.6,True,False,True,True,False,True,False,False,True,True,False,False,True
1,61.0,202.21,28.1,False,False,False,True,False,False,True,False,False,False,True,False,True
2,80.0,105.92,32.5,True,False,True,True,False,True,False,False,False,False,True,False,True
3,49.0,171.23,34.4,False,False,False,True,False,True,False,False,True,False,False,True,True
4,79.0,174.12,24.0,False,True,False,True,False,False,True,False,False,False,True,False,True


In [58]:
# Since our Dataset is highly undersampled (based on target instances) we are going to perform a over sampling method to have equal representation of both the target classes
# Using random oversampling - importing the library
from imblearn.over_sampling import RandomOverSampler

# Performing a minority oversampling
oversample = RandomOverSampler(sampling_strategy='minority')
X=df.drop(['stroke_1'],axis=1)
y=df['stroke_1']

# Obtaining the oversampled dataframes - testing and training
X_over, y_over = oversample.fit_resample(X, y)

In [59]:
# importing a scaling modeule
from sklearn.preprocessing import StandardScaler

# Since the numeric attributes in the dataset is in different ranges and three are outliers persent we are usign a scaler to get all the values into the same range.
s = StandardScaler()
# Scaling the numeric attributes
df[['bmi', 'avg_glucose_level', 'age']] = s.fit_transform(df[['bmi', 'avg_glucose_level', 'age']])

In [60]:
# creating dataset split for training and testing the model
from sklearn.model_selection import train_test_split
# Performing a 80-20 test-train split
#X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size= 0.20, random_state= 42)
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size= 0.20, random_state= 42)

In [88]:
# Checking the size of the splits
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: (7777, 15)
y_train: (7777,)
X_test: (1945, 15)
y_test: (1945,)


In [89]:

# importing random forest classifier module for training
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Create the classifier object
rf_clf = RandomForestClassifier(n_estimators = 100)

# Train the model using the training sets
rf_clf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred_rf = rf_clf.predict(X_test)

# Printing accuracy of the model
print('Accuracy:', accuracy_score(y_test, y_pred_rf)*100)

Accuracy: 99.33161953727506


In [90]:
# Making sample predictions based on manual value entry
age=37
avg_glucose_level=91.72
bmi=29.2
gender_Male=1
ever_married_Yes=1
work_type_Never_worked=0
work_type_Private=1
work_type_Self_employed=0
work_type_children=0
Residence_type_Urban=1
smoking_status_formerly_smoked=0
smoking_status_never_smoked=1
smoking_status_smokes=0
hypertension_1=0
heart_disease_1=0
input_features = [age	,avg_glucose_level,	bmi	,gender_Male,hypertension_1,	heart_disease_1,ever_married_Yes,	work_type_Never_worked,	work_type_Private,	work_type_Self_employed,	work_type_children	,Residence_type_Urban,	smoking_status_formerly_smoked,smoking_status_never_smoked	,smoking_status_smokes]

features_value = [np.array(input_features)]
features_name = ['age'	,'avg_glucose_level',	'bmi'	,'gender_Male'	,'hypertension_1',	'heart_disease_1','ever_married_Yes',	'work_type_Never_worked',	'work_type_Private',	'work_type_Self-employed',	'work_type_children'	,'Residence_type_Urban',	'smoking_status_formerly smoked','smoking_status_never smoked'	,'smoking_status_smokes']

df = pd.DataFrame(features_value, columns=features_name)
prediction = rf_clf.predict(df)[0]
print(prediction)


False
