In [1]:
import pyspark
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf, split, col, regexp_extract,regexp_replace
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml import Pipeline
from pyspark.ml.feature import  NGram, Tokenizer,CountVectorizer, StopWordsRemover
import pandas as pd 
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression 
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix 

In [2]:
# Reuse the active SparkSession if there is one, otherwise start a new one with default settings.
spark=SparkSession.builder.getOrCreate()

In [3]:
# Directory that holds the two stroke CSV files. Edit this single line when the data
# lives somewhere else instead of patching every read call.
DATA_DIR = '/home/gifty/Documents/bs_test/healthcare-dataset-stroke-data'

# header=True takes the column names from the first row; inferSchema=True lets Spark
# infer each column's type from the data.
test_dataset = spark.read.csv(DATA_DIR + '/test_2v.csv', header=True, inferSchema=True)
train_dataset = spark.read.csv(DATA_DIR + '/train_2v.csv', header=True, inferSchema=True)

AnalysisException: 'Path does not exist: file:/home/gifty/Documents/bs_test/healthcare-dataset-stroke-data/test_2v.csv;'

In [None]:
# Print the leading rows of the test split; truncate=0 turns off cell truncation so full values are shown.
test_dataset.show(truncate=0)

In [None]:
# Print the leading rows of the training split; truncate=0 turns off cell truncation so full values are shown.
train_dataset.show(truncate=0)

In [None]:
# Column names of the test split.
test_dataset.columns

In [None]:
def count_empty(df,sort=True):
    """
    Count the missing values in each column of a Spark DataFrame.

    A value is treated as missing when it is null or, for float/double columns,
    when it is NaN.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to inspect.
    sort : bool, default True
        If True, return a frame indexed by column name with a single "count"
        column, sorted in descending order. If False, return the raw one-row
        frame that has one column per input column.

    Returns
    -------
    pandas.DataFrame or None
        The per-column counts, or None (after printing a message) when no
        column contains a missing value.
    """
    def _is_missing(name, dtype):
        # F.isnan is only applicable to floating-point columns, so it is added
        # to the null test for those dtypes only.
        missing = F.isnull(name)
        if dtype in ('float', 'double'):
            missing = missing | F.isnan(name)
        return missing

    # F.when(cond, c) yields the column-name string as a non-null literal for matching
    # rows and null otherwise; F.count then counts exactly the matching rows.
    counts = df.select(
        [F.count(F.when(_is_missing(c, c_type), c)).alias(c) for (c, c_type) in df.dtypes]
    ).toPandas()

    # A global aggregate always produces one row, so "nothing missing" has to be
    # detected from the counts themselves rather than from the number of rows.
    if len(counts) == 0 or not counts.values.any():
        print("There are no any missing values!")
        return None

    if sort:
        return counts.rename(index={0: 'count'}).T.sort_values("count",ascending=False)

    return counts

In [None]:
# Missing-value counts per column for the test split.
count_empty(test_dataset)

In [None]:
# Missing-value counts per column for the training split.
count_empty(train_dataset)

In [None]:
# Convert both Spark frames to pandas for the concatenation and plotting that follow.
# The isinstance guards keep this cell re-runnable: once a name already holds a pandas
# frame, calling .toPandas() on it again would raise AttributeError.
if not isinstance(test_dataset, pd.DataFrame):
    test_dataset = test_dataset.toPandas()
if not isinstance(train_dataset, pd.DataFrame):
    train_dataset = train_dataset.toPandas()

In [None]:
# Stack the two splits into one pandas frame, test rows first. ignore_index=True gives
# the result a fresh 0..n-1 index; sort=False leaves the columns unsorted.
combined_dataset = pd.concat([test_dataset, train_dataset], sort=False, ignore_index=True)

In [None]:
# First 50 rows of the combined frame.
combined_dataset.head(50)

In [None]:
# Rebuild the combined pandas frame as a Spark DataFrame for the groupBy() aggregations below.
# NOTE(review): the name now refers to a Spark frame until the later toPandas() cell converts it back.
combined_dataset = spark.createDataFrame(combined_dataset)

In [None]:
# Maximum of the three continuous columns over the whole frame (groupBy() without keys), collected as a list of Rows.
combined_dataset.groupBy().max('age', 'avg_glucose_level', 'bmi').collect()

In [None]:
# Minimum of the three continuous columns over the whole frame (groupBy() without keys), collected as a list of Rows.
combined_dataset.groupBy().min('age', 'avg_glucose_level', 'bmi').collect()

In [None]:
# Convert the combined frame back to pandas; the plotting cells below use pandas/seaborn APIs.
combined_dataset = combined_dataset.toPandas()

In [None]:
# Balance of the target: share of each 'stroke' value (pie, left) next to the raw row counts (bars, right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# explode has two entries, so this assumes 'stroke' takes exactly two distinct values.
combined_dataset['stroke'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('stroke')
ax[0].set_ylabel('')
# x is passed by keyword: current seaborn releases no longer accept it positionally.
sns.countplot(x='stroke', data=combined_dataset, ax=ax[1])
ax[1].set_title('stroke')
plt.show()

### Observation
18% of the patients had stroke

In [None]:
#combined_dataset.groupby([''])

In [None]:
# Age histograms side by side: rows with stroke == 0 (left, green) and stroke == 1 (right, red).
f, ax = plt.subplots(1, 2, figsize=(20, 10))
x1 = list(range(0, 100, 5))  # x tick positions for the left panel
x2 = list(range(0, 100, 5))  # x tick positions for the right panel

for axis, flag, colour, title, ticks in zip(
    ax, (0, 1), ('green', 'red'), ('stroke= 0', 'stroke= 1'), (x1, x2)
):
    ages = combined_dataset.loc[combined_dataset['stroke'] == flag, 'age']
    ages.plot.hist(ax=axis, bins=20, edgecolor='black', color=colour)
    axis.set_title(title)
    axis.set_xticks(ticks)

plt.show()

### Observation
+ People under age 30 rarely had stroke
+ Older people were more prone to stroke

In [None]:
f,ax=plt.subplots(1,2,figsize=(18,8))
combined_dataset[['gender','stroke']].groupby(['gender']).mean().plot.bar(ax=ax[0])
ax[0].set_title('Stroke vs Gender')
sns.countplot('gender',hue='stroke',data=combined_dataset,ax=ax[1])
ax[1].set_title('gender:No Stroke vs Had Stroke')
plt.show()

### Observation
More women had stroke than men

In [None]:
# Bars show avg_glucose_level aggregated per smoking_status, with one bar per stroke value.
sns.catplot(
    data=combined_dataset,
    x='smoking_status',
    y='avg_glucose_level',
    hue='stroke',
    kind="bar",
)
plt.show()

### Observation
+ People who have never smoked were the least likely to get stroke
+ People who formerly smoked had higher chance of getting stroke
+ People who currently smoked had the highest chance of getting stroke

In [None]:
# Bars show bmi aggregated per gender, with one bar per stroke value.
sns.catplot(
    data=combined_dataset,
    x='gender',
    y='bmi',
    hue='stroke',
    kind="bar",
)
plt.show()

In [None]:
# Bars show bmi aggregated per heart_disease value, with one bar per stroke value.
sns.catplot(
    data=combined_dataset,
    x='heart_disease',
    y='bmi',
    hue='stroke',
    kind="bar",
)
plt.show()

In [None]:
# Bars show age aggregated per hypertension value, with one bar per stroke value.
sns.catplot(
    data=combined_dataset,
    x='hypertension',
    y='age',
    hue='stroke',
    kind="bar",
)
plt.show()

In [None]:
# Correlation heat-map of the training split.
# Only numeric/boolean columns are handed to corr(): the frame still holds string
# columns (e.g. 'gender'), and pandas 2.x raises on those instead of skipping them.
numeric_train = train_dataset.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(), annot=True, cmap='RdYlGn', linewidths=0.2)
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()

In [None]:
# Preview the frame without the continuous columns. They are deliberately NOT dropped
# in place here: the banding cells further down still read 'age', 'bmi' and
# 'avg_glucose_level', and removing them at this point makes those cells fail with KeyError.
combined_dataset.drop(['age', 'bmi', 'avg_glucose_level'], axis=1, errors='ignore').head()

The data we have been working with has lots of missing data and is imbalanced

In [None]:
# (rows, columns) of the combined frame.
print ('Combined Data Shape: {}'.format(combined_dataset.shape))

In [None]:
# Percentage of missing entries in each column.
combined_dataset.isnull().sum()/len(combined_dataset)*100

In [None]:
# First five rows of the combined frame.
combined_dataset.head()

In [None]:
## Unwanted columns (preview only)
# Show what the frame looks like without the continuous columns, but keep them in
# combined_dataset: the age/bmi/glucose banding cells below still need them. The
# in-place drop belongs after those band columns have been created.
combined_dataset.drop(['age', 'bmi', 'avg_glucose_level'], axis=1, errors='ignore').head()

## Categorical Values

Let's convert the columns with continuous values into categorical values.
+ Age - The age of patients will be classified into five categories - 0-15, 16-35, 36-45, 46-65, 66 and above
+ BMI - will be classified in four categories - 0-18 as thin, 19-25 as normal, 26-30 as overweight and 31 and above as obese
+ Average Glucose Level - The values are assumed to be random blood sugar readings measured in mg/dl. They will be classified into three categories - below 140, exactly 140, and above 140

In [None]:
# age_band is the number of cut-offs (15, 35, 45, 65) that the age exceeds:
#   0: age <= 15, 1: (15, 35], 2: (35, 45], 3: (45, 65], 4: age > 65
# A missing age fails every comparison and therefore ends up in band 0.
combined_dataset['age_band'] = sum(
    (combined_dataset['age'] > cutoff).astype('int64') for cutoff in (15, 35, 45, 65)
)
combined_dataset.head(2)

In [None]:
# Number of rows in each age band, rendered as a table shaded with the 'summer' colormap.
combined_dataset['age_band'].value_counts().to_frame().style.background_gradient(cmap='summer')

In [None]:
# Point plot of 'stroke' per age band, one line per gender.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind='point' is what factorplot drew by default, so the figure is the same.
sns.catplot(x='age_band', y='stroke', hue='gender', data=combined_dataset, kind='point')
plt.show()

In [None]:
# bmi_band is the number of cut-offs (18, 25, 30) that the BMI exceeds:
#   0: bmi <= 18 (thin), 1: (18, 25] (normal), 2: (25, 30] (overweight), 3: bmi > 30 (obese)
# NOTE: a NaN bmi fails every comparison and therefore ends up in band 0.
combined_dataset['bmi_band'] = sum(
    (combined_dataset['bmi'] > cutoff).astype('int64') for cutoff in (18, 25, 30)
)

# Point plot of 'stroke' per BMI band, one line per age band.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind='point' is what factorplot drew by default.
sns.catplot(x='bmi_band', y='stroke', hue='age_band', data=combined_dataset, kind='point')
plt.show()

In [None]:
# glucose_band: 0 = below 140, 1 = exactly 140, 2 = above 140.
# Written as two non-overlapping tests so the result no longer depends on the order
# in which overlapping conditions are applied. A NaN reading ends up in band 0.
combined_dataset['glucose_band'] = (
    (combined_dataset['avg_glucose_level'] >= 140).astype('int64')
    + (combined_dataset['avg_glucose_level'] > 140).astype('int64')
)

# Point plot of 'stroke' per glucose band, one line per gender.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind='point' is what factorplot drew by default.
sns.catplot(x='glucose_band', y='stroke', hue='gender', data=combined_dataset, kind='point')
plt.show()


In [None]:
## Dropping Unwanted Columns
# The continuous columns have been replaced by the band columns built in the cells above.
# errors='ignore' keeps the cell re-runnable (a second run would otherwise raise KeyError),
# and reassigning instead of inplace=True avoids mutating the frame behind other references.
combined_dataset = combined_dataset.drop(['age', 'bmi', 'avg_glucose_level'], axis=1, errors='ignore')

In [None]:
# First 20 rows of the combined frame.
combined_dataset.head(20)

In [None]:
# Encode the listed category labels as integers; values not in the lists are left unchanged.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# selected column does not write through to the frame under pandas Copy-on-Write.
combined_dataset['gender'] = combined_dataset['gender'].replace(['Male', 'Female'], [0, 1])
combined_dataset['smoking_status'] = combined_dataset['smoking_status'].replace(
    ['formerly smoked', 'never smoked', 'smokes'], [0, 1, 2]
)

In [None]:
# First five rows of the combined frame.
combined_dataset.head()

In [None]:
## Dropping Unwanted Columns
# Make sure the continuous columns are gone before the correlation heat-map.
# errors='ignore' turns this into a no-op when an earlier cell already removed them,
# instead of raising KeyError.
combined_dataset = combined_dataset.drop(['age', 'bmi', 'avg_glucose_level'], axis=1, errors='ignore')

In [None]:
# Correlation heat-map of the combined frame.
# Only numeric/boolean columns are handed to corr(): pandas 2.x raises on string
# columns instead of silently skipping them.
numeric_combined = combined_dataset.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_combined.corr(), annot=True, cmap='RdYlGn', linewidths=0.2)
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()

## Predictive Modeling

#### splitting the data, 70% for training and 30% for testing

In [None]:
# Hold-out split on a hand-picked list of feature columns (seven names are listed).
# NOTE(review): 'id' is among the features - presumably a row identifier rather than
# a predictor; confirm whether it should be fed to a model.
feature_col_names = [
    'id',
    'glucose_band',
    'heart_disease',
    'hypertension',
    'gender',
    'bmi_band',
    'age_band',
]
predicted_class_names = ['stroke']
split_test_size = 0.30

# Feature matrix: one column per name in feature_col_names.
X = combined_dataset[feature_col_names].values
# Target (1 = true, 0 = false per the original note); selecting with a list keeps it 2-D, one column.
y = combined_dataset[predicted_class_names].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split_test_size, random_state=42
)




In [None]:
# First five rows of the combined frame.
combined_dataset.head()

In [None]:
# 70/30 split of the combined frame, stratified on 'stroke' so both parts keep the
# same class proportions.
train, test = train_test_split(
    combined_dataset, test_size=0.3, random_state=0, stratify=combined_dataset['stroke']
)

# Select the target by name rather than by position. The positional form
# (columns[:1] as target, columns[1:] as features) silently uses whichever column
# happens to come first, and leaves 'stroke' among the features unless it is that column.
feature_columns = [c for c in combined_dataset.columns if c != 'stroke']

train_X = train[feature_columns]
train_Y = train[['stroke']]   # kept as a one-column DataFrame, as before
test_X = test[feature_columns]
test_Y = test[['stroke']]
X = combined_dataset[feature_columns]
Y = combined_dataset['stroke']