In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Exploration

In [None]:
def plotPieChart(x,labels,title,colors = sns.color_palette("pastel"),autopct='%.0f%%',shadow=True, startangle=90, explode=(0.1, 0)):
    """Draw a labelled pie chart on its own figure and show it.

    Parameters
    ----------
    x : sequence of numbers
        Wedge sizes, one entry per category.
    labels : sequence
        Wedge labels, in the same order as ``x``.
    title : object
        Chart title; converted to ``str`` and passed through ``str.capitalize``.
    colors : sequence of colors
        Wedge colors. Defaults to seaborn's "pastel" palette.
    autopct : str
        Format string for the percentage printed inside each wedge.
    shadow : bool
        Whether to draw a shadow beneath the pie.
    startangle : float
        Rotation of the first wedge, in degrees.
    explode : sequence of float or None
        Radial offset per wedge. It is forwarded to matplotlib only when it
        has exactly one entry per wedge; otherwise it is ignored, because
        matplotlib rejects an ``explode`` whose length differs from ``x``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()`` and then closed.
    """
    # An empty tuple is not a valid subplot specification (matplotlib expects
    # a three-digit integer or nrows, ncols, index), so create the figure and
    # its single axes explicitly instead of calling `plt.subplot(())`.
    fig, ax = plt.subplots()
    # str() keeps non-string column labels (e.g. integers) from breaking here.
    ax.set_title(str(title).capitalize(), fontsize=16)

    # The default (0.1, 0) only fits two-wedge pies; skip it for other sizes.
    wedge_offsets = None
    if explode is not None and len(explode) == len(x):
        wedge_offsets = explode

    ax.pie(
        x=x,
        labels=labels,
        colors=colors,
        autopct=autopct,
        shadow=shadow,
        startangle=startangle,
        explode=wedge_offsets,
    )
    ax.legend()
    plt.show()
    # Release the figure so calling this in a loop does not accumulate figures.
    plt.close(fig)
    

In [None]:
def plotChartForAllDataset(dataset,excludeList=None):
    """Plot the distribution of every column of ``dataset``.

    Columns with ``object`` dtype, and ``int64`` columns with fewer than five
    distinct values, are treated as categorical and drawn as pie charts via
    ``plotPieChart``. Every other column is drawn as a histogram.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are plotted.
    excludeList : iterable of column labels, optional
        Columns to skip. ``None`` (the default) skips nothing.

    Returns
    -------
    None
        One figure is shown per plotted column.
    """
    excluded = set(excludeList) if excludeList is not None else set()

    # Walk the frame's own column order. Iterating a set of names (as a set
    # difference would) yields a hash-dependent order that can change between
    # kernel restarts, so the figures would appear in a different sequence.
    for column_name in dataset.columns:
        if column_name in excluded:
            continue

        column = dataset[column_name]
        is_categorical = column.dtype == object or (
            column.dtype == 'int64' and len(column.unique()) < 5
        )

        if is_categorical:
            # value_counts tallies every category in one pass and, with
            # dropna=False, counts missing values too. An equality mask
            # against NaN never matches, which would give a 0-sized wedge.
            # Wedges come out ordered by descending frequency.
            counts = column.value_counts(dropna=False)
            plotPieChart(x=counts.tolist(), title=column_name, labels=counts.index.tolist())
        else:
            fig, ax = plt.subplots()
            sns.histplot(data=dataset, x=column_name, ax=ax)
            ax.set_ylabel("# of Records")
            plt.show()
            # Release the figure so wide frames do not accumulate open figures.
            plt.close(fig)

## Classification

### Healthcare dataset stroke 

https://nbviewer.org/github/PBPatil/Exploratory_Data_Analysis-Wine_Quality_Dataset/blob/master/winequality_white.ipynb
https://towardsdatascience.com/exploratory-data-analysis-8fc1cb20fd15

In [None]:
# Load the healthcare stroke dataset into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
strokeDataset = pd.read_csv('datasets/healthcare-dataset-stroke-data.csv')

In [None]:
# Column names, dtypes and non-null counts: a quick check for missing values.
strokeDataset.info()

In [None]:
# Preview the first rows of the loaded data.
strokeDataset.head()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
strokeDataset.describe()

#### Feature explanation


Each record holds information about a patient's health and whether he/she had a stroke.
For the classification we will discard the id attribute because it does not hold any predictive power.

1) id: unique identifier
2) gender: "Male", "Female" or "Other"
3) age: age of the patient
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6) ever_married: "No" or "Yes"
7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
8) Residence_type: "Rural" or "Urban"
9) avg_glucose_level: average glucose level in blood
10) bmi: body mass index
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
12) stroke: 1 if the patient had a stroke or 0 if not
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [None]:
# Plot the distribution of every column; `id` is skipped via excludeList.
plotChartForAllDataset(strokeDataset,excludeList=['id'])

# Model Evaluation

In [None]:
# Utils
def preprocess(dataset):
    """Label-encode string columns and mean-impute missing numeric values.

    Every column whose dtype is ``object`` is replaced by the integer codes
    produced by scikit-learn's ``LabelEncoder``. Afterwards, missing values in
    numeric columns are filled with that column's mean, computed from
    ``dataset`` itself.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to transform. It is modified IN PLACE; pass ``frame.copy()`` if
        the original must be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, returned for convenience.

    Notes
    -----
    NOTE(review): the means are taken over every row passed in. If this runs
    before a train/test split (or before cross-validation), the imputed values
    are computed using rows that later end up in the evaluation folds.
    """
    text_columns = [name for name in dataset.columns if dataset[name].dtype == object]

    if text_columns:
        # Imported here so the function does not rely on a `preprocessing`
        # name that is only brought into scope by a later notebook cell.
        from sklearn.preprocessing import LabelEncoder

        for column_name in text_columns:
            # fit_transform refits on each call, so a fresh encoder per column
            # is equivalent to reusing one and makes the independence explicit.
            dataset[column_name] = LabelEncoder().fit_transform(dataset[column_name])

    # Use the argument's own means, not a module-level frame, so the function
    # works for any DataFrame. numeric_only guards against remaining
    # non-numeric columns that have no mean.
    dataset.fillna(dataset.mean(numeric_only=True), inplace=True)
    return dataset

## Classifiers

### Healthcare dataset stroke 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from SoftSplitDecisionTrees import SoftSplitDecisionTreeClassifier 
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

## Compare a regular decision tree with the soft-split variant (5-fold CV accuracy)
strokeDataset=preprocess(strokeDataset)

# `stroke` is the target and `id` is only a unique row identifier, so neither
# belongs in the feature matrix. Leaving `id` in would let the trees split on
# an arbitrary identifier. errors='ignore' keeps the cell re-runnable if `id`
# has already been dropped.
strokeFeatures = strokeDataset.drop(columns=['id', 'stroke'], errors='ignore')
strokeTarget = strokeDataset['stroke']

# NOTE(review): neither estimator is given a fixed random seed here, so scores
# may vary slightly between runs; confirm whether the soft-split classifier
# accepts a `random_state` before pinning one on both models.
treeClassifier = DecisionTreeClassifier()
treeSoftSplitClassifier = SoftSplitDecisionTreeClassifier()

# Both models are scored on the same features/target with the same fold count.
scoresRegular = cross_val_score(estimator=treeClassifier, X=strokeFeatures, y=strokeTarget, cv=5)
scoresSoftSplit = cross_val_score(estimator=treeSoftSplitClassifier, X=strokeFeatures, y=strokeTarget, cv=5)
display(f'Regular Model accuracy {scoresRegular.mean()}')
display(f'SoftSplit Model accuracy {scoresSoftSplit.mean()}')
