In [1]:
from pmlb import fetch_data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from nice import NICE
import pandas as pd

# 導入資料集


In [2]:
# Path to the stroke dataset CSV. Forward slashes are used throughout: the
# original literal mixed in one backslash before a non-escape character, which
# Python flags as an invalid escape sequence (SyntaxWarning on 3.12+).
# Adjust this path for your own machine.
DATA_PATH = 'C:/Users/Calvin/OneDrive/桌面/台科/data Mining/HW2/healthcare-dataset-stroke-data.csv'

df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# 前處理

In [3]:
# The row identifier carries no predictive signal, so remove it.
df = df.drop(columns=['id'])

In [4]:
# Integer-encode each string-valued column. The single encoder is refit on
# every column, so after the loop it only remembers the last column's classes.
label_encoder = LabelEncoder()
for column in ('gender', 'ever_married', 'work_type',
               'Residence_type', 'smoking_status'):
    df[column] = label_encoder.fit_transform(df[column])

# Fill the remaining missing values with the mean of their column.
df = df.fillna(df.mean())

df


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.600000,1,1
1,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
3,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
4,0,79.0,1,0,1,3,0,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,2,1,83.75,28.893237,2,0
5106,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5107,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5108,1,51.0,0,0,1,2,0,166.29,25.600000,1,0


In [5]:
# Separate the 'stroke' target from the feature columns and remember the
# feature column order for later index lookups.
y = df['stroke']
X = df.drop(columns=['stroke'])
feature_names = X.columns.tolist()

In [6]:
# Display the target series (the 'stroke' column) for a quick sanity check.
y

0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64

In [7]:
# Display the feature frame (every column except 'stroke').
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,67.0,0,1,1,2,1,228.69,36.600000,1
1,0,61.0,0,0,1,3,0,202.21,28.893237,2
2,1,80.0,0,1,1,2,0,105.92,32.500000,2
3,0,49.0,0,0,1,2,1,171.23,34.400000,3
4,0,79.0,1,0,1,3,0,174.12,24.000000,2
...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,2,1,83.75,28.893237,2
5106,0,81.0,0,0,1,3,1,125.20,40.000000,2
5107,0,35.0,0,0,1,3,0,82.99,30.600000,2
5108,1,51.0,0,0,1,2,0,166.29,25.600000,1


# 列出哪些features為 category，哪些為numeric

In [8]:
print(feature_names)

# Name the categorical and numeric columns explicitly and derive their
# positional indices from feature_names. Hard-coded positions would silently
# point at the wrong columns if the column order ever changed; a lookup by
# name raises ValueError instead.
categorical_columns = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                       'work_type', 'Residence_type', 'smoking_status']
numeric_columns = ['age', 'avg_glucose_level', 'bmi']

cat_feat = [feature_names.index(name) for name in categorical_columns]
num_feat = [feature_names.index(name) for name in numeric_columns]

['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']


In [9]:
# Build the arrays straight from df so this cell can be re-run safely
# (the original `X = X.values` fails on a second run because X is already an
# ndarray by then).
X = df[feature_names].to_numpy()  # only supports arrays atm
y = df['stroke'].to_numpy()

# Fix the seed so the split (and everything downstream) is reproducible, and
# stratify on y so both partitions keep the same stroke / no-stroke ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# 對 category 的 feature 做 OneHotEncoder，numeric 的 feature 做 StandardScaler

In [10]:
# Preprocessing and model in one pipeline: numeric columns are standardised,
# categorical columns are one-hot encoded (categories unseen during fit are
# ignored at predict time), then a random forest is trained on the result.
clf = Pipeline([
    ('PP', ColumnTransformer([
        ('num', StandardScaler(), num_feat),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_feat)])),
    # Seed the forest so the fitted model, and therefore the counterfactual
    # produced further down, is the same on every run.
    ('RF', RandomForestClassifier(random_state=42))])

clf.fit(X_train, y_train)

In [11]:
def predict_fn(x):
    """Return the class probabilities that `clf` predicts for the rows in `x`."""
    return clf.predict_proba(x)

In [12]:
# Collect the explainer settings in one place, then build the NICE explainer
# from the training data and the model's probability function.
nice_settings = dict(
    X_train=X_train,
    y_train=y_train,
    predict_fn=predict_fn,
    cat_feat=cat_feat,
    num_feat=num_feat,
    distance_metric='HEOM',
    num_normalization='minmax',
    optimization='proximity',
    justified_cf=True,
)
NICE_adult = NICE(**nice_settings)

In [13]:
# Keep the first test row as a 2-D (single-row) array.
to_explain = X_test[:1]

In [14]:
# Display the held-out feature array.
X_test

array([[  0.  ,  40.  ,   0.  , ..., 158.93,  31.3 ,   3.  ],
       [  1.  ,  51.  ,   0.  , ...,  66.11,  26.3 ,   2.  ],
       [  1.  ,  52.  ,   1.  , ...,  74.64,  30.7 ,   3.  ],
       ...,
       [  1.  ,  18.  ,   0.  , ...,  97.39,  22.8 ,   2.  ],
       [  0.  ,  81.  ,   0.  , ...,  69.01,  32.6 ,   2.  ],
       [  1.  ,  18.  ,   0.  , ...,  60.56,  33.  ,   2.  ]])

In [15]:
# Display the held-out labels.
y_test

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
# Display the single test row selected for explanation.
to_explain

array([[  0.  ,  40.  ,   0.  ,   0.  ,   0.  ,   2.  ,   0.  , 158.93,
         31.3 ,   3.  ]])

In [17]:
# Ask the NICE explainer for a counterfactual of the selected test row.
# NOTE(review): the result is indexed as CF[0] further down, so it is presumably
# a 2-D array with one row per explained instance; confirm against the NICE docs.
CF = NICE_adult.explain(to_explain)

In [18]:
# Put the original test row (row 0) and its counterfactual (row 1) side by side.
original_row = X_test[0]
counterfactual_row = CF[0]
de = pd.DataFrame([original_row, counterfactual_row], columns=feature_names)
de


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0.0,40.0,0.0,0.0,0.0,2.0,0.0,158.93,31.3,3.0
1,1.0,49.0,0.0,0.0,0.0,2.0,0.0,104.86,31.9,3.0


In [19]:
# Label both rows with the class the model actually predicts for them.
# The original derived the labels from the ground-truth y_test[0] and assumed
# the counterfactual is its opposite. A counterfactual flips the *model's
# prediction*, so that labelling is wrong whenever the model misclassifies
# the explained row. Selecting de[feature_names] also keeps this cell
# re-runnable after the 'stroke' column has been added.
de['stroke'] = clf.predict(de[feature_names].to_numpy())

# 結果

In [20]:
# Final comparison: row 0 is the original instance, row 1 its counterfactual.
de

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0.0,40.0,0.0,0.0,0.0,2.0,0.0,158.93,31.3,3.0,0
1,1.0,49.0,0.0,0.0,0.0,2.0,0.0,104.86,31.9,3.0,1
