In [521]:
## Imports ##
import gower
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from matplotlib import pyplot as plt
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils import calculate_distance_matrix
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

from utility import FeatureSelection

In [522]:
## Read the dataset ##
# Load the raw records once; all later work happens on the `df` copy so that
# `df_raw` itself is never modified.
df_raw = pd.read_csv("datasets/healthcare_dataset.csv")
# Debug toggle: uncomment to iterate on a small subset of the rows.
# df_raw = df_raw.head(5000)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
df_raw.info()
df = df_raw.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [524]:
## Data Preprocessing ##
# Calculate Length of Stay #
# The working frame is rebuilt from `df_raw` instead of being mutated in
# place. The previous version dropped the two date columns with inplace=True,
# so running this cell a second time raised KeyError on 'Date of Admission'.
admitted_on = pd.to_datetime(df_raw['Date of Admission'])
discharged_on = pd.to_datetime(df_raw['Discharge Date'])
df = (
    df_raw
    .drop(columns=['Date of Admission', 'Discharge Date'])
    # Whole days between admission and discharge, appended as the last column.
    .assign(**{'Length of Stay': (discharged_on - admitted_on).dt.days})
)

# Feature matrix: everything except 'Test Results', which is kept in `df`.
X_encoded = df.drop(columns=['Test Results'])
# Column lists are captured BEFORE encoding: afterwards every column is
# float64 and the categorical/numeric split could no longer be recovered.
cat_cols = X_encoded.select_dtypes(include=['object']).columns.tolist()
num_cols = X_encoded.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Temporary encoding of the categorical columns: each distinct value is
# replaced by a numeric code.
encoder_cat = OrdinalEncoder()
X_encoded[cat_cols] = encoder_cat.fit_transform(X_encoded[cat_cols])

# Min-max scale the originally numeric columns.
encoder_num = MinMaxScaler()
X_encoded[num_cols] = encoder_num.fit_transform(X_encoded[num_cols])

X_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  float64
 1   Age                 55500 non-null  float64
 2   Gender              55500 non-null  float64
 3   Blood Type          55500 non-null  float64
 4   Medical Condition   55500 non-null  float64
 5   Doctor              55500 non-null  float64
 6   Hospital            55500 non-null  float64
 7   Insurance Provider  55500 non-null  float64
 8   Billing Amount      55500 non-null  float64
 9   Room Number         55500 non-null  float64
 10  Admission Type      55500 non-null  float64
 11  Medication          55500 non-null  float64
 12  Length of Stay      55500 non-null  float64
dtypes: float64(13)
memory usage: 5.5 MB


In [525]:
# Rank every encoded feature against the 'Test Results' column.
# FeatureSelection is a project helper; the table it returns is printed with
# one row per feature and MI, Chi2, RF and MeanScore columns.
target = df.loc[:, 'Test Results']
df_rank = FeatureSelection(X_encoded, target)
print(df_rank)




               Feature        MI          Chi2        RF  MeanScore
0                 Name  0.999886  1.000000e+00  0.995872   0.998586
6             Hospital  0.812405  3.370223e-01  0.998725   0.716051
5               Doctor  0.831347  2.304744e-01  0.992886   0.684903
8       Billing Amount  1.000000  7.269938e-07  1.000000   0.666667
9          Room Number  0.007783  0.000000e+00  0.941622   0.316468
1                  Age  0.001216  6.984964e-06  0.773313   0.258178
12      Length of Stay  0.000467  2.530172e-08  0.678552   0.226340
3           Blood Type  0.000052  2.448312e-05  0.398565   0.132880
4    Medical Condition  0.000098  4.778571e-06  0.313001   0.104368
11          Medication  0.000020  1.598292e-05  0.267140   0.089059
7   Insurance Provider  0.000058  1.186725e-04  0.238317   0.079498
10      Admission Type  0.000000  6.368444e-06  0.114383   0.038130
2               Gender  0.000006  1.737979e-05  0.000000   0.000008


✅ Tieni | Test Results, Billing Amount, Age | Le uniche chiaramente informative

⚠️ Rivedi | Length of Stay, Medication, Medical Condition | Solo se rielaborate bene

❌ Scarta | Name, Doctor, Hospital, Room Number, Blood Type, Gender, Insurance Provider, Admission Type | Non informative o rischiose

In [526]:
# Remove the features rejected after the first ranking, then rank the rest.
# The drop is a reassignment with errors='ignore' rather than inplace=True so
# the cell can be re-run: the previous version raised KeyError on a second
# run because the columns were already gone.
dropped_features = [
    'Name', 'Hospital', 'Room Number', 'Doctor', 'Blood Type',
    'Insurance Provider', 'Gender', 'Admission Type',
]
X_encoded = X_encoded.drop(columns=dropped_features, errors='ignore')

df_rank = FeatureSelection(X_encoded, target)
print(df_rank)




             Feature        MI      Chi2        RF  MeanScore
2     Billing Amount  1.000000  0.043972  1.000000   0.681324
3         Medication  0.000000  1.000000  0.000000   0.333333
0                Age  0.001196  0.436134  0.516195   0.317842
1  Medical Condition  0.000078  0.297868  0.026412   0.108120
4     Length of Stay  0.000448  0.000000  0.307105   0.102518


In [527]:
## SHAP summary for the selected features ##
# The previous version of this cell used `model` and `X_test`, which no
# earlier cell defines, so it only worked off leftover kernel state and fails
# with NameError after Restart & Run All. Both are now built here.
# NOTE(review): the random forest and the sample size below are stand-ins
# picked by the reviewer - confirm they match the model this analysis was
# meant to explain.
SHAP_SEED = 42
# Upper bound on the rows passed to SHAP. The recorded run of this cell ended
# in KeyboardInterrupt, presumably because it was too slow - TODO confirm.
SHAP_SAMPLE_SIZE = 500

# Hold out a small random sample to explain; never more than a fifth of the
# rows so that some training data always remains.
n_explained = min(SHAP_SAMPLE_SIZE, max(1, len(X_encoded) // 5))
X_test = X_encoded.sample(n=n_explained, random_state=SHAP_SEED)
X_train = X_encoded.drop(index=X_test.index)

model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    n_jobs=-1,
    random_state=SHAP_SEED,
)
# `target` shares the row index of X_encoded, so labels are aligned by index.
model.fit(X_train, target.loc[X_train.index])

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)


KeyboardInterrupt: 