In [None]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the encoded dataset from disk. The context manager closes the file
# handle as soon as unpickling has finished instead of leaving it open.
# NOTE(review): unpickling can execute arbitrary code - only load
# "encodings.p" when it comes from a trusted source.
with open("encodings.p", "rb") as encodings_file:
  data = pickle.load(encodings_file)
data.head()

Unnamed: 0,patient_nbr,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,discharge_disposition_id_grouped_HomeDischarge,discharge_disposition_id_grouped_HomeWFollowUp,discharge_disposition_id_grouped_InstitutionalCare,discharge_disposition_id_grouped_Unknown,admission_source_grouped_Births,admission_source_grouped_Data_Issues,admission_source_grouped_Emergency_or_Law,admission_source_grouped_Referrals,admission_source_grouped_Transfers,payer_code_labeled
0,8222157.0,0.0,0.0,0.39604,0.0,0.0,0.0,0.0,0.0,0.493769,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,17.0
1,55629189.0,0.111111,0.166667,0.574257,0.0,0.435897,0.0,0.0,0.0,0.636364,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,17.0
2,86047875.0,0.222222,0.083333,0.09901,0.833333,0.307692,0.5,0.0,0.25,0.363636,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,17.0
3,82442376.0,0.333333,0.083333,0.425743,0.166667,0.384615,0.0,0.0,0.0,0.454545,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,17.0
4,42519267.0,0.444444,0.0,0.49505,0.0,0.179487,0.0,0.0,0.0,0.272727,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,17.0


In [None]:
# Optional: uncomment the next line to make pandas display every column of wide DataFrames.
#pd.set_option('display.max_columns', None)

# Training classification models
The goal is to determine if a patient will be readmitted within 30 days. To this end we want to train classification models. We will select three classification models which we will train with different levels of encoding and preprocessing of the dataset and by adjusting hyperparameters. With this method, we want to compare different models as well as different prerequisites within each model.

These are the classifier models we want to train:
* k-Nearest Neighbors: KNeighborsClassifier
* Support Vector Machine Algorithm (SVM): Linear Support Vector Classification (Linear SVC)

## Splitting the dataset
Before training any models, the dataset has to be split into a training and a test dataset. We will use the standard 80-20 approach.

But before this can be done, the dataset needs another round of special attention. As shown in the previous notebook, one patient can appear in multiple encounters in this dataset, recognizable by a shared 'patient_nbr'. We don't want encounters of the same patient to appear in both the training dataset and the test dataset: even after dropping the column 'patient_nbr', the model could recognize a patient it has already seen during training, which would distort the test results.

There are two approaches which we could take:
* having only one encounter for each patient, therefore dropping all lines of reappearing patient numbers (this means losing information)
* having a patient appear only in one part of the split (this means sorting the dataset by 'patient_nbr' before splitting, then randomizing the order again and dropping 'patient_nbr' after the split)

Below, we will implement both approaches. Ultimately, we decided to take the second approach to lose as little data as possible.

In [None]:
# Contrast the number of encounter rows with the number of distinct patients
# to see how many encounters belong to a patient who is already in the dataset.
number_encounters = data.shape[0]
unique_patients = data["patient_nbr"].nunique(dropna=False)
print(
  f"Number of unique patients: {unique_patients}",
  f"Number of encounters: {number_encounters}",
  f"Number of encounters in which a patient reappears in the dataset: {number_encounters-unique_patients}",
  sep="\n",
)

Number of unique patients: 71518
Number of encounters: 101766
Number of encounters in which a patient reappears in the dataset: 30248


Roughly 30% of the encounters in the current dataset belong to patients who already appear in it. Therefore, when dropping the encounters of reappearing patients, we will lose about 30% of the data.

In [None]:
# Count the encounters of every patient, then summarise how those counts are distributed.
patient_encounters = data.groupby("patient_nbr").size()
max_encounters = patient_encounters.max()
avg_encounters = patient_encounters.mean()
median_encounters = patient_encounters.median()

# every patient that reaches the maximum count (a list, because ties are possible)
reaches_maximum = patient_encounters.eq(max_encounters)
patients_with_max_encounters = patient_encounters.loc[reaches_maximum].index.tolist()

# number of patients that occur exactly once
single_encounter = patient_encounters.eq(1).sum()

print(
  f"Maximum encounters of a patient: {max_encounters}",
  f"Patient with {max_encounters} encounters: {patients_with_max_encounters}",
  f"Average encounters of a patient: {avg_encounters}",
  f"Median encounters of a patient: {median_encounters}",
  f"Number of patients with one encounter: {single_encounter}",
  sep="\n",
)

Maximum encounters of a patient: 40
Patient with 40 encounters: [88785891.0]
Average encounters of a patient: 1.4229424760200229
Median encounters of a patient: 1.0
Number of patients with one encounter: 54745


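The average number of encounters per patient is about 1.4, but the median is 1: most patients (54,745 of 71,518) appear only once in this dataset, and the average is pulled up by a minority of patients with many encounters.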

In [None]:
# Show every encounter of the (first) patient with the highest encounter count.
data[data["patient_nbr"].eq(patients_with_max_encounters[0])]

Unnamed: 0,patient_nbr,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,discharge_disposition_id_grouped_HomeDischarge,discharge_disposition_id_grouped_HomeWFollowUp,discharge_disposition_id_grouped_InstitutionalCare,discharge_disposition_id_grouped_Unknown,admission_source_grouped_Births,admission_source_grouped_Data_Issues,admission_source_grouped_Emergency_or_Law,admission_source_grouped_Referrals,admission_source_grouped_Transfers,payer_code_labeled
38307,88785891.0,0.222222,0.0,0.306931,0.0,0.230769,0.0,0.0,0.0,0.090909,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0
40252,88785891.0,0.222222,0.0,0.089109,0.0,0.230769,0.25,0.0,0.5,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
40661,88785891.0,0.222222,0.333333,0.336634,0.0,0.358974,0.25,0.0,0.75,0.363636,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
44515,88785891.0,0.222222,0.083333,0.0,0.0,0.179487,0.25,0.0,0.128757,0.090909,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14.0
45147,88785891.0,0.222222,0.083333,0.306931,0.0,0.076923,0.25,0.0,0.128757,0.636364,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
45986,88785891.0,0.222222,0.25,0.069307,0.0,0.333333,0.25,0.0,0.128757,0.636364,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50167,88785891.0,0.222222,0.0,0.009901,0.0,0.205128,1.0,0.5,0.128757,0.454545,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50393,88785891.0,0.222222,0.25,0.623762,0.0,0.461538,1.0,0.5,0.128757,0.363636,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14.0
50773,88785891.0,0.222222,0.0,0.316832,0.0,0.230769,1.0,0.5,0.128757,0.363636,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
51519,88785891.0,0.222222,0.0,0.336634,0.0,0.230769,0.75,0.5,0.128757,0.181818,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Approach 1: Dropping reappearing patients

In [None]:
# Number of distinct patients, i.e. the number of rows that approach 1 keeps.
data["patient_nbr"].nunique(dropna=False)

71518

**Method 'get_split_for_unique_patients'**

In [None]:
def get_split_for_unique_patients(data, test_size=0.2, random_state=42):
  """Split the data into train and test parts using one encounter per patient.

  Only the first row of every 'patient_nbr' is kept; all later rows of the same
  patient are dropped. The 'patient_nbr' column is removed afterwards, so it is
  not part of the returned features.

  Parameters
  ----------
  data : pandas.DataFrame
    Must contain the columns 'patient_nbr' and 'readmitted'.
  test_size : float, default 0.2
    Passed to train_test_split; the default gives the 80-20 split.
  random_state : int or None, default 42
    Seed passed to train_test_split so that re-running the notebook yields the
    same split. Pass None to get a different split on every call.

  Returns
  -------
  X_train, X_test, y_train, y_test
    The result of train_test_split. X holds every column except 'readmitted',
    y is a single-column DataFrame holding 'readmitted'.
  """
  data_unique_patients = (
    data
    .drop_duplicates(subset="patient_nbr", keep="first")
    .drop(columns="patient_nbr")
  )

  target_value = ["readmitted"]
  features = [column for column in data_unique_patients.columns if column != "readmitted"]

  X = data_unique_patients[features]
  # selecting with a list keeps y a DataFrame rather than a Series
  y = data_unique_patients[target_value]

  return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [None]:
# Example: use the function to get the 80-20 split with only one encounter per patient
X_train, X_test, y_train, y_test = get_split_for_unique_patients(data)

In [None]:
# Total number of rows across the training and the test part of the split.
X_train.shape[0] + X_test.shape[0]

71518

### Approach 2: Sorting, splitting and mixing
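Steps:
1. Group the encounters by 'patient_nbr' (the implementation below uses scikit-learn's `GroupShuffleSplit` for this instead of sorting manually)
2. Split the dataset so that all encounters of a patient stay together
3. Ensure that no patient from training appears in test
4. Shuffle both datasets (otherwise the rows keep their original order)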

One open question remains: it should be analyzed whether having all encounters of a highly recurring patient in a single dataset has a negative impact on the results.

**Method 'get_split_for_sorted_patients'**

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.utils import shuffle

def get_split_for_sorted_patients(data, test_size=0.2, random_state=42):
  """Split the data so that all encounters of a patient land in the same part.

  The rows are grouped by 'patient_nbr' and split with GroupShuffleSplit. The
  'patient_nbr' column is dropped from the features afterwards and both parts
  are shuffled, with each X kept aligned to its y.

  Parameters
  ----------
  data : pandas.DataFrame
    Must contain the columns 'patient_nbr' and 'readmitted'.
  test_size : float, default 0.2
    Passed to GroupShuffleSplit; the default gives the 80-20 split.
  random_state : int or None, default 42
    Seed used for the group split and for the final shuffling.

  Returns
  -------
  X_train, X_test, y_train, y_test
    Features without 'readmitted' and 'patient_nbr', and the matching
    'readmitted' values.

  Raises
  ------
  ValueError
    If a patient number ends up in both the training and the test part.
  """
  # define data set and target value
  X = data.drop(columns=["readmitted"])
  y = data["readmitted"]

  # all encounters sharing a patient_nbr form one group that must not be separated
  groups = data["patient_nbr"]

  # n_splits=1 yields exactly one (train, test) pair of positional indices
  gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
  train_idx, test_idx = next(gss.split(X, y, groups=groups))
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

  # check whether patient_nbr appear in both datasets
  common_patient_nbrs = set(X_train["patient_nbr"]).intersection(X_test["patient_nbr"])
  print("common patient numbers:", common_patient_nbrs)
  if common_patient_nbrs:
    # fail loudly instead of implicitly returning None, which would only surface
    # as an unpacking error at the call site
    raise ValueError(
      f"{len(common_patient_nbrs)} patient number(s) appear in both the training and the test dataset"
    )

  # Remove patient_nbr column from both datasets
  X_train = X_train.drop(columns=["patient_nbr"])
  X_test = X_test.drop(columns=["patient_nbr"])

  # shuffle features and target in one call so the rows stay matched by
  # construction rather than by relying on two calls sharing a seed
  X_train, y_train = shuffle(X_train, y_train, random_state=random_state)
  X_test, y_test = shuffle(X_test, y_test, random_state=random_state)

  return X_train, X_test, y_train, y_test


In [None]:
# Example: use the function to get the 80-20 split in which no patient appears in both parts.
# NOTE: this reuses the names X_train, X_test, y_train, y_test and therefore replaces
# the split produced by get_split_for_unique_patients in the earlier example cell.
X_train, X_test, y_train, y_test = get_split_for_sorted_patients(data)

common patient numbers: set()


In [None]:
# Persist every part of the split. The context manager flushes and closes each
# file before the next one is written instead of leaving four handles open.
for file_name, split_part in (
  ("X_train.p", X_train),
  ("X_test.p", X_test),
  ("y_train.p", y_train),
  ("y_test.p", y_test),
):
  with open(file_name, "wb") as split_file:
    pickle.dump(split_part, split_file)
