# Baseline for diabetes readmission prediction

## Preprocessing

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('data/diabetic_data_no_na_diag.csv')

### Shift evolutive variables

In [3]:
df = df.sort_values(['patient_nbr', 'encounter_id'])

last_vars = [
    'diag_1',
    'diag_2',
    'diag_3',

    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

renamed_last_vars = ['last_' + v for v in last_vars]
df.loc[:, renamed_last_vars] = df.loc[:, last_vars].shift()
# Previous diagnoses during first encounters are managed later
df.loc[:, renamed_last_vars[:3]][df.patient_nbr != df.patient_nbr.shift()] = pd.NA
# The previous medication is set to 0 (NO)
df.loc[:, renamed_last_vars[3:]][df.patient_nbr != df.patient_nbr.shift()] = 0

### Drop irrelevant and identifier data

In [4]:
df = df.drop([
    'encounter_id', 
    'patient_nbr',
    'discharge_disposition_id',
    'admission_source_id'
], axis=1)

### Transform distributions

In [5]:
count_vars = [c for c in df.columns if 'num' in c]
x = df[count_vars].to_numpy()
x = (x - x.min(0)) / (x.max(0) - x.min(0)) * 1000 + 1
df[count_vars] = np.log(x)

### Encode diagnosis labels and cast to matrices

In [6]:
label_vars = [c for c in df.columns if 'diag_' in c]
target_var = 'readmitted'
other_vars = [c for c in df.columns if c not in label_vars + [target_var]]
others = df[other_vars].to_numpy()
labels = df[label_vars].to_numpy()
y = df[target_var].to_numpy()

label_to_emb = np.load('data/diag_embeddings.npy')
label_nans = np.isnan(labels)
labels[label_nans] = 0
embeddings = label_to_emb[labels.astype('int')]
embeddings[label_nans] = np.nan
embeddings = embeddings.reshape(embeddings.shape[0], -1)

X = np.concatenate([others, embeddings], 1)

### Simplify task

In [7]:
y = (y > 0).astype(int)

### Split data and discard data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
s = X_train.std(0) > 0
X_train, X_test = X_train[:, s], X_test[:, s]

### Normalize data

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Impute missings to center
We do not exclude NA in order not to exclude patients' first visits\
There are none in y

In [10]:
X_train = np.nan_to_num(X_train, nan=0)
X_test = np.nan_to_num(X_test, nan=0)

### Reduce dimensionality and uncorrelate features

In [11]:
pca = PCA(0.95)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

## Modeling

### Regression

In [12]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(X_train, y_train)

print('Train')
print(reg.score(X_train, y_train))
print('Test')
print(reg.score(X_test, y_test))

Train
0.6231984696818825
Test
0.6219243770143856


### KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

print('Train')
print(knn.score(X_train, y_train))
print('Test')
print(knn.score(X_test, y_test))

Train


[WinError 2] The system cannot find the file specified
  File "C:\Users\nilma\AppData\Roaming\Python\Python312\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Program Files\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


0.723756616529532
Test
0.5733432906218064


### Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

print('Train')
print(dt.score(X_train, y_train))
print('Test')
print(dt.score(X_test, y_test))

Train
1.0
Test
0.5444933574404528


### SVM

In [15]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train, y_train)

print('Train')
print(svm.score(X_train, y_train))
print('Test')
print(svm.score(X_test, y_test))

Train
0.6231591635658509
Test
0.6216099363257606


### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

print('Train')
print(rf.score(X_train, y_train))
print('Test')
print(rf.score(X_test, y_test))

Train
1.0
Test
0.6101721562770223


### MLP

In [17]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
mlp.fit(X_train, y_train)

print('Train')
print(mlp.score(X_train, y_train))
print('Test')
print(mlp.score(X_test, y_test))

Train
0.6896389078140559
Test
0.6030579356968792




## Conclusion
Es perd bastanta informació relativa a la medicació ja que majoirtàriament no es recepten i per tant no es pot entrenar el model perquè aprengui a utilitzar aquesta informació. Un bon enfocament seria conseguir representacions significatives de les madicacions.\
Com a punt de partida, funcionen millor mètodes senzills de classificació lineal com la regressió logística i SVM lineal. El punt de partida serà aquest 62% d'accuracy (tenint en compte la simplificació de la tasca), i queda clar que els models avançats s'hauràn d'ajustar i regularitzar amb cura per millorar els resultats.