In [53]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, TensorDataset


In [54]:
# load dataset
df = pd.read_pickle('next_wave_admission_prediction_corrected.pkl')

In [55]:
# drop columns
print(df.shape)
df.drop(columns=['first_admission'], inplace=True)
df.drop(columns=['hhidpn'], inplace=True)  # id info (not helpful for training)
df.drop(columns=['hhhid'], inplace=True)  # id info (not helpful for training)
df.drop(columns=['shhidpn'], inplace=True)  # id info (not helpful for training)
df.drop(columns=['rnhmmvy'], inplace=True)  # nursing home info
df.drop(columns=['rnhmmvm'], inplace=True)  # nursing home info
df.drop(columns=['rnhmday'], inplace=True)  # nursing home info
# df.drop(columns=['snhmliv'], inplace=True)  # nursing home info
df.drop(columns=['hnhmliv'], inplace=True)  # nursing home info
df.drop(columns=['rnrshom'], inplace=True)  # nursing home info

df.drop(columns=['wave'], inplace=True)  # not relevent

columns_to_drop = ['rabplacf', 'raestrat', 'rawtsamp', 'raestrat', 'raehsamp']
df.drop(columns=columns_to_drop, inplace=True)

# find columns that end with f and drop them (these are flags, irrelevant to us)
f_columns = [col for col in df.columns if col.endswith('f')]
df_columns = df.drop(columns=f_columns)

print("Dropped the following cols ending with 'f':")
print(f_columns)

print(df.shape)




(38915, 1822)
Dropped the following cols ending with 'f':
['sbplacf', 'smstatf', 'sltactf', 'svgactf', 'shswrkf', 'scondef', 'shibpf', 'sdiabf', 'scancrf', 'slungf', 'sheartf', 'sstrokf', 'spsychf', 'sarthrf', 'shibpef', 'sdiabef', 'scancref', 'slungef', 'sheartef', 'sstrokef', 'spsychef', 'sarthref', 'slbrf', 'sjltenf', 'sinlbrf', 'sevbrnf', 'sjcpenf', 'sshltcf', 'scondsf', 'soopmdf', 'swork62f', 'swork65f', 'sliv75f', 'sliv85f', 'sliv8xf', 'sfsizef', 'sunionf', 'shltc5f', 'stotmdf', 'stotmbf', 'ssadlf', 'shlpdysf', 'shlpdysnf', 'shlphrsf', 'shlphrsnf', 'smemryef', 'smemryf', 'spuff', 'spmwghtf', 'slbsatwlf', 'salzheef', 'sdemenef', 'soopmdof', 'salzhef', 'sdemenf', 'swork70f', 'swork70af', 'ssleepef', 'rmstatf', 'rltactf', 'rvgactf', 'rhswrkf', 'rasleepef', 'rhibpf', 'rahibpef', 'rdiabf', 'radiabef', 'rcancrf', 'racancref', 'rlungf', 'ralungef', 'rheartf', 'raheartef', 'rstrokf', 'rastrokef', 'rpsychf', 'rapsychef', 'rarthrf', 'raarthref', 'ramemryef', 'raalzheef', 'rademenef', 'rcon

In [56]:
# remove variables with >90% missing vals
threshold = 0.90  # 10% missing threshold
missing_percentage = df.isnull().mean()
columns_to_drop_missing = missing_percentage[missing_percentage > threshold].index.tolist()
df.drop(columns=columns_to_drop_missing, inplace=True)

In [57]:
print(df.shape)
unique_dtypes = set(df.dtypes.unique())
dtypes_counts = df.dtypes.value_counts()
print(unique_dtypes)
print(dtypes_counts)

for dtype in df.dtypes.unique():
    sample_values = df.select_dtypes(include=[dtype]).iloc[:, :5].head() 
    print(f"\nData Type: {dtype}\n")
    print(sample_values, "\n" + "-"*50)




(38915, 1194)
{dtype('O'), dtype('int64'), dtype('float64'), dtype('bool'), dtype('float32'), dtype('int8')}
object     751
float64    424
int8        13
float32      4
int64        1
bool         1
Name: count, dtype: int64

Data Type: float64

        sbmonth  sbyear  sbdate  sdmonth  sdyear
0           NaN     NaN     NaN      NaN     NaN
127216      NaN     NaN     NaN      NaN     NaN
381647      9.0  1938.0 -7778.0     10.0  2015.0
424053      1.0  1936.0 -8752.0      8.0  2013.0
466459      NaN     NaN     NaN      NaN     NaN 
--------------------------------------------------

Data Type: object

            sbflag                          sdsrc             sracem  \
0              NaN                            NaN                NaN   
127216         NaN                            NaN                NaN   
381647  0.mo/yr ok  1.respondent's exit interview  1.white/caucasian   
424053  0.mo/yr ok  1.respondent's exit interview  1.white/caucasian   
466459         NaN          

In [58]:
# set threshold
threshold = 0.9

# separate features and target
y = df['will_admit_next'].copy()
X = df.drop(columns=['will_admit_next'])


# separate numerical and categorical features
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# process numerical columns (fill NaNs with median)
X_numerical = X[numerical_cols]
X_numerical_filled = X_numerical.fillna(X_numerical.median())

# process categorical columns (label encode NaN as "missing")
X_categorical_encoded = pd.DataFrame(index=X.index)
for col in categorical_cols:
    filled_col = X[col].fillna('missing')  # Treat NaN as 'missing'
    filled_col = filled_col.astype(str)
    le = LabelEncoder()
    X_categorical_encoded[col] = le.fit_transform(filled_col)

# combine numerical and categorical data
X_processed = pd.concat([X_numerical_filled, X_categorical_encoded], axis=1)

# scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_processed)

# convert back to df for analysis
X_scaled_df = pd.DataFrame(X_scaled, columns=X_processed.columns)

# Print shape of final dataset
#print(f"Processed Data Shape: {X_scaled_df.shape}")

# Get and print unique datatypes
unique_dtypes = X_scaled_df.dtypes.unique()
#print(f"Unique Data Types: {unique_dtypes}")

# Print 5 examples per datatype
#for dtype in unique_dtypes:
#    print(f"\nExamples for {dtype}:")
#    print(X_scaled_df.select_dtypes(include=[dtype]).head(5))

  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_tr

In [73]:
# LOG REG WITH NO CLASS WEIGHT

# split data (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled_df, y, test_size=0.30, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

print(f"Train Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}")

# train Logistic Regression Model
logreg = LogisticRegression(max_iter=1000)  # Using liblinear for small datasets
logreg.fit(X_train, y_train)

# evaluate Model
y_train_pred_base = logreg.predict(X_train)
y_val_pred_base = logreg.predict(X_val)
#y_test_pred = logreg.predict(X_test)

# compute F1-score
train_f1_base = f1_score(y_train, y_train_pred_base)
val_f1_base = f1_score(y_val, y_val_pred_base)

# print results
#print(f"Train F1-score: {train_f1:.4f}")
#print(f"Validation F1-score: {val_f1:.4f}")

"""
# Step 4: Print Summary Statistics
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))
#print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))
#print("\nTest Classification Report:\n", classification_report(y_test, y_test_pred))

# Step 5: Print Confusion Matrix
print("\nValidation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
#print("\nTest Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
"""

# Print results
print("\n BASE LOGISTIC REGRESSION (NO CLASS WEIGHT)")
print(f"Train F1-score: {train_f1_base:.4f}")
print(f"Validation F1-score: {val_f1_base:.4f}")
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred_base))
print("\nValidation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_base))

Train Shape: (27240, 1176), Validation Shape: (5837, 1176), Test Shape: (5838, 1176)

 BASE LOGISTIC REGRESSION (NO CLASS WEIGHT)
Train F1-score: 0.6110
Validation F1-score: 0.5616

Validation Classification Report:
               precision    recall  f1-score   support

       False       0.90      0.96      0.93      4868
        True       0.70      0.47      0.56       969

    accuracy                           0.88      5837
   macro avg       0.80      0.71      0.75      5837
weighted avg       0.87      0.88      0.87      5837


Validation Confusion Matrix:
 [[4669  199]
 [ 513  456]]


In [74]:
# LOG REG WITH OPTIMAL CLASS WEIGHT (class_weight = "balanced")
# this accounts for the more "false" than "true" values we have (~ 5 to 1 ratio)

# define logistic regression with class weighting
logistic_model = LogisticRegression(class_weight="balanced", max_iter = 1000, random_state=42)

# train the model
logistic_model.fit(X_train, y_train)

# evaluate Model
y_train_pred_balanced = logistic_model.predict(X_train)
y_val_pred_balanced = logistic_model.predict(X_val)
#y_test_pred = logistic_model.predict(X_test)

# compute F1-score
train_f1_balanced = f1_score(y_train, y_train_pred_balanced)
val_f1_balanced = f1_score(y_val, y_val_pred_balanced)

# print results
#print(f"Train F1-score: {train_f1:.4f}")
#print(f"Validation F1-score: {val_f1:.4f}")

"""
# print summary statistics
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))
#print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))
#print("\nTest Classification Report:\n", classification_report(y_test, y_test_pred))

# print confusion matrix
print("\nValidation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
#print("\nTest Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
"""
# print results
print("\n BALANCED LOGISTIC REGRESSION (CLASS WEIGHT='BALANCED') ")
print(f"Train F1-score: {train_f1_balanced:.4f}")
print(f"Validation F1-score: {val_f1_balanced:.4f}")
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred_balanced))
print("\nValidation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_balanced))

# get the coefficients of the logistic regression model
coefficients = logistic_model.coef_[0]

# create a DataFrame to store features and their corresponding coefficients
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': coefficients
})

# sort the features by the absolute value of the coefficients (importance)
feature_importance['Abs_Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance = feature_importance.sort_values(by='Abs_Coefficient', ascending=False)

# print the top 10 features
top_10_features = feature_importance.head(10)
print("\nTop 10 Features Based on Coefficients (Logistic Regression):")
print(top_10_features[['Feature', 'Coefficient']])



 BALANCED LOGISTIC REGRESSION (CLASS WEIGHT='BALANCED') 
Train F1-score: 0.6143
Validation F1-score: 0.5648

Validation Classification Report:
               precision    recall  f1-score   support

       False       0.95      0.80      0.87      4868
        True       0.44      0.80      0.56       969

    accuracy                           0.80      5837
   macro avg       0.69      0.80      0.72      5837
weighted avg       0.87      0.80      0.82      5837


Validation Confusion Matrix:
 [[3878  990]
 [ 198  771]]

Top 10 Features Based on Coefficients (Logistic Regression):
      Feature  Coefficient
512   sifwcmp     1.668869
201   rabyear    -1.533313
202   rabdate    -1.531698
508   sfslfme     1.517200
541     swork    -1.295138
1021   rtoilt    -1.163654
958   rcondsp    -1.160114
484   scancrq    -1.090113
483    sdiabq     1.087144
661     seata    -1.032895


In [75]:
# NN 2 layer
# LR = 0.01
# run for 50 epochs

# convert data to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# create DataLoaders
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=64, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=64, shuffle=False)
# test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=64, shuffle=False)  # 🔹 Test is commented out

# define a simple 2-layer nn
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.hidden = nn.Linear(input_size, 16)  # hidden layer with 16 neurons
        self.output = nn.Linear(16, 1)  # output layer (binary classification)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.hidden(x))
        x = self.sigmoid(self.output(x))
        return x

# initialize model, loss, and optimizer
model = SimpleNN(X_train.shape[1])
criterion = nn.BCELoss()  # binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.01)

# training
epochs = 50
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch).squeeze()
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # print loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss:.4f}")

# evaluation function (no test evaluation yet)
def evaluate(model, dataloader, y_true):
    model.eval()
    predictions = []
    with torch.no_grad():
        for X_batch, _ in dataloader:
            preds = model(X_batch).squeeze().numpy()
            predictions.extend(preds)

    predictions = [1 if p >= 0.5 else 0 for p in predictions]
    
    # compute and print metrics
    f1 = f1_score(y_true, predictions)
    print(f"F1-score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_true, predictions))
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, predictions))

# evaluate on validation set (test is commented out)
print("\n  Validation Results:")
evaluate(model, val_loader, y_val.to_numpy())

# print("\n  Test Results:")  #  Commented out for later
# evaluate(model, test_loader, y_test.numpy())

Epoch [10/50], Loss: 94.8294
Epoch [20/50], Loss: 77.3883
Epoch [30/50], Loss: 56.4657
Epoch [40/50], Loss: 57.7351
Epoch [50/50], Loss: 45.0967

  Validation Results:
F1-score: 0.5398

Classification Report:
               precision    recall  f1-score   support

       False       0.90      0.95      0.92      4868
        True       0.64      0.47      0.54       969

    accuracy                           0.87      5837
   macro avg       0.77      0.71      0.73      5837
weighted avg       0.86      0.87      0.86      5837


Confusion Matrix:
 [[4617  251]
 [ 518  451]]


In [76]:
# SMOTE + Log reg

# split data (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled_df, y, test_size=0.30, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

print(f"Train Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}")

# apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# print the class distribution after SMOTE
print("\nClass distribution after SMOTE (Training Set):")
print(pd.Series(y_train_resampled).value_counts())

# define logistic regression with class weighting
logistic_model_smote = LogisticRegression(max_iter = 1000, random_state=42)

# train the model
logistic_model_smote.fit(X_train_resampled, y_train_resampled)

# evaluate
y_train_pred_smote = logistic_model_smote.predict(X_train)
y_val_pred_smote = logistic_model_smote.predict(X_val)
#y_test_smote = logistic_model_smote.predict(X_test)

# F1-score
train_f1_smote = f1_score(y_train, y_train_pred_smote)
val_f1_smote = f1_score(y_val, y_val_pred_smote)

# print results
print("\n SMOTE LOGISTIC REGRESSION ")
print(f"Train F1-score: {train_f1_smote:.4f}")
print(f"Validation F1-score: {val_f1_smote:.4f}")
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred_smote))
print("\nValidation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_smote))

Train Shape: (27240, 1176), Validation Shape: (5837, 1176), Test Shape: (5838, 1176)

Class distribution after SMOTE (Training Set):
will_admit_next
False    22715
True     22715
Name: count, dtype: int64

 SMOTE LOGISTIC REGRESSION 
Train F1-score: 0.6121
Validation F1-score: 0.5587

Validation Classification Report:
               precision    recall  f1-score   support

       False       0.94      0.81      0.87      4868
        True       0.44      0.76      0.56       969

    accuracy                           0.80      5837
   macro avg       0.69      0.79      0.71      5837
weighted avg       0.86      0.80      0.82      5837


Validation Confusion Matrix:
 [[3928  940]
 [ 229  740]]


In [77]:
# SMOTE + NN

# split data (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled_df, y, test_size=0.30, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

print(f"Train Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}")

# apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_nn_smote, y_train_nn_smote = smote.fit_resample(X_train, y_train)

# print the class distribution after SMOTE
print("\nClass distribution after SMOTE (Training Set):")
print(pd.Series(y_train_resampled).value_counts())

# convert to tensors
X_train_tensor = torch.tensor(X_train_nn_smote.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_nn_smote.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)

# create DataLoaders for training and validation sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# define a 2-layer Neural Network Model
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.hidden = nn.Linear(input_size, 16)  # hidden layer with 16 neurons
        self.output = nn.Linear(16, 1)  # output layer (binary classification)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.hidden(x))
        x = self.sigmoid(self.output(x))
        return x

# initialize the model, loss function, and optimizer
model = SimpleNN(X_train.shape[1])
criterion = nn.BCELoss()  # binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.01)

# train
epochs = 50
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch).squeeze()
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

# evaluate the model
def evaluate(model, dataloader, y_true):
    model.eval()
    predictions = []
    with torch.no_grad():
        for X_batch, _ in dataloader:
            preds = model(X_batch).squeeze().numpy()
            predictions.extend(preds)

    predictions = [1 if p >= 0.5 else 0 for p in predictions]
    print(f"F1 Score: {f1_score(y_true, predictions):.4f}")
    print("\nClassification Report:\n", classification_report(y_true, predictions))
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, predictions))

# evaluate on training and validation sets
print("\n Training Results:")
evaluate(model, train_loader, y_train_tensor.numpy())

print("\n Validation Results:")
evaluate(model, val_loader, y_val_tensor.numpy())

Train Shape: (27240, 1176), Validation Shape: (5837, 1176), Test Shape: (5838, 1176)

Class distribution after SMOTE (Training Set):
will_admit_next
False    22715
True     22715
Name: count, dtype: int64

 Training Results:
F1 Score: 0.5019

Classification Report:
               precision    recall  f1-score   support

         0.0       0.50      0.49      0.50     22715
         1.0       0.50      0.51      0.50     22715

    accuracy                           0.50     45430
   macro avg       0.50      0.50      0.50     45430
weighted avg       0.50      0.50      0.50     45430


Confusion Matrix:
 [[11174 11541]
 [11239 11476]]

 Validation Results:
F1 Score: 0.5494

Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.90      0.91      4868
         1.0       0.53      0.57      0.55       969

    accuracy                           0.84      5837
   macro avg       0.72      0.73      0.73      5837
weighted avg       