In [86]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression

In [77]:
# load dataset
df = pd.read_pickle('next_wave_admission_prediction_corrected.pkl')

In [78]:
# drop columns
df.drop(columns=['first_admission'], inplace=True)
df.drop(columns=['hhidpn'], inplace=True)  # id info (not helpful for training)
df.drop(columns=['hhhid'], inplace=True)  # id info (not helpful for training)
df.drop(columns=['shhidpn'], inplace=True)  # id info (not helpful for training)
df.drop(columns=['rnhmmvy'], inplace=True)  # nursing home info
df.drop(columns=['rnhmmvm'], inplace=True)  # nursing home info
df.drop(columns=['rnhmday'], inplace=True)  # nursing home info
df.drop(columns=['snhmliv'], inplace=True)  # nursing home info
df.drop(columns=['hnhmliv'], inplace=True)  # nursing home info
df.drop(columns=['rnrshom'], inplace=True)  # nursing home info

df.drop(columns=['wave'], inplace=True)  # not relevent

columns_to_drop = ['raedegrm','raeduc', 'rabplacf', 'rarelig','rabplace', 'raestrat', 'rawtsamp', 'raestrat', 'raehsamp']
df.drop(columns=columns_to_drop, inplace=True)





In [79]:
# remove variables with >80% missing vals
threshold = 0.80  # 10% missing threshold
missing_percentage = df.isnull().mean()
columns_to_drop_missing = missing_percentage[missing_percentage > threshold].index.tolist()
df.drop(columns=columns_to_drop_missing, inplace=True)

In [80]:
print(df.shape)
unique_dtypes = set(df.dtypes.unique())
dtypes_counts = df.dtypes.value_counts()
print(unique_dtypes)
print(dtypes_counts)

for dtype in df.dtypes.unique():
    sample_values = df.select_dtypes(include=[dtype]).iloc[:, :5].head()  # Get first 5 columns (if many exist)
    print(f"\nData Type: {dtype}\n")
    print(sample_values, "\n" + "-"*50)




(38915, 1021)
{dtype('O'), dtype('bool'), dtype('int8'), dtype('float32'), dtype('float64'), dtype('int64')}

Data Type: float64

        sbmonth  sbyear  sbdate  smrct  smlen
0           NaN     NaN     NaN    NaN    NaN
127216      NaN     NaN     NaN    NaN    NaN
381647      9.0  1938.0 -7778.0    1.0   49.2
424053      1.0  1936.0 -8752.0    2.0   51.0
466459      NaN     NaN     NaN    NaN    NaN 
--------------------------------------------------

Data Type: object

            sbflag             sracem         shispan   sgender  \
0              NaN                NaN             NaN       NaN   
127216         NaN                NaN             NaN       NaN   
381647  0.mo/yr ok  1.white/caucasian  0.not hispanic  2.female   
424053  0.mo/yr ok  1.white/caucasian  0.not hispanic    1.male   
466459         NaN                NaN             NaN       NaN   

                         seduc  
0                          NaN  
127216                     NaN  
381647     5.college

In [81]:
# Set threshold
threshold = 0.8

# Step 3: Separate features and target
y = df['will_admit_next'].copy()
X = df.drop(columns=['will_admit_next'])


# Separate numerical and categorical features
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Process numerical columns (fill NaNs with median)
X_numerical = X[numerical_cols]
X_numerical_filled = X_numerical.fillna(X_numerical.median())

# Process categorical columns (label encode NaN as "missing")
X_categorical_encoded = pd.DataFrame(index=X.index)
for col in categorical_cols:
    filled_col = X[col].fillna('missing')  # Treat NaN as 'missing'
    filled_col = filled_col.astype(str)
    le = LabelEncoder()
    X_categorical_encoded[col] = le.fit_transform(filled_col)

# Combine numerical and categorical data
X_processed = pd.concat([X_numerical_filled, X_categorical_encoded], axis=1)

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_processed)

# Convert back to DataFrame for analysis
X_scaled_df = pd.DataFrame(X_scaled, columns=X_processed.columns)

# Print shape of final dataset
print(f"Processed Data Shape: {X_scaled_df.shape}")

# Get and print unique datatypes
unique_dtypes = X_scaled_df.dtypes.unique()
print(f"Unique Data Types: {unique_dtypes}")

# Print 5 examples per datatype
for dtype in unique_dtypes:
    print(f"\nExamples for {dtype}:")
    print(X_scaled_df.select_dtypes(include=[dtype]).head(5))

  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_transform(filled_col)
  X_categorical_encoded[col] = le.fit_tr

Processed Data Shape: (38915, 1005)
Unique Data Types: [dtype('float64')]

Examples for float64:
    sbmonth    sbyear    sbdate     smrct     smlen   smlenm   smcurln  \
0  0.101763  0.005296  0.007250 -0.387477  0.027207 -0.17905  0.050319   
1  0.101763  0.005296  0.007250 -0.387477  0.027207 -0.17905  0.050319   
2  0.868187 -0.528348 -0.511287 -0.387477  1.059650 -0.17905  1.005282   
3 -2.197509 -0.706229 -0.748402  1.351300  1.202603 -0.17905  1.154754   
4  0.101763  0.005296  0.007250 -0.387477  0.027207 -0.17905  0.050319   

      smdiv     smwid     smend  ...   hafmort  hafhmln   hpickhh    hifcap  \
0 -0.376954 -0.175068 -0.145869  ... -0.513783  -0.3485  0.536776  0.055773   
1 -0.376954 -0.175068 -0.145869  ... -0.513783  -0.3485  0.536776 -0.983695   
2 -0.376954 -0.175068 -0.145869  ... -0.513783  -0.3485  0.536776  1.095241   
3  1.539581 -0.175068 -0.145869  ... -0.513783  -0.3485 -1.563851  0.055773   
4 -0.376954 -0.175068 -0.145869  ... -0.513783  -0.3485  0.5367

In [87]:
# LOG REG WITH NO CLASS WEIGHT

# Step 1: Split data (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled_df, y, test_size=0.30, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

print(f"Train Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}")

# Step 2: Train Logistic Regression Model
logreg = LogisticRegression(max_iter=1000, solver='liblinear')  # Using liblinear for small datasets
logreg.fit(X_train, y_train)

# Step 3: Evaluate Model
y_val_pred = logreg.predict(X_val)
y_test_pred = logreg.predict(X_test)

# Step 4: Print Summary Statistics
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_test_pred))

# Step 5: Print Confusion Matrix
print("\nValidation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\nTest Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))



Train Shape: (27240, 1005), Validation Shape: (5837, 1005), Test Shape: (5838, 1005)

Validation Accuracy: 0.8805893438410142

Test Accuracy: 0.8705035971223022

Validation Classification Report:
               precision    recall  f1-score   support

       False       0.90      0.96      0.93      4868
        True       0.71      0.48      0.57       969

    accuracy                           0.88      5837
   macro avg       0.81      0.72      0.75      5837
weighted avg       0.87      0.88      0.87      5837


Test Classification Report:
               precision    recall  f1-score   support

       False       0.90      0.96      0.92      4868
        True       0.67      0.44      0.53       970

    accuracy                           0.87      5838
   macro avg       0.78      0.70      0.73      5838
weighted avg       0.86      0.87      0.86      5838


Validation Confusion Matrix:
 [[4679  189]
 [ 508  461]]

Test Confusion Matrix:
 [[4651  217]
 [ 539  431]]


In [88]:
# LOG REG WITH OPTIMAL CLASS WEIGHT

# Define logistic regression with class weighting
logistic_model = LogisticRegression(class_weight="balanced", random_state=42)

# Train the model
logistic_model.fit(X_train, y_train)

# Predictions
y_val_pred = logistic_model.predict(X_val)
y_test_pred = logistic_model.predict(X_test)

# Step 4: Print Summary Statistics
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_test_pred))

# Step 5: Print Confusion Matrix
print("\nValidation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\nTest Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Validation Accuracy: 0.7971560733253383

Test Accuracy: 0.7963343610825625

Validation Classification Report:
               precision    recall  f1-score   support

       False       0.95      0.79      0.87      4868
        True       0.44      0.81      0.57       969

    accuracy                           0.80      5837
   macro avg       0.70      0.80      0.72      5837
weighted avg       0.87      0.80      0.82      5837


Test Classification Report:
               precision    recall  f1-score   support

       False       0.95      0.80      0.87      4868
        True       0.44      0.79      0.56       970

    accuracy                           0.80      5838
   macro avg       0.69      0.80      0.72      5838
weighted avg       0.87      0.80      0.82      5838


Validation Confusion Matrix:
 [[3869  999]
 [ 185  784]]

Test Confusion Matrix:
 [[3878  990]
 [ 199  771]]
