In [1]:
%%capture
%pip install pycaret fastparquet
%pip install numpy pandas scikit-learn xgboost lightgbm

import logging
import traceback

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
# Load covid_trends data from a Parquet file
covid_trends = pd.read_parquet("../data/covid_modeling.parquet")

print(covid_trends.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267298 entries, 0 to 267297
Columns: 407 entries, Activitylimitationduetoarthritisamongadultsagedge18yearswhohavedoctor-diagnosedarthritis_Age-adjustedPrevalence to ed_trends_covid_outcome
dtypes: UInt32(1), bool(1), float64(395), int64(7), object(3)
memory usage: 827.5+ MB
None


## Class Distribution

In [3]:
# Count of positive (1) and negative (0) cases for COVID trends
covid_class_counts = covid_trends['ed_trends_covid_outcome'].value_counts()
print("COVID Trends Class Distribution:\n", covid_class_counts)

COVID Trends Class Distribution:
 ed_trends_covid_outcome
0    197812
1     69486
Name: count, dtype: int64


This is balanced enough to not need sampling.

In [4]:
covid_trends.head()

Unnamed: 0,Activitylimitationduetoarthritisamongadultsagedge18yearswhohavedoctor-diagnosedarthritis_Age-adjustedPrevalence,Activitylimitationduetoarthritisamongadultsagedge18yearswhohavedoctor-diagnosedarthritis_CrudePrevalence,Adultsagedge18yearswitharthritiswhohavetakenaclasstolearnhowtomanagearthritissymptoms_Age-adjustedPrevalence,Adultsagedge18yearswitharthritiswhohavetakenaclasstolearnhowtomanagearthritissymptoms_CrudePrevalence,Alcoholuseamongyouth_CrudePrevalence,Allteethlostamongadultsagedge65years_Age-adjustedPrevalence,Allteethlostamongadultsagedge65years_CrudePrevalence,Arthritisamongadultsagedge18years_Age-adjustedPrevalence,Arthritisamongadultsagedge18years_CrudePrevalence,Arthritisamongadultsagedge18yearswhoareobese_Age-adjustedPrevalence,...,Season,IsHolidayWeek,lag_ed_trends_influenza,lag_ed_trends_rsv,lag_ed_trends_covid,WeekNumber_sin,WeekNumber_cos,Month_sin,Month_cos,ed_trends_covid_outcome
0,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Fall,False,1.0,1.0,1.0,-0.992709,0.120537,-0.8660254,0.5,0
1,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Fall,False,2.0,2.0,1.0,-0.568065,0.822984,-0.5,0.866025,1
2,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Winter,False,2.0,2.0,2.0,-0.354605,0.935016,-2.449294e-16,1.0,1
3,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Winter,False,2.0,2.0,2.0,-0.120537,0.992709,-2.449294e-16,1.0,0
4,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Winter,False,0.0,0.0,0.0,0.120537,0.992709,0.5,0.866025,0


In [5]:
categorical_features = covid_trends.select_dtypes(include=['object', 'category']).columns.tolist()

print("Automatically detected categorical features:", categorical_features)

Automatically detected categorical features: ['DIVISION', 'week_end', 'Season']


In [6]:
covid_trends[['DIVISION', 'week_end', 'Season']]

Unnamed: 0,DIVISION,week_end,Season
0,Pacific,2022-10-08,Fall
1,Pacific,2022-11-26,Fall
2,Pacific,2022-12-10,Winter
3,Pacific,2022-12-24,Winter
4,Pacific,2023-01-07,Winter
...,...,...,...
267293,South Atlantic,2024-08-17,Summer
267294,South Atlantic,2024-10-12,Fall
267295,South Atlantic,2024-11-16,Fall
267296,South Atlantic,2024-11-23,Fall


# COVID Trends Modeling

In [7]:
feature_cols = [col for col in covid_trends.columns if col not in ['week_end', 'ed_trends_covid_outcome']]
target_col = 'ed_trends_covid_outcome'

X = covid_trends[feature_cols]
y = covid_trends[target_col]

In [8]:
categorical_features = ['DIVISION', 'Season']
numeric_features = [col for col in X.columns if col not in categorical_features]

#experiment = ModelExperiment(X, y, numeric_features, categorical_features)

In [11]:
# impute missing with median then scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# For categorical features: impute missing with mode then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Combine the transformations
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Define Models ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Penalized Logistic Regression": LogisticRegression(max_iter=1000, solver='liblinear', penalty='l1'),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier()
}

results = {}

for name, model in models.items():
    try:
        logging.info(f"Starting training for {name}...")
        # Create pipeline that includes preprocessing and model training
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        
        # Train model
        pipeline.fit(X_train, y_train)
        logging.info(f"Finished training {name}.")
        
        # Evaluate performance
        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        results[name] = {'accuracy': acc, 'classification_report': report}
        
        logging.info(f"{name} Accuracy: {acc:.4f}")
        logging.info(f"{name} Classification Report:\n{report}")
        
    except Exception as e:
        logging.error(f"Error during training of {name}: {e}")
        logging.error(traceback.format_exc())

print("Summary of Model Performance:")
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(metrics['classification_report'])
    print("-" * 80)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 55704, number of negative: 158134
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.118306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32323
[LightGBM] [Info] Number of data points in the train set: 213838, number of used features: 415
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260496 -> initscore=-1.043391
[LightGBM] [Info] Start training from score -1.043391
Summary of Model Performance:
Model: Logistic Regression
Accuracy: 0.8309
              precision    recall  f1-score   support

           0       0.87      0.90      0.89     39678
           1       0.69      0.62      0.65     13782

    accuracy                           0.83     53460
   macro avg       0.78      0.76      0.77     53460
weighted avg       0.83      0.83      0.83     53460

--------------------------------------------------------------------------------
Model: Penalized Log

# Fine-tune Best Performer
XGBoost performed best (surprise surpise). We will create a tuned XGBoost model for COVID trend predictions.

In [14]:
# --- Preprocessing Pipeline ---
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- XGBoost Pipeline ---
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# --- Simplified Hyperparameter Grid ---
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5],
    'classifier__learning_rate': [0.1, 0.2]
}

# Use 3-fold cross-validation and single core (n_jobs=1) to keep computation light
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    cv=3,
    scoring='f1',  # Evaluate based on f1 score
    n_jobs=1,
    verbose=1
)

# --- Fine-Tune the Model ---
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid_search.best_params_)
print("Best cross-validation f1 score: {:.4f}".format(grid_search.best_score_))

# --- Evaluate on Test Set ---
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)

print("Test f1 Score: {:.4f}".format(f1_score(y_test, y_pred)))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best hyperparameters found:
{'classifier__learning_rate': 0.2, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}
Best cross-validation f1 score: 0.8553
Test f1 Score: 0.8595
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95     39678
           1       0.88      0.84      0.86     13782

    accuracy                           0.93     53460
   macro avg       0.91      0.90      0.91     53460
weighted avg       0.93      0.93      0.93     53460



# Save COVID Model

In [16]:
final_model = grid_search.best_estimator_

# Fit the best model on the entire dataset
final_model.fit(X, y)

# save model for later predictions
import joblib
joblib.dump(final_model, '../models/best_COVID_model.pkl')

print("Final model trained on entire dataset and saved successfully!")

Parameters: { "use_label_encoder" } are not used.



Final model trained on entire dataset and saved successfully!


# RSV Modeling

We will fit an XGBoost model for RSV and Flu trends as well

In [19]:
rsv_trends = pd.read_parquet("../data/rsv_modeling.parquet")
feature_cols = [col for col in rsv_trends.columns if col not in ['week_end', 'ed_trends_rsv_outcome']]
target_col = 'ed_trends_rsv_outcome'

X = rsv_trends[feature_cols]
y = rsv_trends[target_col]

categorical_features = ['DIVISION', 'Season']
numeric_features = [col for col in X.columns if col not in categorical_features]

# --- Preprocessing Pipeline ---
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- XGBoost Pipeline ---
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# --- Simplified Hyperparameter Grid ---
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5],
    'classifier__learning_rate': [0.1, 0.2]
}

# Use 3-fold cross-validation and single core (n_jobs=1) to keep computation light
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    cv=3,
    scoring='f1',  # Evaluate based on f1 score
    n_jobs=1,
    verbose=1
)

# --- Fine-Tune the Model ---
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid_search.best_params_)
print("Best cross-validation f1 score: {:.4f}".format(grid_search.best_score_))

# --- Evaluate on Test Set ---
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)

print("Test f1 Score: {:.4f}".format(f1_score(y_test, y_pred)))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best hyperparameters found:
{'classifier__learning_rate': 0.2, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}
Best cross-validation f1 score: 0.9000
Test f1 Score: 0.9057
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     43226
           1       0.91      0.90      0.91     10234

    accuracy                           0.96     53460
   macro avg       0.94      0.94      0.94     53460
weighted avg       0.96      0.96      0.96     53460



# Save RSV Model

In [20]:
final_model = grid_search.best_estimator_

# Fit the best model on the entire dataset
final_model.fit(X, y)

# save model for later predictions
import joblib
joblib.dump(final_model, '../models/best_RSV_model.pkl')

print("Final model trained on entire dataset and saved successfully!")

Parameters: { "use_label_encoder" } are not used.



Final model trained on entire dataset and saved successfully!


# Flu Modeling

In [None]:
flu_trends = pd.read_parquet("../data/flu_modeling.parquet")
feature_cols = [col for col in flu_trends.columns if col not in ['week_end', 'ed_trends_influenza_outcome']]
target_col = 'ed_trends_influenza_outcome'

X = flu_trends[feature_cols]
y = flu_trends[target_col]

categorical_features = ['DIVISION', 'Season']
numeric_features = [col for col in X.columns if col not in categorical_features]

# --- Preprocessing Pipeline ---
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- XGBoost Pipeline ---
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# --- Simplified Hyperparameter Grid ---
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5],
    'classifier__learning_rate': [0.1, 0.2]
}

# Use 3-fold cross-validation and single core (n_jobs=1) to keep computation light
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    cv=3,
    scoring='f1',  # Evaluate based on f1 score
    n_jobs=1,
    verbose=1
)

# --- Fine-Tune the Model ---
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid_search.best_params_)
print("Best cross-validation f1 score: {:.4f}".format(grid_search.best_score_))

# --- Evaluate on Test Set ---
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)

print("Test f1 Score: {:.4f}".format(f1_score(y_test, y_pred)))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.



In [None]:
final_model = grid_search.best_estimator_

# Fit the best model on the entire dataset
final_model.fit(X, y)

# save model for later predictions
import joblib
joblib.dump(final_model, '../models/best_influenza_model.pkl')

print("Final model trained on entire dataset and saved successfully!")

Unnamed: 0,Activitylimitationduetoarthritisamongadultsagedge18yearswhohavedoctor-diagnosedarthritis_Age-adjustedPrevalence,Activitylimitationduetoarthritisamongadultsagedge18yearswhohavedoctor-diagnosedarthritis_CrudePrevalence,Adultsagedge18yearswitharthritiswhohavetakenaclasstolearnhowtomanagearthritissymptoms_Age-adjustedPrevalence,Adultsagedge18yearswitharthritiswhohavetakenaclasstolearnhowtomanagearthritissymptoms_CrudePrevalence,Alcoholuseamongyouth_CrudePrevalence,Allteethlostamongadultsagedge65years_Age-adjustedPrevalence,Allteethlostamongadultsagedge65years_CrudePrevalence,Arthritisamongadultsagedge18years_Age-adjustedPrevalence,Arthritisamongadultsagedge18years_CrudePrevalence,Arthritisamongadultsagedge18yearswhoareobese_Age-adjustedPrevalence,...,Season,IsHolidayWeek,lag_ed_trends_influenza,lag_ed_trends_rsv,lag_ed_trends_covid,WeekNumber_sin,WeekNumber_cos,Month_sin,Month_cos,ed_trends_influenza_outcome
0,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Fall,False,1.0,1.0,1.0,-0.992709,0.120537,-8.660254e-01,0.500000,1
1,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Fall,False,2.0,2.0,1.0,-0.568065,0.822984,-5.000000e-01,0.866025,1
2,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Winter,False,2.0,2.0,2.0,-0.354605,0.935016,-2.449294e-16,1.000000,1
3,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Winter,False,2.0,2.0,2.0,-0.120537,0.992709,-2.449294e-16,1.000000,0
4,49.9,44.6,23.2,20.2,,11.8,11.2,22.3,25.4,29.5,...,Winter,False,0.0,0.0,0.0,0.120537,0.992709,5.000000e-01,0.866025,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267293,41.6,39.4,11.2,13.3,24.1,9.3,8.9,22.3,25.0,28.4,...,Summer,False,1.0,1.0,2.0,-0.748511,-0.663123,-8.660254e-01,-0.500000,0
267294,41.6,39.4,11.2,13.3,24.1,9.3,8.9,22.3,25.0,28.4,...,Fall,False,1.0,1.0,1.0,-0.970942,0.239316,-8.660254e-01,0.500000,1
267295,41.6,39.4,11.2,13.3,24.1,9.3,8.9,22.3,25.0,28.4,...,Fall,False,2.0,2.0,0.0,-0.663123,0.748511,-5.000000e-01,0.866025,1
267296,41.6,39.4,11.2,13.3,24.1,9.3,8.9,22.3,25.0,28.4,...,Fall,False,2.0,2.0,1.0,-0.568065,0.822984,-5.000000e-01,0.866025,1
