## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from joblib import dump, load

## Read Dataset

In [87]:
covid = pd.read_csv("RiskPredictionArtifacts/fhir_synthea_merged_df.csv")
print(covid.shape)
covid.head()

(12452, 216)


Unnamed: 0,id,gender,age,city,state,race,ethnicity,death,actualCodeValue,cause of death,...,Has a criminal record (finding),Not in labor force (finding),Limited social contact (finding),Unemployed (finding),Unhealthy alcohol drinking behavior (finding),Risk activity involvement (finding),Reports of violence in the environment (finding),Severe anxiety (panic) (finding),Only received primary school education (finding),Refugee (person)
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,M,6,Springfield,Massachusetts,white,nonhispanic,0,0,0,...,,,,,,,,,,
1,067318a4-db8f-447f-8b6e-f2f61e9baaa5,F,7,Walpole,Massachusetts,white,nonhispanic,0,0,0,...,,,,,,,,,,
2,ae9efba3-ddc4-43f9-a781-f72019388548,M,31,Chicopee,Massachusetts,white,nonhispanic,0,0,0,...,,,,,,,,,,
3,199c586f-af16-4091-9998-ee4cfc02ee7a,F,20,Pembroke,Massachusetts,white,nonhispanic,0,0,0,...,,,,,,,,,,
4,353016ea-a0ff-4154-85bb-1cf8b6cedf20,M,27,Boston,Massachusetts,white,nonhispanic,0,0,0,...,,,,,,,,,,


In [88]:
threshold = len(covid) * 0.9
covid = covid.dropna(axis=1, thresh=threshold)

print(covid.shape)

(12452, 192)


## Prepare Target Column

In [89]:
covid['cause of death'] = (covid['cause of death'] == "COVID-19").astype(int)

In [90]:
covid.head(50)

Unnamed: 0,id,gender,age,city,state,race,ethnicity,death,actualCodeValue,cause of death,...,Tear of meniscus of knee,Posttraumatic stress disorder,Female Infertility,Cystic Fibrosis,Diabetes from Cystic Fibrosis,Blindness due to type 2 diabetes mellitus (disorder),History of disarticulation at wrist (situation),Male Infertility,Acute Cholecystitis,Cholelithiasis
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,M,6,Springfield,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,067318a4-db8f-447f-8b6e-f2f61e9baaa5,F,7,Walpole,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ae9efba3-ddc4-43f9-a781-f72019388548,M,31,Chicopee,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,199c586f-af16-4091-9998-ee4cfc02ee7a,F,20,Pembroke,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,353016ea-a0ff-4154-85bb-1cf8b6cedf20,M,27,Boston,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,f58bf921-cba1-475a-b4f8-dc6fa3b8f89c,F,4,Colrain,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,b9fd2dd8-181b-494b-ab15-e9f286d668d9,M,40,Somerville,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,d22592ac-552f-4ecd-a63d-7663d77ce9ba,M,34,Chicopee,Massachusetts,white,hispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,28658715-b770-4576-9a81-fbb2282a98ea,F,32,Medfield,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,055ae6fc-7e18-4a39-8058-64082ca6d515,M,19,Springfield,Massachusetts,white,nonhispanic,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
covid = covid.rename(columns={'cause of death': 'risk'})

## Feature Engineering

In [92]:
# Drop irrelevant columns
covid = covid.drop(columns=['actualCodeValue', 'city'])

In [93]:
covid.head()

Unnamed: 0,id,gender,age,state,race,ethnicity,death,risk,Otitis media,Fever (finding),...,Tear of meniscus of knee,Posttraumatic stress disorder,Female Infertility,Cystic Fibrosis,Diabetes from Cystic Fibrosis,Blindness due to type 2 diabetes mellitus (disorder),History of disarticulation at wrist (situation),Male Infertility,Acute Cholecystitis,Cholelithiasis
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,M,6,Massachusetts,white,nonhispanic,0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,067318a4-db8f-447f-8b6e-f2f61e9baaa5,F,7,Massachusetts,white,nonhispanic,0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ae9efba3-ddc4-43f9-a781-f72019388548,M,31,Massachusetts,white,nonhispanic,0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,199c586f-af16-4091-9998-ee4cfc02ee7a,F,20,Massachusetts,white,nonhispanic,0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,353016ea-a0ff-4154-85bb-1cf8b6cedf20,M,27,Massachusetts,white,nonhispanic,0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# Covert gender column to numeric representation
gender_map = {'M': 0, 'F': 1}
covid['Gender_Numeric'] = covid['gender'].map(gender_map)
covid = covid.drop(columns=['gender'])
covid = covid.rename(columns={'Gender_Numeric': 'gender'})
covid.head()

Unnamed: 0,id,age,state,race,ethnicity,death,risk,Otitis media,Fever (finding),Suspected COVID-19,...,Posttraumatic stress disorder,Female Infertility,Cystic Fibrosis,Diabetes from Cystic Fibrosis,Blindness due to type 2 diabetes mellitus (disorder),History of disarticulation at wrist (situation),Male Infertility,Acute Cholecystitis,Cholelithiasis,gender
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,6,Massachusetts,white,nonhispanic,0,0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,067318a4-db8f-447f-8b6e-f2f61e9baaa5,7,Massachusetts,white,nonhispanic,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ae9efba3-ddc4-43f9-a781-f72019388548,31,Massachusetts,white,nonhispanic,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,199c586f-af16-4091-9998-ee4cfc02ee7a,20,Massachusetts,white,nonhispanic,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,353016ea-a0ff-4154-85bb-1cf8b6cedf20,27,Massachusetts,white,nonhispanic,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [95]:
# Encode state column
le = LabelEncoder()
covid['state'] = le.fit_transform(covid['state'])
covid['race'] = le.fit_transform(covid['race'])
covid['ethnicity'] = le.fit_transform(covid['ethnicity'])

In [96]:
covid.head()

Unnamed: 0,id,age,state,race,ethnicity,death,risk,Otitis media,Fever (finding),Suspected COVID-19,...,Posttraumatic stress disorder,Female Infertility,Cystic Fibrosis,Diabetes from Cystic Fibrosis,Blindness due to type 2 diabetes mellitus (disorder),History of disarticulation at wrist (situation),Male Infertility,Acute Cholecystitis,Cholelithiasis,gender
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,6,1,5,1,0,0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,067318a4-db8f-447f-8b6e-f2f61e9baaa5,7,1,5,1,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ae9efba3-ddc4-43f9-a781-f72019388548,31,1,5,1,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,199c586f-af16-4091-9998-ee4cfc02ee7a,20,1,5,1,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,353016ea-a0ff-4154-85bb-1cf8b6cedf20,27,1,5,1,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [97]:
# Define age buckets
bins = [0, 30, 40, 50, 60, 100]
labels = ['0-30', '31-40', '41-50', '51-60', '61+']

# Convert age to buckets
covid['Age_Bucket'] = pd.cut(covid['age'], bins=bins, labels=labels, right=False)

# Map age buckets to numerical values
age_bucket_map = {'0-30': 0, '31-40': 1, '41-50': 2, '51-60': 3, '61+': 4}
covid['Age_Numeric'] = covid['Age_Bucket'].map(age_bucket_map)
covid.head()

Unnamed: 0,id,age,state,race,ethnicity,death,risk,Otitis media,Fever (finding),Suspected COVID-19,...,Cystic Fibrosis,Diabetes from Cystic Fibrosis,Blindness due to type 2 diabetes mellitus (disorder),History of disarticulation at wrist (situation),Male Infertility,Acute Cholecystitis,Cholelithiasis,gender,Age_Bucket,Age_Numeric
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,6,1,5,1,0,0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0-30,0
1,067318a4-db8f-447f-8b6e-f2f61e9baaa5,7,1,5,1,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0-30,0
2,ae9efba3-ddc4-43f9-a781-f72019388548,31,1,5,1,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,31-40,1
3,199c586f-af16-4091-9998-ee4cfc02ee7a,20,1,5,1,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0-30,0
4,353016ea-a0ff-4154-85bb-1cf8b6cedf20,27,1,5,1,0,0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0-30,0


In [98]:
covid = covid.drop(columns=['age', 'Age_Bucket'])
covid = covid.rename(columns={'Age_Numeric': 'age'})

In [99]:
covid = covid.drop(columns=['COVID_START_DATE', 'COVID_END_DATE', 'death'])

In [100]:
records_with_nan = covid.isna().any(axis=1).sum()
records_with_nan

913

In [101]:
covid = covid.dropna()
covid.shape

(11539, 187)

In [102]:
covid['risk'].value_counts()

0    11229
1      310
Name: risk, dtype: int64

## Split the data into features and target


In [103]:
X = covid.drop('risk', axis=1)
y = covid['risk']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [104]:
y_train.value_counts()

0    8986
1     245
Name: risk, dtype: int64

In [105]:
# Drop irrelevant columns
X_train = X_train.drop(columns=['id'])

## Export X_test and y_test csv

In [106]:
X_test.to_csv('RiskPredictionArtifacts/X_test.csv', index=False)
y_test.to_csv('RiskPredictionArtifacts/y_test.csv', index=False)

In [108]:
X_test = X_test.drop(columns=['id'])

## Random Forest

In [109]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9922010398613518


## Logistic Regression

In [110]:
# Initialize the logistic regression model
model = LogisticRegression()

# Train the model on the training set
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# View detailed classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.9930675909878682
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2243
           1       0.92      0.83      0.87        65

    accuracy                           0.99      2308
   macro avg       0.96      0.91      0.93      2308
weighted avg       0.99      0.99      0.99      2308



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Save the Model

In [111]:
# Save the model to a file
dump(rf_model, 'RiskPredictionArtifacts/model.joblib')

['RiskPredictionArtifacts/model.joblib']