# Train Random Forest Classifier on COVID-19 dataset from CDC

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# None removes the column cap, so wide DataFrames are displayed with every column visible.
pd.set_option('display.max_columns', None)

## Download the [csv file](https://catalog.data.gov/dataset/covid-19-case-surveillance-public-use-data).

## Prepare the dataset for model training. Replace Yes and No with 1 and 0, respectively.

## Replace age groups with integer values.
1. 0-9 years
2. 10-19 years
3. 20-29 years
4. 30-39 years
5. 40-49 years
6. 50-59 years
7. 60-69 years
8. 70-79 years
9. 80+ years

## Replace current_status with integer values.
0. Probable Case
1. Laboratory-confirmed case

In [4]:
# Load the pre-encoded case-surveillance CSV, then preview the resulting frame.
csv_path = 'COVID-19_Case_Surveillance_Public_Use_Data_death_med_all_ai.csv'
df_death_med_all = pd.read_csv(csv_path)
df_death_med_all

Unnamed: 0.1,Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,45,2021/04/15,2021/08/31,2021/04/15,,1,0,3,"American Indian/Alaska Native, Non-Hispanic",0,0,0,0
1,750,2020/11/09,2020/11/23,2020/11/18,2020/11/09,1,0,3,"American Indian/Alaska Native, Non-Hispanic",0,0,0,0
2,759,2020/11/01,2020/11/17,2020/11/12,2020/11/01,1,0,3,"American Indian/Alaska Native, Non-Hispanic",0,0,0,0
3,1015,2020/08/19,2020/08/24,2020/08/19,2020/08/19,1,0,3,"American Indian/Alaska Native, Non-Hispanic",0,0,0,0
4,1203,2021/12/19,2021/12/19,2021/12/19,2021/12/19,0,0,3,"American Indian/Alaska Native, Non-Hispanic",1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480740,103534708,2021/11/05,2021/11/16,2021/11/05,2021/11/05,1,0,3,"American Indian/Alaska Native, Non-Hispanic",1,0,0,1
1480741,103534714,2020/11/07,2020/11/16,2020/11/13,2020/11/07,1,0,3,"American Indian/Alaska Native, Non-Hispanic",0,0,0,0
1480742,103534737,2022/07/26,2022/07/30,2022/07/28,2022/07/26,1,0,3,"American Indian/Alaska Native, Non-Hispanic",0,0,0,1
1480743,103534799,2020/03/26,2020/04/08,2020/03/30,2020/03/26,1,0,3,"American Indian/Alaska Native, Non-Hispanic",0,0,0,0


## Import the AI libraries

In [5]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle

## Extract features and labels from the dataframe. Features: age group, medical condition, case status, hospitalization, sex, and ICU admission. Label: death.

In [6]:
# Feature matrix (six encoded columns) and target column for the death outcome.
death_feature_columns = ["age_group","medcond_yn","current_status","hosp_yn","sex","icu_yn"]
X = df_death_med_all.loc[:, death_feature_columns]
y = df_death_med_all["death_yn"]

## Split dataset into train and test data for hospitalization

In [7]:
# Hospitalization task: predict hosp_yn from the four columns listed below.
hosp_feature_columns = ["age_group","medcond_yn","current_status","sex"]
X_hosp = df_death_med_all.loc[:, hosp_feature_columns]
y_hosp = df_death_med_all["hosp_yn"]

In [8]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and every score computed from it) identical on each Restart & Run All.
X_hosp_train, X_hosp_test, y_hosp_train, y_hosp_test = train_test_split(
    X_hosp, y_hosp, test_size=0.2, random_state=42
)

## Train the model for hospitalization

In [10]:
# Reuse the cached hospitalization model when the file exists; otherwise train a
# random forest and cache it, so the notebook also runs from a fresh checkout.
# The files are opened in `with` blocks so the handles are always closed.
# NOTE: pickle.load can execute arbitrary code -- only load a model file you created yourself.
# NOTE(review): a cached model was fitted on whatever split existed when it was saved;
# delete hosp_model.obj to retrain on the current training split.
try:
    with open('hosp_model.obj', 'rb') as model_file:
        model_hosp = pickle.load(model_file)
except FileNotFoundError:
    model_hosp = RandomForestClassifier(random_state=42)
    model_hosp.fit(X_hosp_train, y_hosp_train)
    with open("hosp_model.obj", "wb") as model_file:
        pickle.dump(model_hosp, model_file)

## Test the hospitalization model

In [11]:
# Score the hospitalization model on the held-out rows and display the result.
hosp_test_score = model_hosp.score(X_hosp_test, y_hosp_test)
hosp_test_score

0.8382874836653171

In [12]:
# Single query: age group 9, medical condition present, laboratory-confirmed case, sex=1.
hosp_case = pd.DataFrame({"age_group":[9],"medcond_yn":[1],"current_status":[1],"sex":[1]})
probability = model_hosp.predict_proba(hosp_case)
# Share of the second probability column within the row, expressed as a percentage.
hosp_negative, hosp_positive = probability[0,0], probability[0,1]
hosp_probaility_percent = hosp_positive/(hosp_negative+hosp_positive)*100
hosp_probaility_percent

69.16435134088039

## Split ICU dataset into train and test

In [13]:
# ICU task: predict icu_yn from the hospitalization features plus hosp_yn.
icu_feature_columns = ["age_group","medcond_yn","current_status","sex","hosp_yn"]
X_icu = df_death_med_all.loc[:, icu_feature_columns]
y_icu = df_death_med_all["icu_yn"]

In [14]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and every score computed from it) identical on each Restart & Run All.
X_icu_train, X_icu_test, y_icu_train, y_icu_test = train_test_split(
    X_icu, y_icu, test_size=0.2, random_state=42
)

## Train the model on ICU dataset

In [28]:
# Reuse the cached ICU model when the file exists; otherwise train a logistic
# regression and cache it, so the notebook also runs from a fresh checkout.
# The files are opened in `with` blocks so the handles are always closed.
# NOTE: pickle.load can execute arbitrary code -- only load a model file you created yourself.
# NOTE(review): a cached model was fitted on whatever split existed when it was saved;
# delete icu_model.obj to retrain on the current training split.
try:
    with open('icu_model.obj', 'rb') as model_file:
        model_icu = pickle.load(model_file)
except FileNotFoundError:
    model_icu = LogisticRegression()
    model_icu.fit(X_icu_train, y_icu_train)
    with open("icu_model.obj", "wb") as model_file:
        pickle.dump(model_icu, model_file)

## Test the ICU model

In [29]:
# Score the ICU model on the held-out rows and display the result.
icu_test_score = model_icu.score(X_icu_test, y_icu_test)
icu_test_score

0.9420764547575714

In [30]:
# Single query: hospitalized, age group 9, medical condition present, laboratory-confirmed case, sex=1.
icu_case = pd.DataFrame({"age_group":[9],"medcond_yn":[1],"current_status":[1],"sex":[1],"hosp_yn":[1]})
probability = model_icu.predict_proba(icu_case)
# Share of the second probability column within the row, expressed as a percentage.
icu_negative, icu_positive = probability[0,0], probability[0,1]
hosp_icu_percent = icu_positive/(icu_negative+icu_positive)*100
hosp_icu_percent

38.69413695232536

## Split the dataset into test and train for death

In [31]:
# Death task: predict death_yn from the ICU features plus icu_yn.
death_model_columns = ["age_group","medcond_yn","current_status","sex","hosp_yn","icu_yn"]
X_death = df_death_med_all.loc[:, death_model_columns]
y_death = df_death_med_all["death_yn"]

In [32]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and every score computed from it) identical on each Restart & Run All.
X_death_train, X_death_test, y_death_train, y_death_test = train_test_split(
    X_death, y_death, test_size=0.2, random_state=42
)

In [34]:
# Reuse the cached death model when the file exists; otherwise train a random
# forest and cache it, so the notebook also runs from a fresh checkout.
# The files are opened in `with` blocks so the handles are always closed.
# NOTE: pickle.load can execute arbitrary code -- only load a model file you created yourself.
# NOTE(review): a cached model was fitted on whatever split existed when it was saved;
# delete death_model.obj to retrain on the current training split.
try:
    with open("death_model.obj", "rb") as model_file:
        model_death = pickle.load(model_file)
except FileNotFoundError:
    model_death = RandomForestClassifier(random_state=42)
    model_death.fit(X_death_train, y_death_train)
    with open("death_model.obj", "wb") as model_file:
        pickle.dump(model_death, model_file)

In [35]:
# Score the death model on the held-out rows and display the result.
death_test_score = model_death.score(X_death_test, y_death_test)
death_test_score

0.9567278633390625

## Get the death probability for a hospitalized ICU patient aged 40-49 years with no pre-existing condition

In [36]:
# Hospitalized ICU case, age group 5, no medical condition, laboratory-confirmed, sex=0.
case_40s_no_medcond = pd.DataFrame(
    [{"age_group":5,"medcond_yn":0,"current_status":1,"sex":0,"hosp_yn":1,"icu_yn":1}]
)
model_death.predict_proba(case_40s_no_medcond)[0,1]

0.2723472274887319

## Get the death probability for a hospitalized ICU patient aged 40-49 years with a pre-existing condition

In [37]:
# Hospitalized ICU case, age group 5, medical condition present, laboratory-confirmed, sex=0.
case_40s_medcond = pd.DataFrame(
    [{"age_group":5,"medcond_yn":1,"current_status":1,"sex":0,"hosp_yn":1,"icu_yn":1}]
)
model_death.predict_proba(case_40s_medcond)[0,1]

0.4075769985072046

## Get the death probability for a hospitalized ICU patient aged 70-79 years with no pre-existing condition

In [38]:
# Hospitalized ICU case, age group 8, no medical condition, laboratory-confirmed, sex=1.
case_70s_no_medcond = pd.DataFrame(
    [{"age_group":8,"medcond_yn":0,"current_status":1,"sex":1,"hosp_yn":1,"icu_yn":1}]
)
model_death.predict_proba(case_70s_no_medcond)[0,1]

0.5918466945658848

## Get the death probability for a hospitalized ICU patient aged 70-79 years with a pre-existing condition

In [39]:
# Hospitalized ICU case, age group 8, medical condition present, laboratory-confirmed, sex=1.
case_70s_medcond = pd.DataFrame(
    [{"age_group":8,"medcond_yn":1,"current_status":1,"sex":1,"hosp_yn":1,"icu_yn":1}]
)
model_death.predict_proba(case_70s_medcond)[0,1]

0.6780175157703868

## Get the death probability for a hospitalized ICU patient aged 80+ with no pre-existing condition

In [40]:
# Hospitalized ICU case, age group 9, no medical condition, laboratory-confirmed, sex=0.
# NOTE(review): the matching "with pre-existing condition" query for age group 9 uses sex=1,
# so that pair differs in more than medcond_yn -- confirm which sex value was intended.
case_oldest_no_medcond = pd.DataFrame(
    [{"age_group":9,"medcond_yn":0,"current_status":1,"sex":0,"hosp_yn":1,"icu_yn":1}]
)
model_death.predict_proba(case_oldest_no_medcond)[0,1]

0.5850861231081558

## Get the death probability for a hospitalized ICU patient aged 80+ with a pre-existing condition

In [41]:
# Hospitalized ICU case, age group 9, medical condition present, laboratory-confirmed, sex=1.
case_oldest_medcond = pd.DataFrame(
    [{"age_group":9,"medcond_yn":1,"current_status":1,"sex":1,"hosp_yn":1,"icu_yn":1}]
)
model_death.predict_proba(case_oldest_medcond)[0,1]

0.7582120996842775

## Enter the values to test the model

In [56]:
# Inputs shared by all four queries: age group 9, medical condition present, laboratory-confirmed case.
# NOTE(review): the hospitalization and ICU queries use sex=0 while both death queries
# use sex=1 -- confirm whether a single patient profile was intended.
scenario_base = {"age_group":[9],"medcond_yn":[1],"current_status":[1]}

hosp_inputs = pd.DataFrame({**scenario_base, "sex":[0]})
icu_inputs = pd.DataFrame({**scenario_base, "sex":[0], "hosp_yn":[1]})
death_inputs = pd.DataFrame({**scenario_base, "sex":[1], "hosp_yn":[1], "icu_yn":[0]})
death_icu_inputs = pd.DataFrame({**scenario_base, "sex":[1], "hosp_yn":[1], "icu_yn":[1]})

hosp_probability = model_hosp.predict_proba(hosp_inputs)
icu_probability = model_icu.predict_proba(icu_inputs)
death_probability = model_death.predict_proba(death_inputs)
death_icu_probability = model_death.predict_proba(death_icu_inputs)
hosp_probability, icu_probability, death_probability, death_icu_probability

(array([[0.41271842, 0.58728158]]),
 array([[0.69999928, 0.30000072]]),
 array([[0.63401599, 0.36598401]]),
 array([[0.2417879, 0.7582121]]))