# Train Random Forest Classifier on COVID-19 dataset from CDC

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# IPython line magic: select matplotlib's inline backend so figures are drawn in the notebook output.
%matplotlib inline

In [9]:
# Lift the column cap on DataFrame display so wide frames are shown without
# eliding any columns.
pd.options.display.max_columns = None

## Load the pre-processed dataset for AI. In `death_yn` and `medcond_yn`, Yes/No have been replaced with 1/0; age groups have been replaced with integer codes.

In [47]:
# Read the pre-processed CDC case-surveillance CSV into a DataFrame.
# The path is relative to the notebook's working directory.
df_death_med_all = pd.read_csv(
    filepath_or_buffer='datasets/COVID-19_Case_Surveillance_Public_Use_Data_death_med_all_ai.csv',
)
# Bare trailing expression: the notebook renders the frame as the cell output.
df_death_med_all

Unnamed: 0.1,Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,45,2021/04/15,2021/08/31,2021/04/15,,Laboratory-confirmed case,Female,3,"American Indian/Alaska Native, Non-Hispanic",No,No,0,0
1,49,2020/12/17,2020/12/19,2020/12/18,2020/12/17,Laboratory-confirmed case,Female,3,"American Indian/Alaska Native, Non-Hispanic",No,Missing,0,0
2,71,2021/03/03,2021/03/05,2021/03/05,2021/03/03,Probable Case,Female,3,"American Indian/Alaska Native, Non-Hispanic",No,Missing,0,0
3,79,2021/12/20,2021/12/27,2021/12/20,2021/12/20,Laboratory-confirmed case,Female,3,"American Indian/Alaska Native, Non-Hispanic",No,Missing,0,0
4,80,2021/04/13,2021/04/28,,2021/04/13,Laboratory-confirmed case,Female,3,"American Indian/Alaska Native, Non-Hispanic",No,Missing,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976169,103535071,2020/07/18,2020/07/26,2020/07/23,2020/07/18,Laboratory-confirmed case,Female,3,"American Indian/Alaska Native, Non-Hispanic",Missing,Missing,0,0
4976170,103535074,2020/06/28,2020/07/01,2020/06/29,2020/06/28,Laboratory-confirmed case,Female,3,"American Indian/Alaska Native, Non-Hispanic",No,Missing,0,1
4976171,103535123,2020/07/12,2020/07/18,2020/07/17,2020/07/12,Laboratory-confirmed case,Female,3,"American Indian/Alaska Native, Non-Hispanic",No,Missing,0,0
4976172,103535128,2020/07/02,2020/07/10,2020/07/04,2020/07/02,Laboratory-confirmed case,Female,3,"American Indian/Alaska Native, Non-Hispanic",No,Missing,0,1


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

## Extract features and labels from the data frame. Age group and medical condition are the features; `death_yn` is the label.

In [13]:
# Feature matrix: the integer-coded age group and the 0/1 medical-condition flag.
X = df_death_med_all[["age_group", "medcond_yn"]]
# Target vector: the 0/1 death indicator.
y = df_death_med_all.loc[:, "death_yn"]

## Split dataset into train and test data

In [14]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

## Train the model or load the pre-trained model.

In [18]:
# Load the pre-trained random forest if it is available; otherwise train one.
#
# The file is opened in a `with` block so the handle is closed as soon as
# unpickling finishes (the bare `pickle.load(open(...))` form never closes it).
#
# NOTE(review): unpickling can execute arbitrary code. Only load
# "rf_model.obj" when it comes from a trusted source.
try:
    with open("rf_model.obj", "rb") as model_file:
        rf_model = pickle.load(model_file)
except FileNotFoundError:
    # No saved model on disk: fit a fresh classifier on the training split,
    # as the "train the model" path of this section describes.
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)

## Get the predicted probability of death for age 40-49 years (`age_group` = 5) with no pre-existing condition

In [43]:
# Score a single hypothetical case: age_group 5, no medical condition.
# Column index 1 is the second entry of rf_model.classes_
# (presumably death_yn == 1 - confirm against rf_model.classes_).
rf_model.predict_proba(
    pd.DataFrame([[5, 0]], columns=["age_group", "medcond_yn"])
)[0, 1]

0.004232142143454183

## Get the predicted probability of death for age 40-49 years (`age_group` = 5) with a pre-existing condition

In [44]:
# Score a single hypothetical case: age_group 5, with a medical condition.
# Column index 1 is the second entry of rf_model.classes_
# (presumably death_yn == 1 - confirm against rf_model.classes_).
rf_model.predict_proba(
    pd.DataFrame([[5, 1]], columns=["age_group", "medcond_yn"])
)[0, 1]

0.018238575053557405

## Get the predicted probability of death for age 85+ (`age_group` = 9) with no pre-existing condition

In [39]:
# Score a single hypothetical case: age_group 9, no medical condition.
# Column index 1 is the second entry of rf_model.classes_
# (presumably death_yn == 1 - confirm against rf_model.classes_).
rf_model.predict_proba(
    pd.DataFrame([[9, 0]], columns=["age_group", "medcond_yn"])
)[0, 1]

0.17750733450516853

## Get the predicted probability of death for age 85+ (`age_group` = 9) with a pre-existing condition

In [46]:
# Score a single hypothetical case: age_group 9, with a medical condition.
# Column index 1 is the second entry of rf_model.classes_
# (presumably death_yn == 1 - confirm against rf_model.classes_).
rf_model.predict_proba(
    pd.DataFrame([[9, 1]], columns=["age_group", "medcond_yn"])
)[0, 1]

0.30052461748944115