# Train Random Forest Classifier on COVID-19 dataset from CDC

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [9]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

## Download the [csv file](https://catalog.data.gov/dataset/covid-19-case-surveillance-public-use-data).

## Prepare the dataset for AI. Replace Yes, No with 1 and 0 respectively. 

## Replace age groups with integer values.
1. 0-9 years
2. 10-19 years
3. 20-29 years
4. 30-39 years
5. 40-49 years
6. 50-59 years
7. 60-69 years
8. 70-79 years
9. 80+ years

## Replace current_status with integer values.
0. Probable Case
1. Laboratory-confirmed case

In [95]:
# Load the pre-processed CDC case-surveillance extract (prepared as described in
# the markdown above) and display it.
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns in the output look like
# index columns left over from earlier to_csv round-trips — confirm. They are not
# used as model features.
df_death_med_all = pd.read_csv(
    filepath_or_buffer="COVID-19_Case_Surveillance_Public_Use_Data_death_med_all_ai1.csv"
)
df_death_med_all

Unnamed: 0.1,Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,45,2021/04/15,2021/08/31,2021/04/15,,1,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,No,0,0
1,49,2020/12/17,2020/12/19,2020/12/18,2020/12/17,1,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,Missing,0,0
2,71,2021/03/03,2021/03/05,2021/03/05,2021/03/03,0,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,Missing,0,0
3,79,2021/12/20,2021/12/27,2021/12/20,2021/12/20,1,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,Missing,0,0
4,80,2021/04/13,2021/04/28,,2021/04/13,1,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,Missing,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4131725,103535053,2020/09/10,2020/09/15,2020/09/13,2020/09/10,1,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,Missing,0,1
4131726,103535074,2020/06/28,2020/07/01,2020/06/29,2020/06/28,1,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,Missing,0,1
4131727,103535123,2020/07/12,2020/07/18,2020/07/17,2020/07/12,1,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,Missing,0,0
4131728,103535128,2020/07/02,2020/07/10,2020/07/04,2020/07/02,1,Female,3,"American Indian/Alaska Native, Non-Hispanic",0,Missing,0,1


## Import the AI libraries

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

## Extract features and labels from the dataframe. Using age group, medical condition, case status, and hospitalization as features, and death as the label.

In [54]:
# Feature matrix: age group, pre-existing medical condition, case status and
# hospitalization flag. Target vector: the death flag.
X = df_death_med_all[["age_group", "medcond_yn", "current_status", "hosp_yn"]]
y = df_death_med_all.loc[:, "death_yn"]

## Split dataset into train and test data

In [55]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore the reported score) reproducible across kernel restarts.
# NOTE(review): the model used later is loaded from a pickle; if it was trained
# on a different random split, some X_test rows may have been seen during
# training and the score will be optimistic — confirm how the pickle was built.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Train the model or load the pre-trained model.

In [75]:
# Uncomment the two lines below to train a fresh model on the training split
# instead of loading the pickled model in the next cell.
# rf_model = RandomForestClassifier()
# rf_model.fit(X_train, y_train)

In [None]:
# Load the pre-trained model from disk. The context manager closes the file
# handle as soon as unpickling finishes (the bare open() call leaked it).
# SECURITY: pickle.load can execute arbitrary code — only load a file you
# created yourself or otherwise trust.
with open("rf_model1.obj", "rb") as model_file:
    rf_model = pickle.load(model_file)

In [133]:
# Score the model on the held-out split (for scikit-learn classifiers, score()
# reports mean accuracy).
rf_model.score(X=X_test, y=y_test)

0.962138135841403

## Get the death probability for a hospitalized, laboratory-confirmed case aged 40-49 years with no pre-existing condition

In [99]:
# Query: age group 5 (40-49 years), no pre-existing condition,
# laboratory-confirmed (current_status=1), hospitalized (hosp_yn=1).
# [0, 1] picks the first row's second class column — assumes the model's
# classes are [0, 1], so this is P(death_yn == 1); TODO confirm via rf_model.classes_.
rf_model.predict_proba(
    pd.DataFrame(
        [{"age_group": 5, "medcond_yn": 0, "current_status": 1, "hosp_yn": 1}]
    )
)[0, 1]

0.0732518175383056

## Get the death probability for a hospitalized, laboratory-confirmed case aged 40-49 years with a pre-existing condition

In [100]:
# Query: age group 5 (40-49 years), with a pre-existing condition,
# laboratory-confirmed (current_status=1), hospitalized (hosp_yn=1).
# [0, 1] picks the first row's second class column — assumes the model's
# classes are [0, 1], so this is P(death_yn == 1); TODO confirm via rf_model.classes_.
rf_model.predict_proba(
    pd.DataFrame(
        [{"age_group": 5, "medcond_yn": 1, "current_status": 1, "hosp_yn": 1}]
    )
)[0, 1]

0.16718780903456762

## Get the death probability for a hospitalized, laboratory-confirmed case aged 70-79 years with no pre-existing condition

In [105]:
# Query: age group 8 (70-79 years), no pre-existing condition,
# laboratory-confirmed (current_status=1), hospitalized (hosp_yn=1).
# [0, 1] picks the first row's second class column — assumes the model's
# classes are [0, 1], so this is P(death_yn == 1); TODO confirm via rf_model.classes_.
rf_model.predict_proba(
    pd.DataFrame(
        [{"age_group": 8, "medcond_yn": 0, "current_status": 1, "hosp_yn": 1}]
    )
)[0, 1]

0.26649298149239703

## Get the death probability for a hospitalized, laboratory-confirmed case aged 70-79 years with a pre-existing condition

In [112]:
# Query: age group 8 (70-79 years), with a pre-existing condition,
# laboratory-confirmed (current_status=1), hospitalized (hosp_yn=1).
# [0, 1] picks the first row's second class column — assumes the model's
# classes are [0, 1], so this is P(death_yn == 1); TODO confirm via rf_model.classes_.
rf_model.predict_proba(
    pd.DataFrame(
        [{"age_group": 8, "medcond_yn": 1, "current_status": 1, "hosp_yn": 1}]
    )
)[0, 1]

0.3889287455955175

## Get the death probability for a hospitalized, laboratory-confirmed case aged 80+ years with no pre-existing condition

In [111]:
# Query: age group 9 (the oldest bracket), no pre-existing condition,
# laboratory-confirmed (current_status=1), hospitalized (hosp_yn=1).
# [0, 1] picks the first row's second class column — assumes the model's
# classes are [0, 1], so this is P(death_yn == 1); TODO confirm via rf_model.classes_.
rf_model.predict_proba(
    pd.DataFrame(
        [{"age_group": 9, "medcond_yn": 0, "current_status": 1, "hosp_yn": 1}]
    )
)[0, 1]

0.3329467969037689

## Get the death probability for a hospitalized, laboratory-confirmed case aged 80+ years with a pre-existing condition

In [110]:
# Query: age group 9 (the oldest bracket), with a pre-existing condition,
# laboratory-confirmed (current_status=1), hospitalized (hosp_yn=1).
# [0, 1] picks the first row's second class column — assumes the model's
# classes are [0, 1], so this is P(death_yn == 1); TODO confirm via rf_model.classes_.
rf_model.predict_proba(
    pd.DataFrame(
        [{"age_group": 9, "medcond_yn": 1, "current_status": 1, "hosp_yn": 1}]
    )
)[0, 1]

0.5036927931709284

## Enter the values to test the model

In [132]:
# Edit the feature values below to query the model for a different patient.
# Here: age group 9 (the oldest bracket), pre-existing condition present,
# probable case (current_status=0), hospitalized (hosp_yn=1).
probability = rf_model.predict_proba(
    pd.DataFrame(
        [{"age_group": 9, "medcond_yn": 1, "current_status": 0, "hosp_yn": 1}]
    )
)
# Express the second class column as a percentage of the two class probabilities.
# Assumes column 1 corresponds to death_yn == 1 — TODO confirm via rf_model.classes_.
death_probaility_percent = (
    probability[0, 1] / (probability[0, 0] + probability[0, 1]) * 100
)
death_probaility_percent

52.79551790704811