In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

covid = pd.read_csv("datasets/Case_information.csv")

check data

In [136]:
covid

Unnamed: 0,case_id,age,age_group,sex,date_announced,date_recovered,date_of_death,status,date_announced_as_removed,province,muni_city,health_status,home_quarantined,date_of_onset_of_symptoms,pregnant,region
0,C404174,38.0,35 to 39,Female,2020-01-30,,,Recovered,2020-02-07,Negros Oriental,Dumaguete City,Recovered,,2020-01-21,,Central Visayas (Region VII)
1,C462688,44.0,40 to 44,Male,2020-02-03,,2020-02-01,Died,2020-02-02,Negros Oriental,Dumaguete City,Died,,2020-01-18,,Central Visayas (Region VII)
2,C387710,60.0,60 to 64,Female,2020-02-05,2020-01-31,,Recovered,2020-02-05,Bohol,Panglao,Recovered,No,2020-01-21,,Central Visayas (Region VII)
3,C377460,48.0,45 to 49,Male,2020-03-06,,,Recovered,2020-03-27,Metropolitan Manila,Taguig,Recovered,No,2020-03-03,,Metropolitan Manila
4,C498051,62.0,60 to 64,Male,2020-03-06,,2020-03-11,Died,2020-03-12,Rizal,Cainta,Died,No,2020-02-25,,CALABARZON (Region IV-A)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12086,C972617,35.0,35 to 39,Male,2020-05-15,,,For validation,,Metropolitan Manila,Makati City,Mild,,,,Metropolitan Manila
12087,C975669,37.0,35 to 39,Male,2020-05-15,,2020-05-13,Died,2020-05-15,Davao del Sur,Davao City,Died,,2020-04-29,,Davao Region (Region XI)
12088,C980530,62.0,60 to 64,Female,2020-05-15,,,For validation,,,,Mild,,,,
12089,C987431,18.0,15 to 19,Female,2020-05-15,,,Home quarantined,,Samar,Tarangnan,Asymptomatic,Yes,,No,Eastern Visayas (Region VIII)


Dropping Missing Values and Irrelevant Features

In [137]:
# Removing Null values for Date recovered, home_quarantined, and Date of Death
df = covid.dropna(subset=['date_recovered', 'date_of_death', 'home_quarantined'], how='all') 

#Removing values other than recovered and died
values=['Critical', 'Severe', 'Mild', 'Asymptomatic']
df = df[df.health_status.isin(values) == False]

#Join data recovered and date death
df['date_outcome'] = df['date_recovered'].fillna(df['date_of_death'])

# Remove unnecessary columns
df = df.drop(['case_id', 'date_announced_as_removed','date_of_death','date_recovered', 'status', 'age_group'], axis=1)


Feature Engineering

In [138]:
#Conversion of status to categorical variables
df['health_status'], uniques_hs = pd.factorize(df['health_status'])
df['sex'], uniques_sex = pd.factorize(df['sex'])
df['home_quarantined'], uniques_hq = pd.factorize(df['home_quarantined'])
df['region'], uniques_region = pd.factorize(df['region'])
df['muni_city'], uniques_mcity = pd.factorize(df['muni_city'])
df['province'], uniques_prov = pd.factorize(df['province'])

# label_encoder = LabelEncoder()
# df['sex'] = label_encoder.fit_transform(df['sex'])

#Conversion of date time columns to datetime format
df['date_announced'] = pd.to_datetime(df['date_announced'])
df['date_outcome'] = pd.to_datetime(df['date_outcome'])
df['date_of_onset_of_symptoms'] = pd.to_datetime(df['date_of_onset_of_symptoms'])

#Computation of outcome days
df['outcome_days'] = (df['date_outcome'] - df['date_announced']) / np.timedelta64(1, 'D')
df = df.drop(['date_outcome', 'date_announced', 'pregnant', 'date_of_onset_of_symptoms'], axis=1)
df = df[df.home_quarantined != -1]

df

Unnamed: 0,age,sex,province,muni_city,health_status,home_quarantined,region,outcome_days
2,60.0,1,1,1,1,0,0,-5.0
3,48.0,0,2,2,1,0,1,
4,62.0,0,3,3,0,0,2,5.0
5,58.0,1,3,3,0,0,2,5.0
6,39.0,0,2,4,1,0,1,13.0
...,...,...,...,...,...,...,...,...
10261,44.0,1,2,10,1,1,1,
10383,73.0,0,3,23,0,0,2,-13.0
11339,39.0,0,2,20,1,1,1,
11380,66.0,1,2,40,0,1,1,-11.0


Data split

In [139]:
features = ['age', 'sex', 'home_quarantined', 'outcome_days', 'region', 'province']
X = df[features]
y = df['health_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Model Evaluation

In [140]:
categorical_features = np.array(['sex', 'province', 'home_quarantined', 'region'])

# HistGradientBoostingClassifier model
hgb_model = HistGradientBoostingClassifier(
    categorical_features = categorical_features,
    verbose = 1
    )
hgb_model.fit(X_train, y_train)

# predictions on the test set
y_pred = hgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')

Binning 0.000 GB of training data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 31 leaves, max depth = 10, in 0.017s
[2/100] 1 tree, 31 leaves, max depth = 10, in 0.011s
[3/100] 1 tree, 31 leaves, max depth = 10, in 0.013s
[4/100] 1 tree, 31 leaves, max depth = 10, in 0.011s
[5/100] 1 tree, 31 leaves, max depth = 11, in 0.011s
[6/100] 1 tree, 31 leaves, max depth = 10, in 0.011s
[7/100] 1 tree, 31 leaves, max depth = 11, in 0.010s
[8/100] 1 tree, 31 leaves, max depth = 10, in 0.010s
[9/100] 1 tree, 31 leaves, max depth = 9, in 0.012s
[10/100] 1 tree, 31 leaves, max depth = 11, in 0.009s
[11/100] 1 tree, 31 leaves, max depth = 10, in 0.010s
[12/100] 1 tree, 31 leaves, max depth = 10, in 0.010s
[13/100] 1 tree, 31 leaves, max depth = 11, in 0.010s
[14/100] 1 tree, 31 leaves, max depth = 11, in 0.010s
[15/100] 1 tree, 31 leaves, max depth = 12, in 0.010s
[16/100] 1 tree, 31 leaves, max depth = 11, in 0.010s
[17/100] 1 tree, 31 leaves, max depth = 11, in 0.010s
[18/100] 1 tree,

In [141]:
dfKnn = df

dfKnn

Unnamed: 0,age,sex,province,muni_city,health_status,home_quarantined,region,outcome_days
2,60.0,1,1,1,1,0,0,-5.0
3,48.0,0,2,2,1,0,1,
4,62.0,0,3,3,0,0,2,5.0
5,58.0,1,3,3,0,0,2,5.0
6,39.0,0,2,4,1,0,1,13.0
...,...,...,...,...,...,...,...,...
10261,44.0,1,2,10,1,1,1,
10383,73.0,0,3,23,0,0,2,-13.0
11339,39.0,0,2,20,1,1,1,
11380,66.0,1,2,40,0,1,1,-11.0


In [142]:
imputer = KNNImputer(n_neighbors=5)
dfKnn[['outcome_days', 'age']] = imputer.fit_transform(dfKnn[['outcome_days', 'age']])

dfKnn

Unnamed: 0,age,sex,province,muni_city,health_status,home_quarantined,region,outcome_days
2,60.0,1,1,1,1,0,0,-5.0
3,48.0,0,2,2,1,0,1,18.8
4,62.0,0,3,3,0,0,2,5.0
5,58.0,1,3,3,0,0,2,5.0
6,39.0,0,2,4,1,0,1,13.0
...,...,...,...,...,...,...,...,...
10261,44.0,1,2,10,1,1,1,14.2
10383,73.0,0,3,23,0,0,2,-13.0
11339,39.0,0,2,20,1,1,1,21.4
11380,66.0,1,2,40,0,1,1,-11.0


In [163]:
features = ['age', 'sex', 'home_quarantined', 'outcome_days', 'region', 'province']
X = df[features]
y = df['health_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNeighborsClassifier model
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')

Accuracy: 0.912


In [145]:
# loading library
import pickle
# create an iterator object with write permission - model.pkl
with open('hgb_model_pkl', 'wb') as files:
    pickle.dump(hgb_model, files)