# COMP3115 Exploratory Data Analysis and Visualization Semester 2, 2022-23

Group Project
Topic 1: Open Covid-19 Data Analytics

# 1. Import Library

In [None]:
import sys # For installing library and dependency

!{sys.executable} -m pip install requests pandas numpy seaborn tqdm scikit-learn

In [131]:
import requests # Import the necessary library
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm

# 2. Load Dataset from API

In [116]:
from time import gmtime, strftime, time # Function of generateAPILinks
import urllib.parse

def generateAPILinks(file_url = 'http://www.chp.gov.hk/files/misc/enhanced_sur_covid_19_eng.csv', start = '20200101', end = strftime('%Y%m%d', gmtime(time() - 3600 * 24))):
  response = requests.get(f'https://api.data.gov.hk/v1/historical-archive/list-file-versions?url={urllib.parse.quote_plus(file_url)}&start={start}&end={end}')
  json = response.json()
  return [f'https://api.data.gov.hk/v1/historical-archive/get-file?url={urllib.parse.quote_plus(file_url)}&time=' + t for t in json['timestamps']]

# 2.1 Latest situation of reported cases of COVID-19 in Hong Kong

In [118]:
#Load Latest situation of reported cases of COVID-19 in Hong Kong
latest_situation_of_reported_cases_covid_19_eng_urls = generateAPILinks('http://www.chp.gov.hk/files/misc/latest_situation_of_reported_cases_covid_19_eng.csv')
len(latest_situation_of_reported_cases_covid_19_eng_urls)

1095

In [128]:
# Fetching data to csv
latest_situation_of_reported_cases_covid_19_eng_df = pd.DataFrame()
for u in tqdm(latest_situation_of_reported_cases_covid_19_eng_urls):
  latest_situation_of_reported_cases_covid_19_eng_df = pd.concat([latest_situation_of_reported_cases_covid_19_eng_df, pd.read_csv(u, index_col=0)])
latest_situation_of_reported_cases_covid_19_eng_df.to_csv('latest_situation_of_reported_cases_covid_19_eng.csv')  

latest_situation_of_reported_cases_covid_19_eng_df = pd.read_csv('latest_situation_of_reported_cases_covid_19_eng.csv', index_col=0)

100%|██████████| 1095/1095 [26:54<00:00,  1.47s/it] 


In [172]:
latest_situation_of_reported_cases_covid_19_eng_df = pd.read_csv('latest_situation_of_reported_cases_covid_19_eng.csv', index_col=0)
latest_situation_of_reported_cases_covid_19_eng_df.info()
latest_situation_of_reported_cases_covid_19_eng_df.count

<class 'pandas.core.frame.DataFrame'>
Index: 698416 entries, 08/01/2020 to 09/04/2023
Data columns (total 14 columns):
 #   Column                                                                       Non-Null Count   Dtype  
---  ------                                                                       --------------   -----  
 0   As of time                                                                   206654 non-null  object 
 1   Number of confirmed cases                                                    590688 non-null  float64
 2   Number of ruled out cases                                                    97452 non-null   float64
 3   Number of cases still hospitalised for investigation                         97452 non-null   float64
 4   Number of cases fulfilling the reporting criteria                            97452 non-null   float64
 5   Number of death cases                                                        695931 non-null  float64
 6   Number of discharge 

<bound method DataFrame.count of            As of time  Number of confirmed cases  Number of ruled out cases   
As of date                                                                    
08/01/2020      12:00                        0.0                       21.0  \
09/01/2020      12:00                        0.0                       25.0   
10/01/2020      12:00                        0.0                       31.0   
11/01/2020      12:00                        0.0                       46.0   
12/01/2020      12:00                        0.0                       51.0   
...               ...                        ...                        ...   
05/04/2023        NaN                        NaN                        NaN   
06/04/2023        NaN                        NaN                        NaN   
07/04/2023        NaN                        NaN                        NaN   
08/04/2023        NaN                        NaN                        NaN   
09/04/2023        N

In [173]:
# Replace NaN and null values with 0
latest_situation_of_reported_cases_covid_19_eng_df.fillna(0, inplace=True)
latest_situation_of_reported_cases_covid_19_eng_df.replace([np.inf, -np.inf, np.nan, ' '], 0, inplace=True)

# Drop completely identical rows in the DataFrame
latest_situation_of_reported_cases_covid_19_eng_df = latest_situation_of_reported_cases_covid_19_eng_df.drop_duplicates()

# Drop the 'As of time' column
latest_situation_of_reported_cases_covid_19_eng_df = latest_situation_of_reported_cases_covid_19_eng_df.drop('As of time', axis=1)

# Convert date to datetime objects
latest_situation_of_reported_cases_covid_19_eng_df.index = pd.to_datetime(latest_situation_of_reported_cases_covid_19_eng_df.index.astype(str), format="%d/%m/%Y")

# Group by 'As of date' and sum up the records
latest_situation_of_reported_cases_covid_19_eng_df = latest_situation_of_reported_cases_covid_19_eng_df.groupby('As of date').sum().reset_index()

In [164]:
# show last 5
latest_situation_of_reported_cases_covid_19_eng_df.tail(5)

<bound method DataFrame.count of       As of date                                         As of time   
0     01/01/2021                                                  0  \
1     01/01/2022                                                  0   
2     01/01/2023                                                  0   
3     01/02/2020  23:5923:5923:5923:5923:5923:5923:5923:5923:592...   
4     01/02/2021                                                  0   
...          ...                                                ...   
1897    9/7/2021                                                  0   
1898    9/8/2020                                                  0   
1899    9/8/2021                                                  0   
1900    9/9/2020                                                  0   
1901    9/9/2021                                                  0   

      Number of confirmed cases  Number of ruled out cases   
0                     7305936.0                     

In [174]:
latest_situation_of_reported_cases_covid_19_eng_df.to_csv('latest_situation_of_reported_cases_covid_19_eng_group.csv')  
