This script prepares the Hong Kong monthly Ovitrap Index data for different locations. The data comes from the Food and Environmental Hygiene Department (FEHD). The data from 2008 to 2017 was downloaded from FEHD in PDF format, while the data from 2018 to the present (July 2018) is scraped from the FEHD website. The processed data is written to a CSV file at the end of this script and will be stored in a relational database in the next step.

In [1]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup

def matched(patterns, values):
    """Return the set of entries in *values* matched by any of *patterns*.

    Every pattern is tried against every value with ``re.search``; a value
    is included once, no matter how many patterns match it.
    """
    return {
        candidate
        for regex in patterns
        for candidate in values
        if re.search(regex, candidate)
    }

def replace_value(to_replace, replace, df_column):
    """Apply a sequence of replacements to *df_column* and return the result.

    The value at position ``i`` of *to_replace* is substituted by the value
    at position ``i`` of *replace*, one ``.replace`` call after another.
    """
    updated = df_column
    for position, old_value in enumerate(to_replace):
        new_value = replace[position]
        updated = updated.replace(old_value, new_value)
    return updated

def make_row(row_list):
    """Expand one wide row into twelve per-month records.

    *row_list* is read as ``[Eng, Chi, <12 monthly values>, ..., Year]``:
    the first two items are the names, items 2..13 are the values for
    months 01..12, and the last item is the year.

    Returns a dict keyed by the zero-padded month ('01'..'12'); each value
    is a dict with the keys 'Eng', 'Chi', 'Date' ("MM-YYYY") and 'AOI'.
    """
    return {
        '{:02d}'.format(month_number): {
            'Eng': row_list[0],
            'Chi': row_list[1],
            'Date': "{month}-{year}".format(
                month='{:02d}'.format(month_number), year=row_list[-1]
            ),
            # Monthly values start at position 2, so month 1 sits at index 2.
            'AOI': row_list[month_number + 1],
        }
        for month_number in range(1, 13)
    }

def convert_float(number):
    """Turn a percentage string into a fraction rounded to three decimals.

    If the string contains '/', its last two characters are dropped; if it
    ends with '%', the last character is dropped; otherwise it is used as
    is. The remaining text is parsed as a float and divided by 100.
    """
    if '/' in number:
        digits = number[:-2]
    elif number[-1] == '%':
        digits = number[:-1]
    else:
        digits = number
    return round(float(digits) / 100, 3)

def classification(value):
    """Map an index value to a level from 1 to 4.

    Level 1: value < 0.05; level 2: 0.05 <= value < 0.2;
    level 3: 0.2 <= value < 0.4; level 4: value >= 0.4.
    Returns None when none of the comparisons holds (e.g. for NaN).
    """
    if value < 0.05:
        return 1
    if 0.05 <= value < 0.2:
        return 2
    if 0.2 <= value < 0.4:
        return 3
    if value >= 0.4:
        return 4
    return None

In [8]:
# Load the 2008-2017 archive without a header row, so every cell (including
# the repeated page headers in the file) is read as data.
archive = pd.read_csv("monthlyOvitrap_2008-2017.csv",header=None)

# Use the table header found in the row labelled 4 as the dataframe header.
# Only the first line of each header cell (the English part) is kept.
new_col = {}
for i in range(13):
    splited = archive.loc[4,i].split('\n')
    new_col[i] = splited[0]

archive.rename(columns=new_col,inplace=True)

# Find and drop all the rows that contain unwanted data (page titles and
# repeated table headers).
patterns = ["Food and Environmental Hygiene Department","食物環境衛生署","^Monthly",
            "二零..年每月誘蚊產卵器分區指數","Locations\n地區"]

unwanted = matched(patterns,archive["Locations"])
archive.drop(archive[archive["Locations"].isin(unwanted)].index,inplace=True)

archive.reset_index(drop=True,inplace=True)

# Make inconsistent location names consistent.
to_replace_1 = ['Central, Sheung Wan and Sai Ying\nPun\n中環, 上環及西營盤','Tseung Kwan O\n將軍澳',
                'Tseung Kwan O South (Formerly: Tseung Kwan O)\n將軍澳南 (前稱: 將軍澳)',
                'Tsing Yi\n青衣','Tsing Yi South (Formerly: Tsing Yi)\n青衣南 (前稱: 青衣)']

replace_1 = ['Central, Sheung Wan and Sai Ying Pun\n中環, 上環及西營盤','Tseung Kwan O South\n將軍澳南',
             'Tseung Kwan O South\n將軍澳南','Tsing Yi South\n青衣南','Tsing Yi South\n青衣南']

archive['Locations'] = replace_value(to_replace_1,replace_1,archive['Locations'])

# Columns 'Eng' and 'Chi' store the two lines of each bilingual location.
# Whole-column assignment avoids one .loc write per row; a location without
# a second line still raises IndexError, as before.
location_parts = [value.split('\n') for value in archive['Locations']]
archive['Eng'] = [parts[0] for parts in location_parts]
archive['Chi'] = [parts[1] for parts in location_parts]

# Move the two new name columns to the front.
archive.drop('Locations',axis=1,inplace=True)
cols = archive.columns.tolist()
cols = cols[-2:]+cols[:-2]
archive = archive[cols]

# Column 'Year' stores the corresponding year. It starts as the placeholder
# string 'NaN' with an explicit object dtype so integer years can be written
# into it afterwards.
archive['Year'] = pd.Series('NaN',index=archive.index,dtype=object)

# Every yearly table starts with the 'Chai Wan West' row, so the positions of
# those rows mark where each year begins.
year_interval = archive[archive['Eng'] == 'Chai Wan West'].index.tolist()
for index, value in enumerate(range(2008,2018)):
    start = year_interval[index]
    # Assign through a single .loc call on the frame itself; chained
    # assignment (archive['Year'].loc[...] = ...) may write to a temporary
    # copy and leave the frame unchanged.
    if index + 1 < len(year_interval):
        # .loc slices include both ends; the first row of the next year is
        # overwritten with the right year on the following iteration.
        archive.loc[start:year_interval[index+1],'Year'] = value
    else:
        archive.loc[start:,'Year'] = value

In [7]:
# Download the FEHD ovitrap index page. A timeout keeps the script from
# hanging on a stalled connection, and raise_for_status() stops an HTTP error
# page from being parsed as if it were data.
response = requests.get("https://www.fehd.gov.hk/tc_chi/pestcontrol/dengue_fever/ovitrap_index.html",
                        timeout=30)
response.raise_for_status()
content = response.content
parser =  BeautifulSoup(content,'html.parser')

# Collect the text of every table cell on the page.
scrapped = [each.text for each in parser.select("td")]

scrapped = scrapped[10:] # The first ten data are unrelated
pattern = ['區$','月$'] # Data that match with anyone of these two Chinese words is not needed
unwanted = matched(pattern,scrapped)

wanted = [value for value in scrapped if value not in unwanted]

interval = np.arange(0,len(wanted),13) # Every row is consisted of 13 data
current = []
for value in interval:
    location = wanted[value]
    if location == "香港國際機場": # Port area are not the targets in this analysis
        break
    # Slice by row width rather than by the next interval entry, so the last
    # row does not raise IndexError when the port-area row is absent.
    current.append(wanted[value:value + 13])

# Add an empty place for the 'Eng' column in front and the value for the
# 'Year' column at the end of every row.
# NOTE(review): the year is hard-coded to 2018 — confirm the page still shows
# the 2018 table before re-running.
current = [[''] + each + [2018] for each in current]

header = archive.columns.tolist() # set the same header for combination of datasets
current = pd.DataFrame(current,columns=header)
current.replace('',np.nan,inplace=True)

In [9]:
# Stack the 2008-2017 archive on top of the scraped 2018 rows.
combined = pd.concat([archive,current],axis=0,ignore_index=True)

# Unify Chinese location names that appear in several spellings.
to_replace_2 = ['中環, 上環及西營盤','上環及西營盤(前稱: 中環,上環及西營盤)','將軍澳南(前稱: 將軍澳)','青衣南(前稱: 青衣)']
replace_2 = ['上環及西營盤','上環及西營盤','將軍澳南','青衣南']
combined['Chi'] = replace_value(to_replace_2,replace_2,combined['Chi'])

# Unify English location names in the same way.
to_replace_3 = ['Central, Sheung Wan and Sai Ying Pun','Tuen Mun (S)','Tuen Mun (N)']
replace_3 = ['Sheung Wan and Sai Ying Pun','Tuen Mun South','Tuen Mun North'] 
combined['Eng'] = replace_value(to_replace_3,replace_3,combined['Eng'])

# Build the (English, Chinese) pairs from rows that actually carry both
# names. Pairing two independent unique() arrays by position is only right
# when both happen to be in the same order, and NaN cannot be filtered
# reliably with an identity check, so the pairs are taken row by row instead.
named_rows = combined.dropna(subset=['Eng','Chi'])[['Eng','Chi']].drop_duplicates()
translations = list(zip(named_rows['Eng'],named_rows['Chi']))

# Locations whose English names are supplied by hand.
new_locations = [('Central and Admiralty','中環及金鐘'),('Yau Tong','油塘'),('Wo Che','禾輋'),
                 ('Tuen Mun West','屯門西'),('Tsuen Wan West','荃灣西')]
translations = translations + new_locations

# Chinese -> English lookup; when a Chinese name occurs more than once the
# last pair wins, which matches overwriting in list order.
chi_to_eng = {chi: eng for eng, chi in translations}

# Fill in the English name of every 2018 row from its Chinese name. Rows
# without a known translation keep whatever 'Eng' value they already had.
name_2018 = combined.loc[combined['Year']==2018,['Eng','Chi']].copy()
looked_up = name_2018['Chi'].map(chi_to_eng)
name_2018['Eng'] = looked_up.where(looked_up.notna(),name_2018['Eng'])
combined.loc[name_2018.index,'Eng'] = name_2018['Eng']

In [22]:
# Reshape the wide table (one row per location and year, one column per
# month) into a long table with one row per location and month.
# The records are collected in a list and the frame is built once:
# DataFrame.append copied the whole frame on every call and is no longer
# available in pandas 2.0+.
records = []
for i in combined.index:
    row_list = combined.loc[i].tolist()
    row_dict = make_row(row_list)
    # Dict values come out in month order '01'..'12'.
    records.extend(row_dict.values())

complete = pd.DataFrame(records,columns=['Eng','Chi','Date','AOI'])

# Drop every record with a missing value in any column.
complete.dropna(inplace=True)
complete.reset_index(drop=True,inplace=True)

# Convert the index text to a fraction, then derive its level from it.
complete['AOI'] = complete['AOI'].apply(convert_float)
complete['Classification'] = complete['AOI'].apply(classification)

In [6]:
# Write the processed table to a CSV file; index=False leaves the row index
# out of the file.
complete.to_csv('Area_OviTrap_Index_Jan2008-Jul2018.csv',index=False)