# Hong Kong Study
## Dataset Dictionary

Data dictionary for Hong Kong household study (2008-2009)				
				
### household.csv
|Column|		Explanation|		Coding	|
|------|------|------|
|hhID		    |  Household ID		|
|intervention	| Intervention arm		|        1: Control; 3: Hand hygiene; 4: Mask+HH
|familysize		| No. of people living in the same household	|	integer
|clinic_date	| The date of clinic visit		|d/m/y
|clinic_day		| Delay from index symptom onset to randomisation	|	in days
|v1_day		    | Delay from index symptom onset to visit 1	|	in days
|v2_day		    |Delay from index symptom onset to visit 2	|	in days
|v3_day		    | Delay from index symptom onset to visit 3	|	in days
				
				
### Household member characteristics: demographic.csv		
|Column|		Explanation|		Coding	|
|------|------|------|
|hhID	|	Household ID	|	Integer
|member		|Household member number	|0: Index subject;1-7: Household contact
|age		|Age		|in years
|male		|Sex		|0: Female; 1: Male
|vaccine	|	Received influenza vaccination for current season		|1: Yes; 0: No
				
				
### Household member daily symptom diaries: symptom.csv	
|Column|		Explanation|		Coding	|
|------|------|------|
|hhID	|	Household ID		
|member	|	Household member number	|	0: Index subject;1-7: Household contact
|day	|	Day 0 = home visit 1	|	in days
|bodytemp	|	Measured body temperature	|	in °C
|headache	|	Headache		|1: Yes; 0: No
|sthroat	|	Sore throat	|	1: Yes; 0: No
|cough		|Cough		|1: Yes; 0: No
|pmuscle	|	Aches or pains in muscles or joints	|	1: Yes; 0: No
|rnose	|	Runny nose	|	1: Yes; 0: No
|phlegm	|	Phlegm		|1: Yes; 0: No
				
				
### Lab-confirmed PCR result: PCRresults.csv	
|Column|		Explanation|		Coding	|
|------|------|------|
|hhID	|	Household ID		|
|member	|	Household member number	|	0: Index subject;1-7: Household contact
|visit	|	Visit number		|0: Clinic visit; 1-3: Home visit
|qPCR	|	Laboratory RT-qPCR result	|	Detection limit: 900 copies/mL
	
				
Remark				
NA - data not available				

In [1]:
import pandas as pd
import numpy as np
# Directory prefix (relative to the notebook) of the raw study CSV files.
# `load` joins it to the file name by plain concatenation, hence the trailing '/'.
base_path = '../data/original_per_study/hk/'
def load(filename, basepath, sepra, index_col=None):
    """Read one delimited text file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        File name; appended directly to ``basepath``.
    basepath : str
        Path prefix. It is joined by plain string concatenation, so it must
        already end with a path separator.
    sepra : str
        Field delimiter, forwarded to ``pd.read_csv`` as ``sep``.
    index_col : optional
        Forwarded unchanged to ``pd.read_csv`` (default ``None``).

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(basepath + filename, sep=sepra, index_col=index_col)
# Read the four raw study tables; all of them are semicolon-delimited.
pcr, sym, dem, household = (
    load(csv_name, base_path, ';', None)
    for csv_name in ('PCRresults.csv', 'symptom.csv',
                     'demographic.csv', 'household.csv')
)


In [2]:
# Write a copy of each raw table (row index included) to the working directory.
for raw_table, out_name in ((pcr, 'pcr.csv'),
                            (sym, 'sym.csv'),
                            (dem, 'dem.csv'),
                            (household, 'household.csv')):
    raw_table.to_csv(out_name)

In [3]:
"""
    A patient is first identified by household number and the member number in a household
    mhhID is the unique id for a patient. Generating a primary key for each dataset
"""
# Patient key = household id followed by member number, both as text
# (e.g. hhID 1, member 0 -> '10').
# NOTE(review): no separator is used, so the key is only unambiguous while
# member numbers stay single-digit (the data dictionary lists 0-7) - confirm.
for table in (pcr, sym, dem):
    table['mhhID'] = table['hhID'].astype(str) + table['member'].astype(str)

In [4]:
# Compare the number of distinct patient keys with the number of rows.
# NOTE(review): the closing message is printed unconditionally; it is only
# true when the two numbers above it are equal.
n_unique_patients = dem['mhhID'].nunique()
n_dem_rows = len(dem)
print('The unique number of participants in the survey is:')
print(n_unique_patients)
print(n_dem_rows)
print('This proves that Demographics file has all unique records.')

The unique number of participants in the survey is:
1742
1742
This proves that Demographics file has all unique records.


In [5]:
# Total diary rows, then a histogram of "diary rows per patient".
diary_rows_per_patient = sym.groupby('mhhID')['mhhID'].count()
print(sym.shape[0])
print(diary_rows_per_patient.value_counts())

17420
10    1742
Name: mhhID, dtype: int64


In [6]:
# Total PCR rows, then a histogram of "PCR rows per patient".
pcr_rows_per_patient = pcr.groupby('mhhID')['mhhID'].count()
print("Total Number of times clinical tests have been conducted:")
print(pcr.shape[0])
print("No of tests conducted: No. of Patients")
print(pcr_rows_per_patient.value_counts())

Total Number of times clinical tests have been conducted:
5217
No of tests conducted: No. of Patients
3    1724
1      17
2      14
Name: mhhID, dtype: int64


In [7]:
# Number of distinct (household, member) pairs that appear in the PCR table.
pcr.groupby(['hhID', 'member']).ngroups

1755

In [8]:
# Number of distinct (household, member) pairs that appear in the symptom diaries.
sym.groupby(['hhID', 'member']).ngroups

1742

In [9]:
# Number of distinct (household, member) pairs that appear in the demographics table.
dem.groupby(['hhID', 'member']).ngroups

1742

In [10]:
# Preview the first rows of the symptom diary table (now including mhhID).
sym.head()

Unnamed: 0,hhID,member,day,bodytemp,headache,sthroat,cough,pmuscle,rnose,phlegm,mhhID
0,1,0,0,37.7,0.0,1.0,1.0,0.0,1.0,1.0,10
1,1,0,1,36.1,0.0,1.0,1.0,0.0,1.0,1.0,10
2,1,0,2,35.9,0.0,1.0,1.0,0.0,1.0,1.0,10
3,1,0,3,36.0,0.0,1.0,1.0,0.0,0.0,1.0,10
4,1,0,4,36.0,0.0,1.0,1.0,0.0,1.0,1.0,10


In [11]:
# Preview the first rows of the PCR results table (now including mhhID).
pcr.head()

Unnamed: 0,hhID,member,visit,qPCR,mhhID
0,1,0,1,3500000.0,10
1,1,0,2,167000.0,10
2,1,0,3,3830.0,10
3,1,1,1,0.0,11
4,1,1,2,0.0,11


In [12]:
# Preview the first rows of the household table.
household.head()

Unnamed: 0,hhID,intervention,familysize,clinic_date,clinic_day,v1_day,v2_day,v3_day
0,1,1,3,9/1/2008,1,1,4.0,7
1,2,3,4,11/1/2008,1,1,3.0,8
2,3,4,5,14/1/2008,2,2,4.0,8
3,5,3,3,15/1/2008,1,1,4.0,7
4,6,4,4,17/1/2008,1,1,4.0,7


In [13]:
# Re-express every home-visit delay relative to home visit 1, because the
# symptom diaries count days from home visit 1 (day 0 = home visit 1).
# v1_day is updated last so that v2_day / v3_day still see its original value.
household['v3_day'] = household['v3_day'] - household['v1_day']
household['v2_day'] = household['v2_day'] - household['v1_day']
household['v1_day'] = household['v1_day'] - household['v1_day']

# One (hhID, clinic_date, day) frame per visit. `rename` without `inplace`
# returns a new frame, so nothing is written into a slice of `household`
# (the in-place version raised SettingWithCopyWarning).
household_v1 = household[['hhID', 'clinic_date', 'v1_day']].rename(columns={'v1_day': 'day'})
household_v2 = household[['hhID', 'clinic_date', 'v2_day']].rename(columns={'v2_day': 'day'})
household_v3 = household[['hhID', 'clinic_date', 'v3_day']].rename(columns={'v3_day': 'day'})

# Stack the three visits into long format: one row per household per visit,
# ordered as all visit-1 rows, then all visit-2 rows, then all visit-3 rows.
household_df = pd.concat([household_v1, household_v2, household_v3], ignore_index=True)

print("The length should be thrice as the number of households: ")
assert len(household_df) == 3 * len(household)
len(household_df)

The length should be thrice as the number of households: 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


1281

In [14]:
# Inspect column dtypes of the long-format visit table.
household_df.dtypes

hhID             int64
clinic_date     object
day            float64
dtype: object

In [15]:
# Distribution of visit days (offsets from home visit 1); value_counts() skips NaN.
household_df.day.value_counts()

0.0     428
3.0     216
6.0     155
7.0     145
2.0     112
4.0      94
5.0      88
8.0      28
9.0       9
1.0       2
10.0      1
11.0      1
Name: day, dtype: int64

In [16]:
"""
    I need to filter out symptoms only for the days when tests have been conducted.
    The date for all members of the household remains same.
    For this I need to merge left join household_df with sym on hhID and day.
    The resultant dataframe should have 1742 * 3 number of observations. 
    Three observations with respect to each patient.
"""
# Left join: keep every (household, visit day) row and attach the diary rows
# of all household members recorded on that day.
sym_household = household_df.merge(sym, how='left', on=['hhID', 'day'])

In [17]:
# Row count after the left join of visit days with symptom diaries.
len(sym_household)

5213

In [18]:
# Count the visit rows that found no matching diary entry in the left join
# (their mhhID is NA) and report the computed number, not a typed-in one.
n_missing_mhhID = sym_household['mhhID'].isnull().sum()
print("This means there are {} NA values in mhhID".format(n_missing_mhhID))

This means there are 4 NA values in mhhID


In [19]:
# Row count of the joined table, then a histogram of "rows per patient".
# NOTE(review): the printed labels are the same as in the PCR summary cell,
# but the numbers here come from the symptom/visit join - confirm the wording.
visit_rows_per_patient = sym_household.groupby('mhhID')['mhhID'].count()
print("Total Number of times clinical tests have been conducted:")
print(sym_household.shape[0])
print("No of tests conducted: No. of Patients")
print(visit_rows_per_patient.value_counts())

Total Number of times clinical tests have been conducted:
5213
No of tests conducted: No. of Patients
3    1725
2      17
Name: mhhID, dtype: int64


In [20]:
# Preview the joined visit/symptom table.
sym_household.head()

Unnamed: 0,hhID,clinic_date,day,member,bodytemp,headache,sthroat,cough,pmuscle,rnose,phlegm,mhhID
0,1,9/1/2008,0,0.0,37.7,0.0,1.0,1.0,0.0,1.0,1.0,10
1,1,9/1/2008,0,1.0,35.8,0.0,0.0,0.0,0.0,0.0,0.0,11
2,1,9/1/2008,0,2.0,36.4,0.0,1.0,0.0,0.0,0.0,0.0,12
3,2,11/1/2008,0,0.0,38.9,1.0,1.0,1.0,1.0,1.0,1.0,20
4,2,11/1/2008,0,1.0,36.8,0.0,0.0,0.0,0.0,0.0,0.0,21


In [21]:
# Attach demographics to each visit/symptom row. Both inputs carry an mhhID
# column, so the merge yields mhhID_x (symptom side) and mhhID_y
# (demographics side); only the demographics one is kept.
sym_dem = (
    sym_household
    .merge(dem, how='left', on=['hhID', 'member'])
    .drop('mhhID_x', axis=1)
)
print(len(sym_dem))
# index=str also converts the row labels to strings, as before.
sym_dem = sym_dem.rename(index=str, columns={'mhhID_y': 'mhhID'})
sym_dem.head()

5213


Unnamed: 0,hhID,clinic_date,day,member,bodytemp,headache,sthroat,cough,pmuscle,rnose,phlegm,age,male,vaccine,mhhID
0,1,9/1/2008,0,0,37.7,0.0,1.0,1.0,0.0,1.0,1.0,8.0,1.0,0.0,10
1,1,9/1/2008,0,1,35.8,0.0,0.0,0.0,0.0,0.0,0.0,44.0,1.0,0.0,11
2,1,9/1/2008,0,2,36.4,0.0,1.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,12
3,2,11/1/2008,0,0,38.9,1.0,1.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,20
4,2,11/1/2008,0,1,36.8,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,21


In [22]:
# Preview the PCR table again before joining it to the symptom/demographic rows.
pcr.head()

Unnamed: 0,hhID,member,visit,qPCR,mhhID
0,1,0,1,3500000.0,10
1,1,0,2,167000.0,10
2,1,0,3,3830.0,10
3,1,1,1,0.0,11
4,1,1,2,0.0,11


In [23]:
# Group the symptom/demographic rows by patient key.
# NOTE(review): `z` is not referenced by any other cell visible in this notebook.
z = sym_dem.groupby('mhhID')

In [24]:
# One-column frame, indexed by patient key, holding each patient's row count.
mhhID_cnt = sym_dem.groupby('mhhID')['mhhID'].count().to_frame()

In [25]:
# For each patient key, collect the member numbers of its rows into a list;
# every list should repeat a single member number.
x = sym_dem.groupby('mhhID')['member'].apply(list)
x.head()

mhhID
10      [0.0, 0.0, 0.0]
1000    [0.0, 0.0, 0.0]
1001    [1.0, 1.0, 1.0]
1002    [2.0, 2.0, 2.0]
1003    [3.0, 3.0, 3.0]
Name: member, dtype: object

In [26]:
"""
    This is done because the day values are different from visit numbers. 
    e.g. days of visit [0, 3, 6] are reassigned to visit numbers [1, 2, 3]
 
"""
# Replace each patient's 'day' value with the 1-based position of the row
# within that patient's rows (in frame order), so it can be matched against
# the PCR 'visit' number. Vectorised with cumcount(); DataFrame.set_value,
# used previously, no longer exists in pandas >= 1.0.
# Rows whose mhhID is NA belong to no patient and keep their original 'day'.
# NOTE(review): the number is purely positional. A patient with fewer than
# three rows is numbered 1, 2 regardless of which visit is missing, so if the
# absent row is visit 2, the visit-3 symptoms get labelled 2 - confirm that
# this cannot happen in the data, or carry an explicit visit number instead.
patient_known = sym_dem['mhhID'].notnull()
visit_position = sym_dem.groupby('mhhID').cumcount() + 1
sym_dem.loc[patient_known, 'day'] = (
    visit_position[patient_known].astype(sym_dem['day'].dtype)
)

In [27]:
# Distribution of the relabelled 'day' column; values other than 1-3 are rows
# that were not relabelled.
sym_dem.day.value_counts()

2.0     1742
1.0     1742
3.0     1725
11.0       1
10.0       1
Name: day, dtype: int64

In [28]:
# Preview the table after relabelling 'day'.
sym_dem.head()

Unnamed: 0,hhID,clinic_date,day,member,bodytemp,headache,sthroat,cough,pmuscle,rnose,phlegm,age,male,vaccine,mhhID
0,1,9/1/2008,1,0,37.7,0.0,1.0,1.0,0.0,1.0,1.0,8.0,1.0,0.0,10
1,1,9/1/2008,1,1,35.8,0.0,0.0,0.0,0.0,0.0,0.0,44.0,1.0,0.0,11
2,1,9/1/2008,1,2,36.4,0.0,1.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,12
3,2,11/1/2008,1,0,38.9,1.0,1.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,20
4,2,11/1/2008,1,1,36.8,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,21


In [29]:
"""
    Joining the PCR results file with sym_dem
    I first need to convert days in sym_dem to 1, 2, 3 to match with visit in pcr
"""
# Inner join: keep only the symptom rows that have a PCR result for the same
# patient and visit (sym_dem 'day' is matched against pcr 'visit').
sym_dem_pcr_hsd = sym_dem.merge(
    pcr,
    left_on=['mhhID', 'day', 'hhID', 'member'],
    right_on=['mhhID', 'visit', 'hhID', 'member'],
)

In [30]:
# Preview the combined symptom / demographic / PCR table.
sym_dem_pcr_hsd.head()

Unnamed: 0,hhID,clinic_date,day,member,bodytemp,headache,sthroat,cough,pmuscle,rnose,phlegm,age,male,vaccine,mhhID,visit,qPCR
0,1,9/1/2008,1,0,37.7,0.0,1.0,1.0,0.0,1.0,1.0,8.0,1.0,0.0,10,1,3500000.0
1,1,9/1/2008,1,1,35.8,0.0,0.0,0.0,0.0,0.0,0.0,44.0,1.0,0.0,11,1,0.0
2,1,9/1/2008,1,2,36.4,0.0,1.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,12,1,0.0
3,2,11/1/2008,1,0,38.9,1.0,1.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,20,1,98600.0
4,2,11/1/2008,1,1,36.8,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,21,1,0.0


In [31]:
# Turn the qPCR reading into a binary label: 0.0 when the reading is exactly 0,
# 1.0 for any other recorded value.
# A missing reading stays NaN: `NaN != 0` is True, so without the mask a
# missing result would be recorded as a positive test.
# Vectorised; DataFrame.set_value no longer exists in pandas >= 1.0.
raw_qpcr = sym_dem_pcr_hsd['qPCR']
sym_dem_pcr_hsd['qPCR'] = (raw_qpcr != 0).astype(float).where(raw_qpcr.notnull())

In [32]:
# Derive a fever flag and five age-group indicator columns.
# Vectorised; DataFrame.set_value no longer exists in pandas >= 1.0.

# fever: 1.0 when the measured temperature is above 37.5, else 0.0.
# A missing temperature yields NaN rather than "no fever".
body_temp = sym_dem_pcr_hsd['bodytemp']
sym_dem_pcr_hsd['fever'] = (body_temp > 37.5).astype(float).where(body_temp.notnull())

# Age groups as half-open intervals [low, high) so that every age belongs to
# exactly one group: 0-4, 5-15, 16-44, 45-64, 65+.
# The earlier conditions (`> 0 and < 4`, then `>= 5`) left ages 0 and 4 in no group.
# Each indicator is 1.0 inside its group and 0.0 outside it; when the age
# itself is missing, all five indicators are NaN.
age_years = sym_dem_pcr_hsd['age']
age_groups = [
    ('age_1', 0, 5),
    ('age_2', 5, 16),
    ('age_3', 16, 45),
    ('age_4', 45, 65),
    ('age_5', 65, np.inf),
]
for group_col, low, high in age_groups:
    in_group = (age_years >= low) & (age_years < high)
    sym_dem_pcr_hsd[group_col] = in_group.astype(float).where(age_years.notnull())

In [33]:
# 'gender' is a straight copy of the male flag; 'female' is its complement
# (values other than 0/1 map to NaN).
male_to_female = {1.0: 0, 0.0: 1}
male_flag = sym_dem_pcr_hsd['male']
sym_dem_pcr_hsd['gender'] = male_flag
sym_dem_pcr_hsd['female'] = male_flag.map(male_to_female)

In [34]:
# Row count of the combined table before dropping rows without symptom data.
len(sym_dem_pcr_hsd)

5139

In [35]:
"""
    Filter out all rows that have NAs for all symptoms
"""
# Rows in which every symptom column (and body temperature) is missing.
symptom_columns = ['headache', 'bodytemp',
                   'sthroat', 'cough', 'pmuscle', 'rnose', 'phlegm']
no_symptom_data = sym_dem_pcr_hsd[symptom_columns].isnull().all(axis=1)
empty_row_labels = sym_dem_pcr_hsd[no_symptom_data].index
# Keep every row label that is not in the "all missing" set.
kept_row_labels = sym_dem_pcr_hsd.index.difference(empty_row_labels)
sym_dem_pcr_hsd = sym_dem_pcr_hsd.loc[kept_row_labels]

In [40]:
# Give three symptom columns their long names, then write the final table.
# index=str also converts the row labels to strings, as before.
symptom_renames = {'rnose': 'runnynose',
                   'sthroat': 'sorethroat',
                   'pmuscle': 'muscle'}
sym_dem_pcr_hsd = sym_dem_pcr_hsd.rename(index=str, columns=symptom_renames)
sym_dem_pcr_hsd.to_csv('hk_fit_March28.csv')