In [257]:
import pandas as pd
import re
import numpy as np
import datetime
from dateutil import parser

In [258]:
base_path = '~/Google Drive/Global Health Research/data/original_per_study/'

In [259]:
save_path = '~/Google Drive/Global Health Research/data/current/'

# Data Cleaning Notebook (1/15/17)
* Cleans each dataset
* Outputs cleaned versions where each row is considered a record
* This is different from the other cleaning notebook which contained functionality for each row being multiple records

### Cleaning Functionality

In [10]:
def age_clean(df=None, col=None):
    """
    For splitting age into 5 ranges
    
    Takes a dataframe and a column with the age in it as arguments
    """
    for index, row in df.iterrows():
        if row[col] > 0 and row[col] < 4:
            df.set_value(index, 'age_1', 1.0)
        if row[col] >= 5 and row[col] <= 15:
            df.set_value(index, 'age_2', 1.0)
        if row[col] >= 16 and row[col] <= 44:
            df.set_value(index, 'age_3', 1.0)
        if row[col] >= 45 and row[col] <= 64:
            df.set_value(index, 'age_4', 1.0)
        if row[col] >= 65:
            df.set_value(index, 'age_5', 1.0)

In [11]:
def clean_dataframe(d_df=None):
    """
    For cleaning the symptom names
    
    Takes a dataframe as an argument and returns a version with clean column names
    """
    d_df = d_df.copy()
    d_df.rename(columns=lambda x : x.lower(), inplace=True)
    d_df.rename(columns=lambda x: x.replace('anycough', 'cough'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('coughingsd', 'cough'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('coughing', 'cough'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('legpain', 'leg pain'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('coughing', 'cough'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('persistent sore throat', 'sorethroat'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('sorethroatsd', 'sorethroat'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('sore throat', 'sorethroat'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('sthroat', 'sorethroat'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('runny nose', 'runnynose'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('rnose', 'runnynose'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('runnynosesd', 'runnynose'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('running nose', 'runnynose'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('diarrhoea', 'diarrhea'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('legpain', 'leg pain'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('ja', 'joint aches'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('la', 'loss of appetite'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('ft', 'feeling tired'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('earinfectionsd', 'earache'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('muscleaches', 'muscle'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('pmuscle', 'muscle'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('muscleachessd', 'muscle'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('muscleache', 'muscle'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('musclesd', 'muscle'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('headachesd', 'headache'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('fatiguesd', 'fatigue'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('earachesd', 'earache'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('na', 'nausea'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('nauseausea', 'nausea'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('chills/night sweats', 'chills'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('chils', 'chills'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('chillssd', 'chills'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('ns', 'chills'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('feverchills', 'fever'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('feverish', 'fever'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('feversd', 'fever'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('sinusproblemssd', 'blockednose'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('sinusproblems', 'blockednose'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('vomitmit', 'vomit'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('vomiting', 'vomit'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('vomitinging', 'vomit'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('body achess', 'muscle'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('body ache', 'muscle'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('vo', 'vomit'), inplace=True)
    d_df.rename(columns=lambda x: x.replace('feeling tired', 'fatigue'), inplace=True)

    return d_df

# Loeb

In [599]:
loeb = pd.read_csv(base_path + 'loeb/LOEBRumi-request-20160622.csv')

In [600]:
loeb = loeb[[
 'AGE',
 'VIRUS1',
 'VIRUS2',
 'ChillsSD',
 'CoughingSD',
 'EarAcheSD',
 'EarInfectionSD',
 'FatigueSD',
 'FeverSD',
 'HeadacheSD',
 'MuscleachesSD',
 'RunnyNoseSD',
 'SinusproblemsSD',
 'SoreThroatSD', 
'Gender']]

In [601]:
loeb.head()

Unnamed: 0,AGE,VIRUS1,VIRUS2,ChillsSD,CoughingSD,EarAcheSD,EarInfectionSD,FatigueSD,FeverSD,HeadacheSD,MuscleachesSD,RunnyNoseSD,SinusproblemsSD,SoreThroatSD,Gender
0,45,,,,,4/28/09,4/29/09,,,4/28/09,,,,,Female
1,46,,,,4/1/09,4/1/09,,,,4/1/09,4/1/09,4/1/09,4/1/09,4/1/09,Female
2,17,,,,,,,,,4/1/09,,4/1/09,4/1/09,4/1/09,Female
3,29,,,,2/15/09,,,,,,,2/16/09,,2/15/09,Female
4,6,,,,2/23/09,,,,,,,2/25/09,,2/25/09,Female


In [602]:
for i, r in loeb.iterrows():
    loeb.set_value(i, r['Gender'], 1)

  


In [603]:
age_clean(loeb, 'AGE')

  from ipykernel import kernelapp as app
  del sys.path[0]
  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':


In [604]:
list(loeb)

['AGE',
 'VIRUS1',
 'VIRUS2',
 'ChillsSD',
 'CoughingSD',
 'EarAcheSD',
 'EarInfectionSD',
 'FatigueSD',
 'FeverSD',
 'HeadacheSD',
 'MuscleachesSD',
 'RunnyNoseSD',
 'SinusproblemsSD',
 'SoreThroatSD',
 'Gender',
 'Female',
 'Male',
 'age_4',
 'age_3',
 'age_2',
 'age_5',
 'age_1']

In [605]:
temp_df = loeb.notnull().astype('int')

In [12]:
temp_df = clean_dataframe(temp_df)

In [13]:
list(temp_df)

['age',
 'virus1',
 'virus2',
 'chills',
 'cough',
 'earache',
 'earache',
 'fatigue',
 'fever',
 'headache',
 'muscle',
 'runnynose',
 'blockednose',
 'sorethroat',
 'gender',
 'female',
 'male',
 'age_4',
 'age_3',
 'age_2',
 'age_5',
 'age_1']

In [306]:
temp_df['earache2'] = temp_df.iloc[:,5:7].max(axis=1)

In [307]:
temp_df = temp_df.drop(['earache'], axis=1)

In [308]:
temp_df['earache'] = temp_df['earache2']

In [309]:
temp_df = temp_df.drop(['earache2'], axis=1)

In [310]:
temp_df['virus'] = temp_df['virus1'] + temp_df['virus2']

In [311]:
list(temp_df)

['age',
 'virus1',
 'virus2',
 'chills',
 'cough',
 'fatigue',
 'fever',
 'headache',
 'muscle',
 'runnynose',
 'blockednose',
 'sorethroat',
 'gender',
 'female',
 'male',
 'age_4',
 'age_3',
 'age_2',
 'age_5',
 'age_1',
 'earache',
 'virus']

In [312]:
temp_df = temp_df.drop(['age', 'virus1', 'virus2'], axis=1)

In [313]:
temp_df = temp_df.astype(float).astype(int)
temp_df[temp_df>=1] = 1

In [314]:
set(temp_df['virus'])

{0, 1}

In [318]:
temp_df.to_csv(save_path + 'loeb/loeb.fit')

# Nigeria

In [320]:
nigeria = pd.read_csv(base_path + 'nigeria/RTI_data_organized.csv')

In [321]:
nigeria = nigeria.loc[nigeria['Area'] != 'Hospital']

In [322]:
nigeria = nigeria[[
    'Age',
    'Gender',
    'Fever',
    'cough',
    'Running Nose',
    'vomiting',
    'Body ache',
    'Leg pain',
    'Nausea',
    'Chils',
    'Persistent Sore throat',
    'Shortness of Breath',
    'OC43',
    'RSVB',
    'FLUA',
    'PIV1',
    'PIV2',
    'PIV3',
    'PIV4',
    '229E_nl63',
    'BoV',
    'HEV'
]].fillna(0)

In [323]:
for i,r in nigeria.iterrows():
    nigeria.set_value(i,'virus', sum(list(r[12:22])))


In [324]:
nigeria = nigeria.replace({'Yes': 1, 'no': np.nan, 'No': np.nan, 'yes ': 1})

In [325]:
nigeria = nigeria.drop(['OC43',
    'RSVB',
    'FLUA',
    'PIV1',
    'PIV2',
    'PIV3',
    'PIV4',
    '229E_nl63',
    'BoV',
    'HEV'], axis=1)

In [326]:
list(nigeria)

['Age',
 'Gender',
 'Fever',
 'cough',
 'Running Nose',
 'vomiting',
 'Body ache',
 'Leg pain',
 'Nausea',
 'Chils',
 'Persistent Sore throat',
 'Shortness of Breath',
 'virus']

In [327]:
nigeria['Gender'] = nigeria['Gender'].replace({'Male': 1, 'Female': 0})

In [328]:
for i, r in nigeria.iterrows():
    if r['Gender'] == 1:
        nigeria.set_value(i, 'Male',1)
    else:
        nigeria.set_value(i, 'Female',1)

In [329]:
nigeria = nigeria.drop(['Gender'], axis=1).fillna(0)

In [330]:
age_clean(nigeria, 'Age')

In [331]:
nigeria = nigeria.drop(['Age'], axis=1).fillna(0)

In [332]:
nigeria = clean_dataframe(nigeria)

In [333]:
nigeria.rename(columns=lambda x: x.replace('vomitmit', 'vomit'), inplace=True)

In [334]:
list(nigeria)

['fever',
 'cough',
 'runnynose',
 'vomit',
 'muscle',
 'leg pain',
 'nausea',
 'chills',
 'sorethroat',
 'shortness of breath',
 'virus',
 'female',
 'male',
 'age_2',
 'age_1']

In [335]:
nigeria = nigeria.astype(float).astype(int)
nigeria[nigeria>=1] = 1

In [336]:
nigeria.to_csv(save_path + '/nigeria/nigeria.csv')

# goviral

In [260]:
gv_df = pd.read_csv(base_path + '/goviral/Results_Combined 2.12.18.csv')

In [261]:
for i,r in gv_df.iterrows():

    try:
        yrs = datetime.datetime.now() - parser.parse(r['Birthday']).replace(tzinfo=None)
        yrs = (yrs/(365.25)).days
        if yrs < 0:
            yrs += 100
        gv_df.set_value(i, 'age', yrs)
    except Exception as e:
        print(r['Birthday'])
        print(e)


nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not flo

nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not float
nan
Parser must be a string or character stream, not flo

  


In [262]:
for i,r in gv_df.iterrows():
    if r['Date of Analysis']:
        dt = r['Date of Analysis']
    elif r['Symptom Report Date']:
        dt = r['Symptom Report Date']
    else:
        dt = r['Collection Date']
    gv_df.set_value(i, 'date_time', dt)



  


In [263]:
virus_mapping = {
    'en-rh': 'Rhinovirus (HRV)',
    'fla': 'Influenza A',
    'coronavirushku1': 'Coronavirus HKU1',
    'coronavirus229e': 'Coronavirus 229E',
    'rsvi': 'RSVI',
    'hrv': 'Rhinovirus (HRV)',
    'coronanl63': 'Coronavirus NL63',
    'flah3': 'Influenza H3',
    'influenzah1n1': 'Influenza A 2009 H1N1',
    'rsvb': 'RSV B',
    'fla-indeterm': 'Influenza Indeterminate',
    'flb': 'Influenza B',
    'para1': 'Parainfluenza 1',
    'coronanl-63': 'Coronavirus NL63',
    'coronaoc43': 'Coronavirus OC43',
    'influenzaah1n1': 'Influenza A 2009 H1N1',
    'para3':  'Parainfluenza 3',
    'rsva':  'RSV A',
    'corona229e': 'Coronavirus 229E',
    'mypn': 'mypn',
    'hmpv': 'hMPV',
    'coronanl-63': 'Coronavirus NL63',
    'para3': 'Parainfluenza 3',
    'adenov': 'Adenovirus',
    'para1': 'Parainfluenza 1',
    'para2': 'Parainfluenza 2',
}

In [264]:
def fix_format(val):
    return ''.join(str(val).lower().split(' '))

In [265]:
for i,r in gv_df.iterrows():
    kn1 = virus_mapping.get(fix_format(r['Kit Nasal']))
    kn2 = virus_mapping.get(fix_format(r['Kit Nasal 2']))
    ks1 = virus_mapping.get(fix_format(r['Kit Saliva']))
    ks2 = virus_mapping.get(fix_format(r['Kit Saliva 2']))
    result = virus_mapping.get(fix_format(r['Result']))
    
    if kn1:
        gv_df.set_value(i, kn1, 1)    
    if kn2:
        gv_df.set_value(i, kn2, 1)    
    if ks1:
        gv_df.set_value(i, ks1, 1)    
    if ks2:
        gv_df.set_value(i, ks2, 1)    
    if result:
        gv_df.set_value(i, result, 1)

  if __name__ == '__main__':
  del sys.path[0]
  # This is added back by InteractiveShellApp.init_path()
  from ipykernel import kernelapp as app


In [266]:
gv_df = gv_df.replace(
    {
        'positive': 1,
        'negative': np.nan,
        'no result': np.nan,
        'positive': 1, 
        'negative': np.nan,
        'DETECTED': 1, 
        'NOT DETECTED': np.nan,
        'none': np.nan,
        '-': np.nan,
        '+': 1
    }
)

In [267]:
for i, r in gv_df[[
 'Kit Nasal',
 'Kit Nasal 2',
 'Kit Saliva',
 'Kit Saliva 2',
 'Result',
 'Detect',
 'Influenza A',
 'Influenza A H1',
 'Influenza H3',
 'Influenza A 2009 H1N1',
 'Influenza B',
 'RSV A',
 'RSV B',
 'Parainfluenza 1',
 'Parainfluenza 2',
 'Parainfluenza 3',
 'Parainfluenza 4',
 'hMPV',
 'Rhinovirus (HRV)',
 'Adenovirus B/E',
 'Adenovirus C',
 'Coronavirus 229E',
 'Coronavirus NL63',
 'Coronavirus OC43',
 'Coronavirus HKU1',
 'RSVI',
 'Adenovirus',
 'mypn',
 'Influenza Indeterminate'
]].iterrows():
    if r.notnull().any():
        gv_df.set_value(i, 'result_aggregated', 1)
    else:
        gv_df.set_value(i, 'result_aggregated', 0)
        
    



In [268]:
gv_df = gv_df[[
    'result_aggregated',
    'Age',
    'date_time',
    'Gender',
    'Influenza A',
    'Influenza A H1',
    'Influenza H3',
    'Influenza A 2009 H1N1',
    'Influenza B',
    'RSV A',
    'RSV B',
    'Parainfluenza 1',
    'Parainfluenza 2',
    'Parainfluenza 3',
    'Parainfluenza 4',
    'hMPV',
    'Rhinovirus (HRV)',
    'Adenovirus B/E',
    'Adenovirus C',
    'Coronavirus 229E',
    'Coronavirus NL63',
    'Coronavirus OC43',
    'Coronavirus HKU1',
    'Fever',
    'Cough',
    'Runny nose',
    'Sore throat',
    'Body aches',
    'Fatigue',
    'Chills/night sweats',
    'Shortness of breath',
    'Leg pain',
    'Nausea',
    'Diarrhea',
    'Vomiting',
    'RSVI',
    'Adenovirus',
    'mypn',
    'Influenza Indeterminate'
]]

In [269]:
# These represent nasal and saliva specimens.
# So, we may have two different results for those - usually what happened is that something was detected in one,
# not the other. So we should count each specimen once, and just the result from N or S, 
# whichever one showed something.


In [270]:
gv_virus = gv_df[[
    'result_aggregated',
    'Influenza A',
    'Influenza A H1',
    'Influenza H3',
    'Influenza A 2009 H1N1',
    'Influenza B',
    'RSV A',
    'RSV B',
    'Parainfluenza 1',
    'Parainfluenza 2',
    'Parainfluenza 3',
    'Parainfluenza 4',
    'hMPV',
    'Rhinovirus (HRV)',
    'Adenovirus B/E',
    'Adenovirus C',
    'Coronavirus 229E',
    'Coronavirus NL63',
    'Coronavirus OC43',
    'Coronavirus HKU1',
    'RSVI',
    'Adenovirus',
    'mypn',
    'Influenza Indeterminate'
]].fillna(0)

In [271]:
gv_virus

Unnamed: 0,result_aggregated,Influenza A,Influenza A H1,Influenza H3,Influenza A 2009 H1N1,Influenza B,RSV A,RSV B,Parainfluenza 1,Parainfluenza 2,...,Adenovirus B/E,Adenovirus C,Coronavirus 229E,Coronavirus NL63,Coronavirus OC43,Coronavirus HKU1,RSVI,Adenovirus,mypn,Influenza Indeterminate
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [272]:
gv_virus = gv_virus.fillna(0)

In [273]:
gv_virus[gv_virus != 0] = np.nan

In [274]:
gv_virus = gv_virus.fillna(1)

In [275]:
gv_df[['result_aggregated',
    'Influenza A',
    'Influenza A H1',
    'Influenza H3',
    'Influenza A 2009 H1N1',
    'Influenza B',
    'RSV A',
    'RSV B',
    'Parainfluenza 1',
    'Parainfluenza 2',
    'Parainfluenza 3',
    'Parainfluenza 4',
    'hMPV',
    'Rhinovirus (HRV)',
    'Adenovirus B/E',
    'Adenovirus C',
    'Coronavirus 229E',
    'Coronavirus NL63',
    'Coronavirus OC43',
    'Coronavirus HKU1',
    'RSVI',
    'Adenovirus',
    'mypn',
    'Influenza Indeterminate']] = gv_virus

In [276]:
gv_symptoms = [
    'Fever',
    'Cough',
    'Runny nose',
    'Sore throat',
    'Body aches',
    'Fatigue',
    'Chills/night sweats',
    'Shortness of breath',
    'Leg pain',
    'Nausea',
    'Diarrhea',
    'Vomiting'
]

In [277]:
gv_df = gv_df.fillna(0).replace({'FALSE': np.nan, 'TRUE': 1,'No': np.nan, 'Yes': 1, 'Maybe later': np.nan})

In [278]:
gv_df = gv_df.fillna(0)

In [279]:
symptoms = gv_df[gv_symptoms]

In [280]:
symptoms[symptoms != 0] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._where(-key, value, inplace=True)


In [281]:
gv_df[gv_symptoms] = symptoms

In [282]:
gv_df['Gender'] = gv_df['Gender'].replace({
    0: 'Female',
    '0': 'Female',
    'male': 'Male',
    'Other': 'other',
    'female': 'Female'
})

In [283]:
gv_df['Gender'].value_counts()

Female    474
Male      151
other       6
Name: Gender, dtype: int64

In [284]:
gv_df = gv_df[gv_df['Gender'] != 'other']

In [285]:
for i, r in gv_df.iterrows():
    gv_df.set_value(i, r['Gender'], 1)

  


In [286]:
gv_df = gv_df.fillna(0)

In [287]:
gv_df = clean_dataframe(gv_df)

In [288]:
len(gv_df)

625

In [289]:
gv_df['result_aggregated'].value_counts()

1.0    349
0.0    276
Name: result_aggregated, dtype: int64

In [290]:
gv_df.rename(columns=lambda x: x.replace('nauseame', 'name'), inplace=True)

In [291]:
gv_df.rename(columns=lambda x: x.replace('vomitmit', 'vomit'), inplace=True)

In [292]:
list(gv_df)

['result_aggregated',
 'age',
 'date_time',
 'gender',
 'influenza a',
 'influenza a h1',
 'influenza h3',
 'influenza a 2009 h1n1',
 'influenza b',
 'rsv a',
 'rsv b',
 'parainfluenza 1',
 'parainfluenza 2',
 'parainfluenza 3',
 'parainfluenza 4',
 'hmpv',
 'rhinovirus (hrv)',
 'adenovirus b/e',
 'adenovirus c',
 'coronauseavirus 229e',
 'coronauseavirus nl63',
 'coronauseavirus oc43',
 'coronauseavirus hku1',
 'fever',
 'cough',
 'runnynose',
 'sorethroat',
 'muscles',
 'fatigue',
 'chills',
 'shortness of breath',
 'leg pain',
 'nausea',
 'diarrhea',
 'vomit',
 'rsvi',
 'adenovirus',
 'mypn',
 'influenza indeterminauseate',
 'male',
 'female']

In [293]:
gv_df_symptoms = gv_df[[
'result_aggregated',
 'fever',
 'cough',
 'runnynose',
 'sorethroat',
 'muscles',
 'fatigue',
 'chills',
 'shortness of breath',
 'leg pain',
 'nausea',
 'diarrhea',
 'vomit',
 'male',
 'female'
]]

In [294]:
gv_df_symptoms.rename(columns={'result_aggregated':'virus'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [295]:
gv_df_symptoms.to_csv('~/Google Drive/Global Health Research/data/current/fit/gv_fit.csv')

In [296]:
gv_df.to_csv('~/Google Drive/Global Health Research/data/current/full/gv_full_dataset.csv')

# WNYC

In [28]:
wnyc_dem = pd.read_csv(base_path + 'wnyc/wnyc_dem.csv')

In [29]:
wnyc_df = pd.read_csv(base_path + 'wnyc/wnyc.csv')

In [30]:
wnyc_df = wnyc_df.dropna(subset=['Name'])

In [31]:
for i,r in wnyc_dem.iterrows():
    first = r['What is your name? (first and last name)'].split(' ')[0]
    wnyc_dem.set_value(i,'first_name', first)
    

  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
for i,r in wnyc_df.iterrows():
    name = str(r['Name']).split(' ')[0]
    if name == 'Robbie':
        name = 'Robby'
    dem_row = wnyc_dem.loc[wnyc_dem['first_name'] == name]
    gender = dem_row['What is your gender?'].iloc[0]
    dob = dem_row['What is your date of birth?'].iloc[0]
    wnyc_df.set_value(i, 'gender', gender)
    wnyc_df.set_value(i, 'birth_date', dob)

    

  
  if __name__ == '__main__':


In [33]:
wnyc_df['Collection Date'] = pd.to_datetime(wnyc_df['Collection Date'])

In [34]:
wnyc_df  = wnyc_df.sort_values(by=['Name', 'Collection Date'])

In [35]:
wnyc_df = wnyc_df[[
    'Name',
    'gender',
    'birth_date',
    'Collection Date',
    'Symptom Report Date',
    'Influenza A',
    'Influenza A H1',
    'Influenza H3',
    'Influenza A 2009 H1N1',
    'Influenza B',
    'RSV A',
    'RSV B',
    'Parainfluenza 1',
    'Parainfluenza 2',
    'Parainfluenza 3',
    'Parainfluenza 4',
    'hMPV',
    'Rhinovirus (HRV)',
    'Adenovirus B/E',
    'Adenovirus C',
    'Coronavirus 229E',
    'Coronavirus NL63',
    'Coronavirus OC43',
    'Coronavirus HKU1',
    'Fever',
    'Cough',
    'Runny nose',
    'Sore throat',
    'Body aches',
    'Fatigue',
    'Chills/night sweats',
    'Shortness of breath',
    'Leg pain',
    'Nausea',
    'Diarrhea',
    'Vomiting'
]]

In [36]:
add_cols = [
    'Influenza A',
    'Influenza A H1',
    'Influenza H3',
    'Influenza A 2009 H1N1',
    'Influenza B',
    'RSV A',
    'RSV B',
    'Parainfluenza 1',
    'Parainfluenza 2',
    'Parainfluenza 3',
    'Parainfluenza 4',
    'hMPV',
    'Rhinovirus (HRV)',
    'Adenovirus B/E',
    'Adenovirus C',
    'Coronavirus 229E',
    'Coronavirus NL63',
    'Coronavirus OC43',
    'Coronavirus HKU1',
    'Fever',
    'Cough',
    'Runny nose',
    'Sore throat',
    'Body aches',
    'Fatigue',
    'Chills/night sweats',
    'Shortness of breath',
    'Leg pain',
    'Nausea',
    'Diarrhea',
    'Vomiting'
]


In [37]:
wnyc_df = wnyc_df.replace({'-': np.nan, '+': 1})

In [38]:
wnyc_df = wnyc_df.replace({'Female ': 'Female'})

In [39]:
wnyc_df = wnyc_df.fillna(0)

In [40]:
add_df = wnyc_df[add_cols]

In [41]:
add_df[add_df != 0] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._where(-key, value, inplace=True)


In [42]:
add_df = add_df.fillna(1)

In [43]:
wnyc_df[add_cols] = add_df

In [44]:
names = set(wnyc_df['Name'])

In [45]:
#I *think* < 4 (so 1, 2, 3) weeks apart should be treated as one illness. 
#The duplicates should basically be collapsed into one illness
#for symptoms (since they may differ across weeks of the same illness)
# just use the total of all symptoms ever reported for that particular illness.

new_wnyc_df = pd.DataFrame()
for x in names:
    temp_df = wnyc_df.loc[wnyc_df['Name'] == x]
    temp_df.sort_values('Collection Date')
    first_dt = temp_df['Collection Date'].iloc[0]
    temp_row = temp_df.iloc[0]
    for i, r in temp_df.iterrows():
        dt = r['Collection Date']
        
        if dt - first_dt < datetime.timedelta(days=28):
            temp_row[add_cols] += r[add_cols]
        else:
            new_dt = str(dt.month) + '/' + str(dt.year)
            new_wnyc_df = new_wnyc_df.append(temp_row)


            temp_row = r
            first_dt = dt
            
    new_wnyc_df = new_wnyc_df.append(temp_row)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)


In [46]:
new_wnyc_df = new_wnyc_df.reset_index(drop=True)

In [47]:
new_wnyc_df['month_yr'] = new_wnyc_df['Collection Date'].apply(lambda x: str(x.month) + '/' + str(x.year))

In [48]:
new_wnyc_df['virus'] = new_wnyc_df[[
    'Influenza A',
    'Influenza A H1',
    'Influenza H3',
    'Influenza A 2009 H1N1',
    'Influenza B',
    'RSV A',
    'RSV B',
    'Parainfluenza 1',
    'Parainfluenza 2',
    'Parainfluenza 3',
    'Parainfluenza 4',
    'hMPV',
    'Rhinovirus (HRV)',
    'Adenovirus B/E',
    'Adenovirus C',
    'Coronavirus 229E',
    'Coronavirus NL63',
    'Coronavirus OC43',
    'Coronavirus HKU1'
]].sum(axis=1)

In [49]:
for i, r in new_wnyc_df.iterrows():
    new_wnyc_df.set_value(i, r['gender'], 1)

  


In [50]:
new_wnyc_df.drop('gender', inplace=True, axis=1)

In [55]:
new_wnyc_df = new_wnyc_df.fillna(0)

In [56]:
new_wnyc_df = clean_dataframe(new_wnyc_df)

In [57]:
new_wnyc_df['virus'][new_wnyc_df['virus']>=1] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [58]:
new_wnyc_df.rename(columns=lambda x: x.replace('vomitmit', 'vomit'), inplace=True)

In [59]:
new_wnyc_df.rename(columns=lambda x: x.replace('nauseame', 'name'), inplace=True)

In [60]:
wnyc_just_symptoms = new_wnyc_df[[
 'muscles',
 'chills',
 'cough',
 'diarrhea',
 'fatigue',
 'fever',
 'leg pain',
 'nausea',
 'runnynose',
 'shortness of breath',
 'sorethroat',
 'vomit',
 'virus',
 'female',
 'male'
]]

In [61]:
wnyc_just_symptoms[wnyc_just_symptoms > 0] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._where(-key, value, inplace=True)


In [62]:
wnyc_just_symptoms.to_csv('~/Google Drive/Global Health Research/data/current/wnyc_fit.csv')

In [63]:
new_wnyc_df.to_csv('~/Google Drive/Global Health Research/data/current/wnyc_full_dataset.csv')

# fluwatch

In [337]:
fw_df = pd.read_csv(base_path + 'fluwatch/rumi-daily_data.csv')

In [338]:
fw_df = fw_df[np.isfinite(fw_df['myPCRinc_NumPos'])]

In [339]:
len(fw_df)

2759

In [340]:
fw_df_profile = pd.read_csv(base_path + 'fluwatch/rumi-burden_data.csv')

In [341]:
len(fw_df_profile)

118077

In [342]:
fw_df = fw_df.join(fw_df_profile[['p_id', 'sex']], on=['p_id'], lsuffix='l_')

In [343]:
len(fw_df)

2759

In [344]:
fw_df = fw_df[[
    'p_id',
    'sex',
    'fever',
    'feverish',
    'anycough',
    'sorethroat',
    'runnynose',
    'blockednose',
    'sneeze',
    'diarrhoea',
    'muscle',
    'headache',
    'rash',
    'earache',
    'wheezy',
    'ns',
    'ja',
    'la',
    'ft',
    'vo',
    'na',
     'myPCRinc_NumPos',
     'myPCRinc_IND_H1p',
     'myPCRinc_IND_H1s',
     'myPCRinc_IND_H3',
     'myPCRinc_IND_FluB',
     'myPCRinc_IND_RSV',
     'myPCRinc_IND_hMPV',
     'myPCRinc_IND_CoV',
     'myPCRinc_IND_PIV',
     'myPCRinc_IND_AdV',
     'myPCRinc_IND_hRhV',
     'PCRoutsideILLNESS'
]]

In [345]:
fw_df = fw_df[np.isfinite(fw_df['myPCRinc_NumPos'])].reset_index()

In [346]:
fw_df = fw_df.replace({'.a': np.nan, '.n': np.nan})
fw_df = fw_df.fillna(0)
fw_df = fw_df.astype(float).astype(int)
fw_df[fw_df>=1] = 1

In [347]:
list(fw_df)

['index',
 'p_id',
 'sex',
 'fever',
 'feverish',
 'anycough',
 'sorethroat',
 'runnynose',
 'blockednose',
 'sneeze',
 'diarrhoea',
 'muscle',
 'headache',
 'rash',
 'earache',
 'wheezy',
 'ns',
 'ja',
 'la',
 'ft',
 'vo',
 'na',
 'myPCRinc_NumPos',
 'myPCRinc_IND_H1p',
 'myPCRinc_IND_H1s',
 'myPCRinc_IND_H3',
 'myPCRinc_IND_FluB',
 'myPCRinc_IND_RSV',
 'myPCRinc_IND_hMPV',
 'myPCRinc_IND_CoV',
 'myPCRinc_IND_PIV',
 'myPCRinc_IND_AdV',
 'myPCRinc_IND_hRhV',
 'PCRoutsideILLNESS']

In [348]:
fw_df = fw_df.iloc[:,2:23]

In [349]:
fw_df = fw_df.rename(columns = {'myPCRinc_NumPos':'virus'})

In [350]:
fw_df['sex'] = fw_df['sex'].replace({1: 'male', 0: 'female'})

In [351]:
for i, r in fw_df.iterrows():
    fw_df.set_value(i, r['sex'], 1)

In [352]:
fw_df.drop('sex', inplace=True, axis=1)

In [353]:
list(fw_df)

['fever',
 'feverish',
 'anycough',
 'sorethroat',
 'runnynose',
 'blockednose',
 'sneeze',
 'diarrhoea',
 'muscle',
 'headache',
 'rash',
 'earache',
 'wheezy',
 'ns',
 'ja',
 'la',
 'ft',
 'vo',
 'na',
 'virus',
 'male',
 'female']

In [354]:
fw_df['fever'] = fw_df[['feverish','fever']].max(axis=1)

In [355]:
fw_df.drop('feverish', inplace=True, axis=1)

In [356]:
fw_df = clean_dataframe(fw_df)

In [357]:
list(fw_df)

['fever',
 'cough',
 'sorethroat',
 'runnynose',
 'blockednose',
 'sneeze',
 'diarrhea',
 'muscle',
 'headache',
 'rash',
 'earache',
 'wheezy',
 'chills',
 'joint aches',
 'loss of appetite',
 'fatigue',
 'vomit',
 'nausea',
 'virus',
 'male',
 'female']

In [358]:
len(fw_df)

2759

In [359]:
fw_df.to_csv(save_path + 'fluwatch/fw_fit.csv')

# NYUMC

In [495]:
nyumc_df = pd.read_csv(base_path + 'nyumc/nyumc.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [497]:
list(nyumc_df)

['ARRIVAL',
 'AGE_AT_ARRIVAL',
 'GENDER',
 'MEANS_OF_ARRIVAL',
 'CC',
 'ACUITY',
 'PRIMARY_DX',
 'SECONDARY_DX_NAME',
 'ED_TIME_IN_MIN',
 'feverchills',
 'Sorethroat',
 'Cough',
 'Muscleache',
 'Headache',
 'Fatigue',
 'Vomit',
 'Nausea',
 'Diarrhea',
 'Flu',
 'ADMIT_DATE',
 'DISCHARGE_DATE',
 'PATIENT_CLASS',
 'ADMIT_SOURCE',
 'DISCHARGE_DISP',
 'PRIMARY_PAYOR',
 'INSURANCE_PRODUCT',
 'LENGTH_OF_STAY',
 'TOTAL_CHARGES',
 'PRINCIPAL_DIAG_CODE',
 'PRINCIPAL_DIAG_DESC',
 'PRINCIPAL_PROC_CODE',
 'PRINCIPAL_PROC_DESC',
 'DISCHARGE_DISP_DESC',
 'FINANCIAL_CLASS',
 'PATIENT_AFTER_ED',
 'PATIENT_ZIP',
 'PATIENT_RACE',
 'PATIENT_ETHNICITY',
 'DRG_FEDERAL_DESC',
 'DRG_BILLED_DESC',
 'DRG_STATE_DESC',
 'DRG_APR_DESC',
 'CHIEF_COMPLAINT',
 'MORTALITY_RISK_IND',
 'Facility',
 'GENDERCODE',
 'Hight_m',
 'Weight_kg',
 'BMI',
 'TobUseCensus',
 'DrgUseCensus',
 'AlcUseCensus',
 'Tobacco_use',
 'Adm_diag1',
 'Adm_diag2',
 'Adm_diag3',
 'Adm_diag4',
 'Adm_diag5',
 'Adm_diag6',
 'smoking_quit_date1',
 'q

In [498]:
nyumc_df = nyumc_df[[
 'GENDER',
 'PRIMARY_DX',
 'feverchills',
 'Sorethroat',
 'Cough',
 'Muscleache',
 'Headache',
 'Fatigue',
 'Vomit',
 'Nausea',
 'Diarrhea',
 'Flu',
]]

In [499]:
primary_dx = ['virus', 'influenza', 'respiratory']

In [500]:
nyumc_df = clean_dataframe(nyumc_df)

In [501]:
nyumc_df.rename(columns=lambda x: x.replace('vomitmit', 'vomit'), inplace=True)

In [502]:
for i, r in nyumc_df.iterrows():
    nyumc_df.set_value(i, r['gender'].lower(), 1)

In [503]:
nyumc_df.drop('gender', axis=1, inplace=True)

In [504]:
nyumc_df = nyumc_df.fillna(0)

In [505]:
sampled_nyumc_df = nyumc_df.copy()

In [506]:
for i,r in sampled_nyumc_df.iterrows():
    for dx in primary_dx:
        if dx in str(r['primary_dx']).lower():
            break
        else:
            sampled_nyumc_df.drop(i, axis=0, inplace=True)
            break

In [507]:
sampled_nyumc_df.drop('primary_dx', axis=1, inplace=True)

In [513]:
sampled_nyumc_df = sampled_nyumc_df.reset_index(drop=True)

In [514]:
sampled_nyumc_df.to_csv(save_path + '/nyumc/nyumc_sampled_fit.csv')

In [515]:
sampled_nyumc_df['flu'].value_counts()

0    255
1     23
Name: flu, dtype: int64

In [516]:
len(sampled_nyumc_df)

278

In [525]:
nyumc_df = nyumc_df.drop('primary_dx', axis=1)

In [526]:
nyumc_df.to_csv(save_path + 'nyumc/nyumc_fit.csv')

In [517]:
nyumc_df['flu'].value_counts()

0    35258
1     2388
Name: flu, dtype: int64

In [None]:
list(sampled_nyumc_df)

# Hong Kong

In [710]:
pcr = pd.read_csv(base_path + 'hk/PCRresults.csv', sep=';')

In [711]:
sym = pd.read_csv(base_path + 'hk/symptom.csv', sep=';')

In [712]:
dem = pd.read_csv(base_path + 'hk/demographic.csv', sep=';')

In [713]:
hh = pd.read_csv(base_path + 'hk/household.csv', sep=';')

In [714]:
pcr['mhhID'] = pcr['hhID'].astype(str) + pcr['member'].astype(str)

In [715]:
pcr['visit'] = pcr['visit'].map({1: 0, 2: 3, 3: 6})

In [716]:
sym['mhhID'] = sym['hhID'].astype(str) + sym['member'].astype(str)

In [717]:
dem['mhhID'] = dem['hhID'].astype(str) + dem['member'].astype(str)

In [718]:
sym_dem = pd.merge(sym, dem, left_on='mhhID', right_on='mhhID')

In [719]:
manifest = pd.merge(sym_dem, pcr,  how='left', left_on=['mhhID','day'], right_on = ['mhhID','visit']).dropna()

In [720]:
for index, row in manifest.iterrows():
    if row['bodytemp'] > 37.5:
        manifest.set_value(index, 'fever', 1.0)
    if row['age'] > 0 and row['age'] < 4:
        manifest.set_value(index, 'age_1', 1.0)
    if row['age'] >= 5 and row['age'] <= 15:
        manifest.set_value(index, 'age_2', 1.0)
    if row['age'] >= 16 and row['age'] <= 44:
        manifest.set_value(index, 'age_3', 1.0)
    if row['age'] >= 45 and row['age'] <= 64:
        manifest.set_value(index, 'age_4', 1.0)
    if row['age'] >= 65:
        manifest.set_value(index, 'age_5', 1.0)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys
  if __name__ == '__main__':
  """
  del sys.path[0]
  # This is added back by InteractiveShellApp.init_path()


In [721]:
manifest['gender'] = manifest['male']

In [722]:
manifest.drop('male', axis=1, inplace=True)

In [723]:
manifest['gender'].value_counts()

0    2630
1    1749
Name: gender, dtype: int64

In [724]:
manifest['gender'] = manifest['gender'].replace({1: 'male', 0: 'female'})

In [725]:
for i, r in manifest.iterrows():
    manifest.set_value(i, r['gender'], 1)

  


In [726]:
manifest['flu'] = manifest['qPCR']

In [727]:
manifest.drop(['gender','qPCR'], axis=1, inplace=True)

In [728]:
manifest = manifest.reset_index()

In [729]:
# "Pooled specimens of nasal and throat swabs were collected from all household contacts, 
# regardless of whether the person was ill at the initial home visit, and at two follow-up visits 
# approximately 3 and 6 days later."

for i,r in manifest.iterrows():

    clinic_row = hh.loc[hh['hhID'] == r['hhID_x']]
    clinic_date = clinic_row['clinic_date']
    v2_day = clinic_row['v2_day'] - clinic_row['clinic_day']
    v3_day = clinic_row['v3_day'] - clinic_row['clinic_day']
    cd = parser.parse(clinic_date.iloc[0])
    
    dispatch_date = {
        '0': 0,
        '3': v2_day.iloc[0],
        '6': v3_day.iloc[0],
    }

    dt = datetime.timedelta(days=int(dispatch_date[str(r['day'])]))
    manifest.set_value(i, 'visit_date', cd + dt)
    



In [730]:
manifest = manifest[[
    'headache',
    'sthroat',
    'cough',
    'pmuscle',
    'rnose',
    'phlegm',
    'fever',
    'female',
    'male',
    'flu',
    'visit_date'
]]

In [731]:
hk_df = clean_dataframe(manifest)

In [732]:
hk_df = hk_df.fillna(0)

In [733]:
hk_df['flu'] = hk_df['flu'].astype(float).astype(int)


In [734]:
hk_df['flu'][hk_df['flu'] >=1] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [735]:
list(hk_df)

['headache',
 'sorethroat',
 'cough',
 'muscle',
 'runnynose',
 'phlegm',
 'fever',
 'female',
 'male',
 'flu',
 'visit_date']

In [743]:
hk_df_symptoms = hk_df[[
'headache',
 'sorethroat',
 'cough',
 'muscle',
 'runnynose',
 'phlegm',
 'fever',
 'female',
 'male',
 'flu',
]]

In [745]:
hk_df_symptoms

Unnamed: 0,headache,sorethroat,cough,muscle,runnynose,phlegm,fever,female,male,flu
0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1
1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1
2,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1


In [741]:
hk_df_symptoms.to_csv(save_path + '/hk_fit.csv')

In [742]:
hk_df.to_csv(save_path + '/hk_full_dataset.csv')