* 

In [1]:
import time
import os
import pandas as pd
import numpy as np

In [2]:
def read_files(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or file-like
        Location of the CSV; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(filename)

In [5]:
# Folder holding the input CSVs. The trailing slash is required because file
# names are appended to it with plain string concatenation.
data_directory = "../Data/With_Improved_Target/With_Demographics/"

In [6]:
# Load the five study datasets, one CSV each, from data_directory.
data_nyumc = read_files(data_directory+"nyumc.csv")
data_gv = read_files(data_directory+"goviral.csv")
data_fw = read_files(data_directory+"fluwatch.csv")
data_hk = read_files(data_directory+"hongkong.csv")
data_ht = read_files(data_directory+"hutterite.csv")

In [7]:
def set_target(dataset):
    """Make sure the label column of ``dataset`` is called ``'virus'``.

    If ``'virus'`` is already a column the frame is returned untouched.
    Otherwise the ``'flu'`` column is renamed to ``'virus'`` on a new frame
    (the input is not modified).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame expected to contain either a ``'virus'`` or a ``'flu'`` column.

    Returns
    -------
    pandas.DataFrame
        A frame whose columns include ``'virus'``.

    Raises
    ------
    KeyError
        If neither ``'virus'`` nor ``'flu'`` is present. Without this check
        the rename would silently do nothing and the function would report
        "New columns" while returning a frame that still has no target.
    """
    if 'virus' in dataset.columns:
        print("Virus is already the target variable!")
        return dataset

    print("Virus is not present in the columns!")
    if 'flu' not in dataset.columns:
        raise KeyError(
            "Cannot set the target: neither 'virus' nor 'flu' is a column "
            "of the dataset."
        )
    dataset = dataset.rename(columns = {'flu':'virus'})
    print("New columns :\n")
    print(dataset.columns)
    return dataset

In [8]:
# Set the target variable for NYUMC ('flu' is renamed to 'virus' if 'virus' is absent).
data_nyumc = set_target(data_nyumc)

Virus is already the target variable!


In [9]:
# Set the target variable for GoViral ('flu' is renamed to 'virus' if 'virus' is absent).
data_gv = set_target(data_gv)

Virus is already the target variable!


In [10]:
# Set the target variable for FluWatch ('flu' is renamed to 'virus' if 'virus' is absent).
data_fw = set_target(data_fw)

Virus is already the target variable!


In [11]:
# Set the target variable for Hong Kong ('flu' is renamed to 'virus' if 'virus' is absent).
data_hk = set_target(data_hk)

Virus is already the target variable!


In [12]:
# Set the target variable for Hutterite ('flu' is renamed to 'virus' if 'virus' is absent).
data_ht = set_target(data_ht)

Virus is already the target variable!


In [13]:
# Columns kept from every dataset: the 17 symptom columns followed by the
# 'virus' target, in the order they appear in the saved files.
symptoms = [
    'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue',
    'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze',
    'shortness of breath', 'phlegm', 'blockednose', 'earache',
    'leg pain', 'runnynose',
    'virus',
]

In [14]:
# All five study frames in one list; get_data() iterates over this
# module-level name, so it must be defined before get_data() is called.
data = [data_nyumc,data_gv,data_fw,data_hk,data_ht]

In [15]:
# Report how many rows were loaded for each study.
row_counts = (
    ("NYUMC : ", data_nyumc),
    ("GoViral : ", data_gv),
    ("FluWatch : ", data_fw),
    ("HongKong : ", data_hk),
    ("Hutterite : ", data_ht),
)
for label, frame in row_counts:
    print(label, frame.shape[0])

NYUMC :  21907
GoViral :  520
FluWatch :  915
HongKong :  4954
Hutterite :  1281


In [16]:
def return_with_symptoms(dataset,symptoms):
    """Project ``dataset`` onto the requested columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame.
    symptoms : list of str
        Column labels to keep, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` restricted to ``symptoms``; all rows are kept.
    """
    return dataset[symptoms]

In [17]:
def choose_specific(dataset,symptoms,column):
    """Select the rows flagged in ``column`` and keep only ``symptoms``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame containing ``column`` and every label in ``symptoms``.
    symptoms : list of str
        Column labels to keep in the result.
    column : str
        Flag column; a row is kept when its value equals 1.

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to ``symptoms`` and carrying their
        original index labels.
    """
    is_flagged = dataset[column] == 1
    flagged_rows = dataset.loc[is_flagged]
    return return_with_symptoms(flagged_rows, symptoms)

In [18]:
def get_data(demographic, datasets=None, symptom_columns=None):
    """Collect, across several datasets, the rows flagged for a demographic.

    Each dataset is filtered with ``choose_specific`` (rows where the
    ``demographic`` column equals 1, restricted to the symptom columns) and
    the pieces are stacked with ``pd.concat``. The number of rows in the
    result is printed.

    Parameters
    ----------
    demographic : str
        Name of the flag column to filter on, e.g. ``'male'``.
    datasets : iterable of pandas.DataFrame, optional
        Frames to draw from. Defaults to the notebook-level ``data`` list.
    symptom_columns : list of str, optional
        Columns to keep. Defaults to the notebook-level ``symptoms`` list.

    Returns
    -------
    pandas.DataFrame
        The stacked subsets. Original row labels are kept, so the index may
        contain duplicates.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``datasets`` is empty.
    """
    # Look the globals up at call time so existing one-argument calls keep
    # working, while callers can pass the inputs explicitly.
    if datasets is None:
        datasets = data
    if symptom_columns is None:
        symptom_columns = symptoms

    subsets = [
        choose_specific(frame, symptom_columns, demographic)
        for frame in datasets
    ]
    combined = pd.concat(subsets)
    print(combined.shape[0])
    return combined

In [19]:
# Inspect the columns available in NYUMC before filtering on them.
data_nyumc.columns

Index(['Unnamed: 0', 'fever', 'sorethroat', 'cough', 'muscle', 'headache',
       'fatigue', 'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze',
       'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain',
       'runnynose', 'age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+',
       'male', 'female', 'virus'],
      dtype='object')

### Get all datapoints where a demographic flag (male, female, or an age group) is positive from all the datasets

In [20]:
# Rows from every dataset where 'male' equals 1; the printed number is the row count.
male = get_data('male')

13671


In [21]:
# Rows from every dataset where 'female' equals 1; the printed number is the row count.
female = get_data('female')

15856


In [22]:
# age1: rows from every dataset where 'age 0-4' equals 1.
age1 = get_data('age 0-4')

5209


In [23]:
# age2: rows from every dataset where 'age 5-15' equals 1.
age2 = get_data('age 5-15')

3780


In [24]:
# age3: rows from every dataset where 'age 16-44' equals 1.
age3 = get_data('age 16-44')

10667


In [25]:
# age4: rows from every dataset where 'age 45-64' equals 1.
age4 = get_data('age 45-64')

5587


In [26]:
# age5: rows from every dataset where 'age 65+' equals 1.
age5 = get_data('age 65+')

4061


### Get only the symptom columns (and the `virus` target) for all the datasets

In [27]:
# NYUMC restricted to the columns listed in `symptoms` (all rows kept).
nyumc = return_with_symptoms(data_nyumc,symptoms)

In [28]:
# GoViral restricted to the columns listed in `symptoms` (all rows kept).
goviral = return_with_symptoms(data_gv,symptoms)

In [29]:
# FluWatch restricted to the columns listed in `symptoms` (all rows kept).
fluwatch = return_with_symptoms(data_fw,symptoms)

In [30]:
# Hong Kong restricted to the columns listed in `symptoms` (all rows kept).
hongkong = return_with_symptoms(data_hk,symptoms)

In [31]:
# Hutterite restricted to the columns listed in `symptoms` (all rows kept).
hutterite = return_with_symptoms(data_ht,symptoms)

### Get datapoints based on the collection mode

In [32]:
# This binds a second name to the same DataFrame object as `nyumc`; it is not a copy.
clinically_collected = nyumc

In [33]:
# Stack the GoViral and FluWatch symptom frames, GoViral rows first.
individually_reported = pd.concat([goviral,fluwatch])

In [34]:
# Stack the Hong Kong and Hutterite symptom frames, Hong Kong rows first.
health_worker_facilitated = pd.concat([hongkong,hutterite])

### All the datapoints together in one dataset

In [35]:
# Stack all five symptom-only frames into a single dataset.
total = pd.concat([nyumc,goviral,fluwatch,hongkong,hutterite])

### Save the datafiles

In [37]:
def save_file(dataset,filename):
    """Preview a DataFrame and write it to CSV without the index column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to persist; its first rows are printed before writing.
    filename : str, os.PathLike, or file-like
        Destination passed to ``DataFrame.to_csv``. When it is a filesystem
        path, any missing parent directories are created first so the write
        does not fail on a fresh checkout.

    Returns
    -------
    None
    """
    print(dataset.head())
    # Only path targets have a parent directory to prepare; file-like
    # objects are handed to to_csv untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filename))
        if parent:
            os.makedirs(parent, exist_ok=True)
    dataset.to_csv(filename,index = False)

In [40]:
# Output folder for the processed CSVs. The trailing slash is required
# because file names are appended with plain string concatenation.
directory = "../Data/With_Improved_Target/Processed_Features/"

In [41]:
# Write the `male` subset to male.csv under `directory`.
save_file(male,directory+"male.csv")

    fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0     1.0         0.0    0.0     0.0       0.0        0      0       0   
7     1.0         0.0    0.0     0.0       0.0        0      0       0   
10    1.0         0.0    0.0     0.0       0.0        0      0       0   
11    1.0         0.0    1.0     0.0       0.0        0      0       0   
14    0.0         0.0    1.0     0.0       0.0        0      0       0   

    diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0        0.0       0     0.0                    0     0.0          0.0   
7        0.0       0     0.0                    0     0.0          0.0   
10       0.0       0     0.0                    0     0.0          0.0   
11       0.0       0     0.0                    0     0.0          0.0   
14       0.0       0     0.0                    0     0.0          0.0   

    earache  leg pain  runnynose  virus  
0       0.0         0        0.0      0  
7       0.0         0     

In [42]:
# Write the `female` subset to female.csv under `directory`.
save_file(female,directory+"female.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
1    0.0         0.0    1.0     0.0       0.0        0      0       0   
2    0.0         0.0    1.0     1.0       0.0        0      0       0   
3    1.0         0.0    1.0     0.0       0.0        0      0       0   
4    1.0         0.0    0.0     0.0       0.0        0      0       0   
5    1.0         0.0    1.0     0.0       0.0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
1       0.0       0     0.0                    0     0.0          0.0   
2       0.0       0     0.0                    0     0.0          0.0   
3       0.0       0     0.0                    0     0.0          0.0   
4       0.0       0     0.0                    0     0.0          0.0   
5       0.0       0     0.0                    0     0.0          0.0   

   earache  leg pain  runnynose  virus  
1      0.0         0        0.0      1  
2      0.0         0        0.0      1  

In [43]:
# Write the 'age 0-4' subset to age1.csv under `directory`.
save_file(age1,directory+"age1.csv")

    fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0     1.0         0.0    0.0     0.0       0.0        0      0       0   
16    1.0         0.0    0.0     0.0       0.0        0      1       0   
21    1.0         0.0    0.0     0.0       0.0        0      0       0   
22    1.0         0.0    0.0     0.0       0.0        0      0       0   
27    0.0         0.0    1.0     0.0       0.0        0      0       0   

    diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0        0.0       0     0.0                    0     0.0          0.0   
16       0.0       0     0.0                    0     0.0          0.0   
21       0.0       0     0.0                    0     0.0          0.0   
22       0.0       0     0.0                    0     0.0          0.0   
27       1.0       0     0.0                    0     0.0          0.0   

    earache  leg pain  runnynose  virus  
0       0.0         0        0.0      0  
16      0.0         0     

In [44]:
# Write the 'age 5-15' subset to age2.csv under `directory`.
save_file(age2,directory+"age2.csv")

    fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
3     1.0         0.0    1.0     0.0       0.0        0      0       0   
14    0.0         0.0    1.0     0.0       0.0        0      0       0   
15    1.0         0.0    1.0     0.0       0.0        0      0       0   
45    1.0         0.0    0.0     0.0       0.0        0      0       0   
48    1.0         0.0    0.0     0.0       0.0        0      0       0   

    diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
3        0.0       0     0.0                    0     0.0          0.0   
14       0.0       0     0.0                    0     0.0          0.0   
15       0.0       0     0.0                    0     0.0          0.0   
45       0.0       0     0.0                    0     0.0          0.0   
48       0.0       0     0.0                    0     0.0          0.0   

    earache  leg pain  runnynose  virus  
3       0.0         0        0.0      1  
14      0.0         0     

In [45]:
# Write the 'age 16-44' subset to age3.csv under `directory`.
save_file(age3,directory+"age3.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
2    0.0         0.0    1.0     1.0       0.0        0      0       0   
4    1.0         0.0    0.0     0.0       0.0        0      0       0   
5    1.0         0.0    1.0     0.0       0.0        0      0       0   
8    1.0         0.0    0.0     0.0       0.0        0      0       0   
9    0.0         0.0    1.0     0.0       0.0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
2       0.0       0     0.0                    0     0.0          0.0   
4       0.0       0     0.0                    0     0.0          0.0   
5       0.0       0     0.0                    0     0.0          0.0   
8       0.0       0     0.0                    0     0.0          0.0   
9       0.0       0     0.0                    0     0.0          0.0   

   earache  leg pain  runnynose  virus  
2      0.0         0        0.0      1  
4      0.0         0        0.0      1  

In [46]:
# Write the 'age 45-64' subset to age4.csv under `directory`.
save_file(age4,directory+"age4.csv")

    fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
6     0.0         0.0    1.0     0.0       0.0        0      0       0   
18    1.0         0.0    0.0     0.0       0.0        0      0       0   
23    1.0         0.0    0.0     0.0       0.0        0      1       0   
24    0.0         0.0    1.0     0.0       0.0        0      0       0   
30    1.0         0.0    0.0     0.0       0.0        0      0       0   

    diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
6        0.0       0     0.0                    0     0.0          0.0   
18       0.0       0     0.0                    0     0.0          0.0   
23       1.0       0     0.0                    0     0.0          0.0   
24       0.0       0     0.0                    0     0.0          0.0   
30       0.0       0     0.0                    0     0.0          0.0   

    earache  leg pain  runnynose  virus  
6       0.0         0        0.0      0  
18      0.0         0     

In [47]:
# Write the 'age 65+' subset to age5.csv under `directory`.
save_file(age5,directory+"age5.csv")

    fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
1     0.0         0.0    1.0     0.0       0.0        0      0       0   
7     1.0         0.0    0.0     0.0       0.0        0      0       0   
10    1.0         0.0    0.0     0.0       0.0        0      0       0   
12    1.0         0.0    0.0     0.0       0.0        0      0       0   
25    0.0         0.0    1.0     0.0       0.0        0      0       0   

    diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
1        0.0       0     0.0                    0     0.0          0.0   
7        0.0       0     0.0                    0     0.0          0.0   
10       0.0       0     0.0                    0     0.0          0.0   
12       0.0       0     0.0                    0     0.0          0.0   
25       0.0       0     0.0                    0     0.0          0.0   

    earache  leg pain  runnynose  virus  
1       0.0         0        0.0      1  
7       0.0         0     

In [48]:
# Write the clinically collected data (the NYUMC symptom frame) to clinically_collected.csv.
save_file(clinically_collected,directory+"clinically_collected.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      1           0      0       0         0        0      0       0   
1      0           0      1       0         0        0      0       0   
2      0           0      1       1         0        0      0       0   
3      1           0      1       0         0        0      0       0   
4      1           0      0       0         0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0       0            0   
1         0       0       0                    0       0            0   
2         0       0       0                    0       0            0   
3         0       0       0                    0       0            0   
4         0       0       0                    0       0            0   

   earache  leg pain  runnynose  virus  
0        0         0          0      0  
1        0         0          0      1  

In [49]:
# Write the combined GoViral + FluWatch frame to individually_reported.csv.
save_file(individually_reported,directory+"individually_reported.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0    0.0         1.0    0.0     0.0       0.0        1      0       0   
1    0.0         0.0    0.0     0.0       0.0        0      0       0   
2    1.0         1.0    1.0     1.0       0.0        1      1       1   
3    0.0         0.0    1.0     0.0       0.0        0      0       0   
4    1.0         1.0    1.0     0.0       0.0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0       0.0       0     0.0                    0       0          0.0   
1       0.0       0     0.0                    0       0          0.0   
2       1.0       1     0.0                    1       0          0.0   
3       0.0       0     0.0                    0       0          0.0   
4       0.0       0     0.0                    0       0          0.0   

   earache  leg pain  runnynose  virus  
0        0         0        1.0      0  
1        0         0        0.0      0  

In [50]:
# Write the combined Hong Kong + Hutterite frame to health_worker_facilitated.csv.
save_file(health_worker_facilitated,directory+"health_worker_facilitated.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0    0.0         0.0    0.0     0.0       0.0        0      0       0   
1    0.0         0.0    0.0     0.0       0.0        0      0       0   
2    0.0         0.0    0.0     0.0       0.0        0      0       0   
3    0.0         0.0    1.0     0.0       0.0        0      0       0   
4    0.0         0.0    0.0     0.0       0.0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0     0.0            0   
1         0       0       0                    0     0.0            0   
2         0       0       0                    0     0.0            0   
3         0       0       0                    0     1.0            0   
4         0       0       0                    0     0.0            0   

   earache  leg pain  runnynose  virus  
0      0.0         0        0.0      0  
1      0.0         0        0.0      0  

In [51]:
# Write the frame combining all five datasets to total.csv.
save_file(total,directory+"total.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0    1.0         0.0    0.0     0.0       0.0        0      0       0   
1    0.0         0.0    1.0     0.0       0.0        0      0       0   
2    0.0         0.0    1.0     1.0       0.0        0      0       0   
3    1.0         0.0    1.0     0.0       0.0        0      0       0   
4    1.0         0.0    0.0     0.0       0.0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0       0.0       0     0.0                    0     0.0          0.0   
1       0.0       0     0.0                    0     0.0          0.0   
2       0.0       0     0.0                    0     0.0          0.0   
3       0.0       0     0.0                    0     0.0          0.0   
4       0.0       0     0.0                    0     0.0          0.0   

   earache  leg pain  runnynose  virus  
0      0.0         0        0.0      0  
1      0.0         0        0.0      1  

In [52]:
# Write the symptom-only NYUMC frame to nyumc.csv under `directory`.
save_file(nyumc,directory+"nyumc.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      1           0      0       0         0        0      0       0   
1      0           0      1       0         0        0      0       0   
2      0           0      1       1         0        0      0       0   
3      1           0      1       0         0        0      0       0   
4      1           0      0       0         0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0       0            0   
1         0       0       0                    0       0            0   
2         0       0       0                    0       0            0   
3         0       0       0                    0       0            0   
4         0       0       0                    0       0            0   

   earache  leg pain  runnynose  virus  
0        0         0          0      0  
1        0         0          0      1  

In [53]:
# Write the symptom-only GoViral frame to goviral.csv under `directory`.
save_file(goviral,directory+"goviral.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      0           1      0       0         0        1      0       0   
1      0           0      0       0         0        0      0       0   
2      1           1      1       1         0        1      1       1   
3      0           0      1       0         0        0      0       0   
4      1           1      1       0         0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0       0            0   
1         0       0       0                    0       0            0   
2         1       1       0                    1       0            0   
3         0       0       0                    0       0            0   
4         0       0       0                    0       0            0   

   earache  leg pain  runnynose  virus  
0        0         0          1      0  
1        0         0          0      0  

In [54]:
# Write the symptom-only FluWatch frame to fluwatch.csv under `directory`.
save_file(fluwatch,directory+"fluwatch.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0    1.0         1.0    1.0     1.0       1.0        0      0       0   
1    0.0         0.0    1.0     0.0       0.0        0      0       0   
2    0.0         1.0    1.0     1.0       1.0        0      0       0   
3    0.0         0.0    0.0     0.0       1.0        0      0       0   
4    0.0         0.0    0.0     0.0       0.0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0       0.0       0     1.0                    0       0          0.0   
1       0.0       0     0.0                    0       0          0.0   
2       0.0       0     1.0                    0       0          0.0   
3       0.0       0     1.0                    0       0          0.0   
4       0.0       0     0.0                    0       0          0.0   

   earache  leg pain  runnynose  virus  
0        0         0        1.0      0  
1        0         0        1.0      1  

In [55]:
# Write the symptom-only Hong Kong frame to hongkong.csv under `directory`.
save_file(hongkong,directory+"hongkong.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0    0.0         0.0    0.0     0.0       0.0        0      0       0   
1    0.0         0.0    0.0     0.0       0.0        0      0       0   
2    0.0         0.0    0.0     0.0       0.0        0      0       0   
3    0.0         0.0    1.0     0.0       0.0        0      0       0   
4    0.0         0.0    0.0     0.0       0.0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0     0.0            0   
1         0       0       0                    0     0.0            0   
2         0       0       0                    0     0.0            0   
3         0       0       0                    0     1.0            0   
4         0       0       0                    0     0.0            0   

   earache  leg pain  runnynose  virus  
0        0         0        0.0      0  
1        0         0        0.0      0  

In [56]:
# Write the symptom-only Hutterite frame to hutterite.csv under `directory`.
save_file(hutterite,directory+"hutterite.csv")

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      0           0      1       0         0        0      0       0   
1      0           0      0       0         0        0      0       0   
2      0           1      0       1         1        0      0       0   
3      0           0      1       0         0        0      0       0   
4      1           0      1       1         1        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0       0            0   
1         0       1       0                    0       0            0   
2         0       1       0                    0       0            1   
3         0       0       0                    0       0            0   
4         0       1       0                    0       0            1   

   earache  leg pain  runnynose  virus  
0      0.0         0          1      1  
1      0.0         0          1      0  