In [1]:
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict

In [2]:
symptoms = ['fever',
            'sorethroat',
            'cough',
            'muscle',
            'headache',
            'fatigue',
            'vomit',
            'nausea',
            'diarrhea',
            'chills',
            'sneeze',
            'shortness of breath',
            'phlegm',
            'blockednose',
            'earache',
            'leg pain',
            'runnynose',
            'virus']

In [3]:
def read_file(filename_):
    return pd.read_csv(filename_)

In [4]:
symptom_combinations = []
for i in range(0,len(symptoms)-1):
    for j in range(i+1,len(symptoms)-1):
        symptom_combinations.append((symptoms[i],symptoms[j]))

In [5]:
def create_columns(combined_features):
    columns_ = [i[0]+"_"+i[1] for i in combined_features]
    return columns_

In [6]:
columns_ = create_columns(symptom_combinations)

In [7]:
def create_new_dataframe(data_,combined_features,columns_):
    new_data = pd.DataFrame(columns=columns_)
    for i in list(data_.columns):
        new_data[i] = data_[i]
    for j in combined_features:
        symp_1 = j[0]
        symp_2 = j[1]
        new_data[symp_1+"_"+symp_2] = np.where((new_data[symp_1] == 1) & (new_data[symp_2] ==1),1,0 )
    for k in combined_features:
        symp_1 = k[0]
        symp_2 = k[1]
        new_data[symp_1] = np.where((new_data[symp_1+"_"+symp_2] == 1),0,new_data[symp_1] )
        new_data[symp_2] = np.where((new_data[symp_1+"_"+symp_2] == 1),0, new_data[symp_2])
    return new_data
    

In [8]:
directory = "../Data/With_Improved_Target/With_Demographics/"
file_ = directory+'nyumc.csv'

In [9]:
data = read_file(file_)

In [10]:
print(data.head())

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      1           0      0       0         0        0      0       0   
1      0           1      1       0         0        0      0       0   
2      1           0      0       0         0        0      0       0   
3      0           0      1       0         0        0      0       0   
4      1           0      0       0         0        0      0       0   

   diarrhea  chills  ...    leg pain  runnynose  age 0-4  age 5-15  age 16-44  \
0         0       0  ...           0          0        0         0          1   
1         0       0  ...           0          0        0         0          0   
2         0       0  ...           0          0        0         1          0   
3         0       0  ...           0          0        0         0          0   
4         0       0  ...           0          0        0         0          0   

   age 45-64  age 65+  male  female  virus  
0          0        0     1  

In [11]:
data_nyumc = create_new_dataframe(data,symptom_combinations,columns_)
print("Original number of sample points : ",data.shape)
print("Final number of sample points : ",data_nyumc.shape)

Original number of sample points :  (21907, 25)
Final number of sample points :  (21907, 161)


In [12]:
data_nyumc.head()

Unnamed: 0,fever_sorethroat,fever_cough,fever_muscle,fever_headache,fever_fatigue,fever_vomit,fever_nausea,fever_diarrhea,fever_chills,fever_sneeze,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1


In [13]:
file_goviral = directory+'goviral.csv'

In [14]:
data = read_file(file_goviral)
print(data.head())
data_goviral = create_new_dataframe(data,symptom_combinations,columns_)
print("Original number of sample points : ",data.shape)
print("Final number of sample points : ",data_goviral.shape)

   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      0           1      1       0         0        1      0       0   
1      0           1      0       1         0        1      0       0   
2      0           1      1       0         0        0      0       0   
3      1           1      1       1         0        1      1       1   
4      0           1      1       0         0        0      0       0   

   diarrhea  chills  ...    leg pain  runnynose  age 0-4  age 5-15  age 16-44  \
0         0       0  ...           0          1        0         0          1   
1         1       1  ...           1          0        0         0          1   
2         0       0  ...           0          1        0         0          1   
3         1       1  ...           1          1        0         0          0   
4         0       1  ...           0          1        0         0          0   

   age 45-64  age 65+  male  female  virus  
0          0        0   1.0  

In [15]:
data_goviral.head()

Unnamed: 0,fever_sorethroat,fever_cough,fever_muscle,fever_headache,fever_fatigue,fever_vomit,fever_nausea,fever_diarrhea,fever_chills,fever_sneeze,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1.0,0.0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1.0,0.0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0.0,1.0,1
3,1,1,1,0,1,1,1,1,1,0,...,0,0,0,0,0,0,1,1.0,0.0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1.0,1


In [16]:
file_fluwatch = directory +'fluwatch.csv'
data = read_file(file_fluwatch)
data.head()

Unnamed: 0,fever,sorethroat,cough,muscle,headache,fatigue,vomit,nausea,diarrhea,chills,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,...,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,1.0,1.0,0.0,1.0,0,0,0,0.0,0,...,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
2,1.0,1.0,1.0,1.0,1.0,0,0,0,0.0,0,...,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,0.0,0.0,1.0,0.0,0.0,0,0,0,0.0,0,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,1.0,1.0,1.0,1.0,0.0,0,0,0,1.0,0,...,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


In [17]:
data_fluwatch = create_new_dataframe(data,symptom_combinations,columns_)
print("Original number of sample points : ",data.shape)
print("Final number of sample points : ",data_fluwatch.shape)

Original number of sample points :  (915, 25)
Final number of sample points :  (915, 161)


In [18]:
data_fluwatch.head()

Unnamed: 0,fever_sorethroat,fever_cough,fever_muscle,fever_headache,fever_fatigue,fever_vomit,fever_nausea,fever_diarrhea,fever_chills,fever_sneeze,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0,0,0,0,0,0,0,0,0,0,...,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
2,1,1,1,1,0,0,0,0,0,1,...,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,1,1,1,0,0,0,0,1,0,1,...,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


In [19]:
filename_hongkong = directory + 'hongkong.csv'
data = read_file(filename_hongkong)

In [20]:
data_hongkong = create_new_dataframe(data,symptom_combinations,columns_)
print("Original number of sample points : ",data.shape)
print("Final number of sample points : ",data_hongkong.shape)

Original number of sample points :  (4954, 25)
Final number of sample points :  (4954, 161)


In [21]:
data_fluwatch.head()

Unnamed: 0,fever_sorethroat,fever_cough,fever_muscle,fever_headache,fever_fatigue,fever_vomit,fever_nausea,fever_diarrhea,fever_chills,fever_sneeze,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0,0,0,0,0,0,0,0,0,0,...,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
2,1,1,1,1,0,0,0,0,0,1,...,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,1,1,1,0,0,0,0,1,0,1,...,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


In [22]:
filename_hutterite = directory + 'hutterite.csv'
data = read_file(filename_hutterite)
data.head()

Unnamed: 0,fever,sorethroat,cough,muscle,headache,fatigue,vomit,nausea,diarrhea,chills,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0,1,1,0,0,0,0,0,0,0,...,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
3,1,1,1,1,1,1,0,0,0,1,...,0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
4,0,1,1,0,0,0,0,0,0,0,...,0,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1


In [23]:
data_hutterite = create_new_dataframe(data,symptom_combinations,columns_)
print("Original number of sample points : ",data.shape)
print("Final number of sample points : ",data_hutterite.shape)

Original number of sample points :  (1281, 25)
Final number of sample points :  (1281, 161)


In [24]:
data_hutterite.head()

Unnamed: 0,fever_sorethroat,fever_cough,fever_muscle,fever_headache,fever_fatigue,fever_vomit,fever_nausea,fever_diarrhea,fever_chills,fever_sneeze,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
3,1,1,1,1,1,0,0,0,1,0,...,0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1


In [26]:
data_nyumc.to_csv('./Data/With_Demographics/nyumc.csv',index = False)

In [31]:
data_goviral.to_csv('./Data/With_Demographics/goviral.csv',index = False)

In [32]:
data_fluwatch.to_csv('./Data/With_Demographics/fluwatch.csv',index = False)

In [33]:
data_hongkong.to_csv('./Data/With_Demographics/hongkong.csv',index = False)

In [34]:
data_hutterite.to_csv('./Data/With_Demographics/hutterite.csv',index = False)

In [None]:
directory = "../Data/With_Improved_Target/With_Demographics/"
print("Loaded new directory")