In [1]:
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn import linear_model
from sklearn.metrics import accuracy_score,f1_score,recall_score,roc_curve
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

**Combine the symptoms and the demographic information**

In [39]:
# Folder the raw per-study CSVs are read from; read_file() prepends it to each file name.
# NOTE(review): the recorded output of the loading cell shows a FileNotFoundError for
# goviral.csv under this path, and save_file() writes to ".../Combined/Train/" --
# confirm whether this should point at the Train folder instead.
directory = "../Data/Symptoms_Demo/NYUMC/Test/"

In [40]:
def read_file(filename):
    """Load one CSV from the notebook-level ``directory`` with a fresh 0..n-1 index.

    Parameters
    ----------
    filename : str
        File name appended directly to ``directory`` (no path separator is added).

    Returns
    -------
    pandas.DataFrame
        The parsed file, indexed by an integer index named ``'index'``.
    """
    frame = pd.read_csv(directory + filename)
    # Number the rows positionally, then promote that column to the index.
    frame['index'] = range(0, frame.shape[0])
    return frame.set_index('index')

In [41]:
def get_all_files(files_):
    """Read every listed file and key the resulting frames by file name stem.

    Parameters
    ----------
    files_ : iterable of str
        File names understood by ``read_file``.

    Returns
    -------
    collections.defaultdict
        Mapping ``stem -> DataFrame`` where ``stem`` is the file name with its
        extension removed (``'nyumc.csv'`` -> ``'nyumc'``).
    """
    # No default factory is given, so missing keys still raise KeyError
    # exactly like a plain dict.
    data = defaultdict()
    for name in files_:
        print("Reading file : ", name)
        # splitext drops whatever extension is present instead of assuming
        # that it is exactly four characters long (".csv").
        stem = os.path.splitext(name)[0]
        data[stem] = read_file(name)
    return data

In [42]:
# One CSV per study; get_all_files() keys each loaded frame by the name without ".csv".
files_ = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']

In [43]:
# Load every study listed in files_ into a mapping: study name -> DataFrame.
data = get_all_files(files_)

Reading file :  nyumc.csv
Reading file :  goviral.csv


FileNotFoundError: File b'../Data/Symptoms_Demo/NYUMC/Test/goviral.csv' does not exist

In [44]:
# Sanity check: list the study names that were actually loaded.
print("The read files are : ",data.keys())

The read files are :  dict_keys(['hutterite', 'goviral', 'hongkong', 'nyumc', 'fluwatch'])


*Separate the symptom and demographic information*

In [45]:
# Symptom columns looked up in every study frame (17 names).
symptoms = ['fever','sorethroat','cough','muscle','headache','fatigue','vomit','nausea','diarrhea','chills',
            'sneeze','shortness of breath','phlegm','blockednose','earache','leg pain','runnynose']

# Demographic columns looked up in every study frame: five age bands plus two sex flags (7 names).
demographic = ['age 0-4','age 5-15','age 16-44','age 45-64','age 65+','male','female']


In [46]:
def combine(symptoms,demographic):
    """Return every (symptom, demographic) pair as a list of 2-tuples.

    Pairs are ordered symptom-major: all demographics for the first symptom,
    then all demographics for the second symptom, and so on.
    """
    pairs = []
    for symptom in symptoms:
        for group in demographic:
            pairs.append((symptom, group))
    return pairs

In [47]:
# All symptom/demographic pairs: 17 symptoms x 7 demographic columns = 119 pairs.
combined_features = combine(symptoms,demographic)

In [48]:
def create_columns(combined_features):
    """Build one column name per feature pair, formatted ``"<first>_<second>"``.

    Parameters
    ----------
    combined_features : iterable of (str, str)
        Pairs whose first two elements are joined with an underscore.

    Returns
    -------
    list of str
        Column names in the same order as ``combined_features``.
    """
    names = []
    for pair in combined_features:
        names.append(pair[0] + "_" + pair[1])
    return names

In [49]:
# Column names for the combined features, e.g. "fever_male".
columns_ = create_columns(combined_features)

In [50]:
def create_new_dataframe(data_,combined_features,columns_):
    """Build symptom-by-demographic interaction features for one study.

    For every ``(symptom, demographic)`` pair a column named
    ``"<symptom>_<demographic>"`` is created that is 1 where both source
    columns equal 1 and 0 otherwise. The ``virus`` column of ``data_`` is
    copied across as the last column.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Source frame containing every symptom and demographic column named in
        ``combined_features`` plus a ``virus`` column.
    combined_features : iterable of (str, str)
        Pairs of (symptom column, demographic column).
    columns_ : list of str
        Initial column layout of the result; normally the names produced from
        ``combined_features``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data_``, in the same order, on a fresh 0..n-1
        index.

    Raises
    ------
    KeyError
        If ``data_`` lacks one of the referenced columns.
    """
    new_data = pd.DataFrame(columns=columns_)
    for pair in combined_features:
        symp = pair[0]
        dem = pair[1]
        both_present = (data_[symp] == 1) & (data_[dem] == 1)
        # np.where returns a plain array, so these columns are filled by
        # position, not by index label.
        new_data[symp+"_"+dem] = np.where(both_present, 1, 0)
    # Copy the label by position as well. Assigning the Series itself would
    # align on index labels against the fresh 0..n-1 index and silently yield
    # NaN for any input whose index is not exactly 0..n-1 (e.g. a filtered frame).
    new_data['virus'] = data_['virus'].values
    return new_data
    
    

In [51]:
# Build the combined-feature table for the 'nyumc' study and compare shapes before/after.
data_nyumc = create_new_dataframe(data['nyumc'],combined_features,columns_)
print("Original number of sample points : ",data['nyumc'].shape)
print("Final number of sample points : ",data_nyumc.shape)

Original number of sample points :  (2190, 25)
Final number of sample points :  (2190, 120)


In [52]:
# Preview the first rows of the 'nyumc' feature table.
data_nyumc.head()

Unnamed: 0,fever_age 0-4,fever_age 5-15,fever_age 16-44,fever_age 45-64,fever_age 65+,fever_male,fever_female,sorethroat_age 0-4,sorethroat_age 5-15,sorethroat_age 16-44,...,leg pain_male,leg pain_female,runnynose_age 0-4,runnynose_age 5-15,runnynose_age 16-44,runnynose_age 45-64,runnynose_age 65+,runnynose_male,runnynose_female,virus
0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Build the combined-feature table for the 'goviral' study and compare shapes before/after.
data_gv = create_new_dataframe(data['goviral'],combined_features,columns_)
print("Original number of sample points : ",data['goviral'].shape)
print("Final number of sample points : ",data_gv.shape)

Original number of sample points :  (520, 25)
Final number of sample points :  (520, 120)


In [26]:
# Preview the first rows of the 'goviral' feature table.
data_gv.head()

Unnamed: 0,fever_age 0-4,fever_age 5-15,fever_age 16-44,fever_age 45-64,fever_age 65+,fever_male,fever_female,sorethroat_age 0-4,sorethroat_age 5-15,sorethroat_age 16-44,...,leg pain_male,leg pain_female,runnynose_age 0-4,runnynose_age 5-15,runnynose_age 16-44,runnynose_age 45-64,runnynose_age 65+,runnynose_male,runnynose_female,virus
0,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [27]:
# Build the combined-feature table for the 'fluwatch' study and compare shapes before/after.
data_fw = create_new_dataframe(data['fluwatch'],combined_features,columns_)
print("Original number of sample points : ",data['fluwatch'].shape)
print("Final number of sample points : ",data_fw.shape)

Original number of sample points :  (915, 25)
Final number of sample points :  (915, 120)


In [28]:
# Preview the first rows of the 'fluwatch' feature table.
data_fw.head()

Unnamed: 0,fever_age 0-4,fever_age 5-15,fever_age 16-44,fever_age 45-64,fever_age 65+,fever_male,fever_female,sorethroat_age 0-4,sorethroat_age 5-15,sorethroat_age 16-44,...,leg pain_male,leg pain_female,runnynose_age 0-4,runnynose_age 5-15,runnynose_age 16-44,runnynose_age 45-64,runnynose_age 65+,runnynose_male,runnynose_female,virus
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
1,0,1,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [29]:
# Build the combined-feature table for the 'hongkong' study and compare shapes before/after.
data_hk = create_new_dataframe(data['hongkong'],combined_features,columns_)
print("Original number of sample points : ",data['hongkong'].shape)
print("Final number of sample points :",data_hk.shape)

Original number of sample points :  (4954, 25)
Final number of sample points : (4954, 120)


In [30]:
# Preview the first rows of the 'hongkong' feature table.
data_hk.head()

Unnamed: 0,fever_age 0-4,fever_age 5-15,fever_age 16-44,fever_age 45-64,fever_age 65+,fever_male,fever_female,sorethroat_age 0-4,sorethroat_age 5-15,sorethroat_age 16-44,...,leg pain_male,leg pain_female,runnynose_age 0-4,runnynose_age 5-15,runnynose_age 16-44,runnynose_age 45-64,runnynose_age 65+,runnynose_male,runnynose_female,virus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Build the combined-feature table for the 'hutterite' study and compare shapes before/after.
data_ht = create_new_dataframe(data['hutterite'],combined_features,columns_)
print("Original number of sample points : ",data['hutterite'].shape)
print("Final number of sample points : ",data_ht.shape)

Original number of sample points :  (1281, 25)
Final number of sample points :  (1281, 120)


In [32]:
# Preview the first rows of the 'hutterite' feature table.
data_ht.head()

Unnamed: 0,fever_age 0-4,fever_age 5-15,fever_age 16-44,fever_age 45-64,fever_age 65+,fever_male,fever_female,sorethroat_age 0-4,sorethroat_age 5-15,sorethroat_age 16-44,...,leg pain_male,leg pain_female,runnynose_age 0-4,runnynose_age 5-15,runnynose_age 16-44,runnynose_age 45-64,runnynose_age 65+,runnynose_male,runnynose_female,virus
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,1


#### Save the new data files with combined symptoms and demographics

In [33]:
def save_file(dataframe_, filename_,
              directory="../Data/Symptoms_Demo/NYUMC/Combined/Train/",
              overwrite=False):
    """Write a frame to ``directory/filename_`` as CSV without the index.

    The number of columns is printed first as a quick sanity check.

    Parameters
    ----------
    dataframe_ : pandas.DataFrame
        Frame to persist.
    filename_ : str
        Target file name inside ``directory``.
    directory : str, optional
        Output folder. Defaults to the combined-training folder this function
        has always written to, so existing two-argument calls are unaffected.
    overwrite : bool, optional
        When False (the default) an existing file is left untouched and
        "File exists!" is printed. Pass True to replace a stale file.

    Returns
    -------
    None
    """
    print(len(dataframe_.columns))
    # os.path.join copes with a directory given with or without a trailing separator.
    filename = os.path.join(directory, filename_)
    if os.path.isfile(filename) and not overwrite:
        print("File exists!")
    else:
        dataframe_.to_csv(filename, index=False)

In [34]:
# Write the 'nyumc' feature table to nyumc.csv (save_file skips the write if the file already exists).
save_file(data_nyumc,"nyumc.csv")

120


In [35]:
# Write the 'goviral' feature table to goviral.csv (save_file skips the write if the file already exists).
save_file(data_gv,"goviral.csv")

120


In [36]:
# Write the 'fluwatch' feature table (data_fw, built from data['fluwatch']) to fluwatch.csv.
# This cell previously passed data_gv, which stored the goviral rows under the fluwatch
# name. save_file skips files that already exist, so delete any fluwatch.csv produced by
# an earlier run before re-running this cell.
save_file(data_fw,"fluwatch.csv")

120


In [37]:
# Write the 'hongkong' feature table to hongkong.csv (save_file skips the write if the file already exists).
save_file(data_hk,"hongkong.csv")

120


In [38]:
# Write the 'hutterite' feature table to hutterite.csv (save_file skips the write if the file already exists).
save_file(data_ht,"hutterite.csv")

120
