In [1]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

In [2]:
# Folder the per-dataset CSV files are read from (prefixed to each filename below).
directory_dataset = "../Data/Symptoms_Demo/All/"
# Folder the balanced CSV exports are written to.
# NOTE(review): no visible cell creates this folder -- confirm it exists before
# running the export cells.
store_directory = "../Data/Symptoms_Demo/Balanced_Data/"

In [3]:
# Column names selected by read_file() for the "symptom" view of a dataset.
# Gotcha: the list ends with 'virus', which is the label column counted by
# target_distribution(), so that view carries the label as well.
symptoms = ['fever','sorethroat','cough','muscle','headache','fatigue','vomit','nausea','diarrhea','chills','sneeze','shortness of breath','phlegm','blockednose','earache','leg pain','runnynose','virus']
# Column names selected by read_file() for the demographic view (age-bracket and sex columns).
demo = ['age 0-4', 'age 5-15', 'age 16-44', 'age 45-64','age 65+', 'male', 'female']

In [4]:
def read_file(filename):
    """Read a dataset CSV and return it together with two column views.

    Parameters
    ----------
    filename : str
        Path handed to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(frame, demo_view, symptom_view)`` where the views are ``frame``
        restricted to the module-level ``demo`` and ``symptoms`` column lists
        respectively.

    Raises
    ------
    KeyError
        If the file lacks any column named in ``demo`` or ``symptoms``.
    """
    frame = pd.read_csv(filename)
    # Selection order (demo first, then symptoms) matches the return order.
    demo_view, symptom_view = (frame[columns] for columns in (demo, symptoms))
    return frame, demo_view, symptom_view

In [5]:
def target_distribution(data):
    """Print and return the class balance of the binary ``virus`` label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``virus`` column. Rows equal to 1 are counted as
        positive, rows equal to 0 as negative; any other value is ignored
        by both counts.

    Returns
    -------
    tuple of (int, int)
        ``(pos, neg)`` row counts.

    Raises
    ------
    KeyError
        If ``data`` has no ``virus`` column.
    """
    labels = data['virus']
    pos = int((labels == 1).sum())
    neg = int((labels == 0).sum())
    labelled = pos + neg
    # With no 0/1-labelled rows the ratio is undefined; report NaN instead of
    # letting the division raise ZeroDivisionError.
    proportion = round(pos/labelled,5) if labelled else float('nan')
    print("Number of total samples : ",data.shape[0])
    print("Number of positive examples : ",pos)
    print("Number of negative examples : ",neg)
    print("Proportion of positive : ",proportion)
    # Count of rows that carry a 0/1 label; differs from the total above only
    # when other label values are present.
    print(labelled)
    return pos,neg

In [6]:
# Load nyumc.csv from directory_dataset and print its class balance;
# the positive/negative counts are kept in pos_nyumc / neg_nyumc.
filename_nyumc = 'nyumc.csv'
data_nyumc,nyumc_demo,nyumc_symp = read_file(directory_dataset+filename_nyumc)
pos_nyumc,neg_nyumc = target_distribution(data_nyumc)

Number of total samples :  21907
Number of positive examples :  9391
Number of negative examples :  12516
Proportion of positive :  0.42868
21907


In [7]:
# Load goviral.csv from directory_dataset and print its class balance;
# the positive/negative counts are kept in pos_gv / neg_gv.
filename_gv = 'goviral.csv'
data_gv,gv_demo,gv_symp = read_file(directory_dataset+filename_gv)
pos_gv,neg_gv = target_distribution(data_gv)

Number of total samples :  520
Number of positive examples :  264
Number of negative examples :  256
Proportion of positive :  0.50769
520


In [8]:
# Load fluwatch.csv from directory_dataset and print its class balance;
# the positive/negative counts are kept in pos_fw / neg_fw.
filename_fw = 'fluwatch.csv'
data_fw,fw_demo,fw_symp = read_file(directory_dataset+filename_fw)
pos_fw,neg_fw = target_distribution(data_fw)

Number of total samples :  915
Number of positive examples :  449
Number of negative examples :  466
Proportion of positive :  0.49071
915


In [9]:
# Load hongkong.csv from directory_dataset and print its class balance;
# the positive/negative counts are kept in pos_hk / neg_hk.
filename_hk = 'hongkong.csv'
data_hk,hk_demo,hk_symp = read_file(directory_dataset+filename_hk)
pos_hk,neg_hk = target_distribution(data_hk)

Number of total samples :  4954
Number of positive examples :  1512
Number of negative examples :  3442
Proportion of positive :  0.30521
4954


In [10]:
# Load hutterite.csv from directory_dataset and print its class balance;
# the positive/negative counts are kept in pos_ht / neg_ht.
filename_ht = 'hutterite.csv'
data_ht,ht_demo,ht_symp = read_file(directory_dataset+filename_ht)
pos_ht,neg_ht = target_distribution(data_ht)

Number of total samples :  1281
Number of positive examples :  720
Number of negative examples :  561
Proportion of positive :  0.56206
1281


In [11]:
def balance_dataset(data):
    """Balance the ``virus`` classes by randomly oversampling the minority class.

    Rows of the smaller class are drawn with replacement (fixed seed 100)
    until both classes have as many rows as the larger one. The larger class
    is kept untouched, and an already balanced frame is returned without any
    resampling. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``virus`` column holding 1 (positive) / 0 (negative).

    Returns
    -------
    pandas.DataFrame
        Positive rows followed by negative rows, with equal class counts.

    Raises
    ------
    ValueError
        If either class has no rows, since there is nothing to sample from.
    """
    positives = data.loc[data['virus'] == 1]
    negatives = data.loc[data['virus'] == 0]
    n_pos, n_neg = positives.shape[0], negatives.shape[0]

    if n_pos == 0 or n_neg == 0:
        raise ValueError(
            "balance_dataset needs at least one positive and one negative row "
            "(got %d positive, %d negative)" % (n_pos, n_neg)
        )

    # Grow whichever class is smaller. The previous version always resampled
    # the negatives to the positive count, which shrank them (with duplicates)
    # whenever negatives were the majority.
    if n_neg < n_pos:
        negatives = resample(negatives,replace = True,n_samples = n_pos,random_state = 100)
    elif n_pos < n_neg:
        positives = resample(positives,replace = True,n_samples = n_neg,random_state = 100)

    data1_upsampled = pd.concat([positives,negatives])
    print(data1_upsampled['virus'].value_counts())
    return data1_upsampled

In [12]:
def balance_smote(data):
    """Balance the ``virus`` classes with SMOTE and return a new frame.

    Every column except ``virus`` is used as a feature. Synthetic feature
    values are rounded to the nearest integer, and ``virus`` is appended as
    the last column of the result. Progress and the resulting class balance
    are printed.

    NOTE(review): each column is rounded independently, so grouped indicator
    columns (e.g. the age brackets) are not guaranteed to stay mutually
    exclusive in synthetic rows -- confirm this is acceptable downstream.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame with a binary ``virus`` label column.

    Returns
    -------
    pandas.DataFrame
        Feature columns in their original order, followed by ``virus``.
    """
    print("Original number of examples : ",data.shape[0])
    features = data.drop('virus',axis=1)

    # `ratio` was renamed `sampling_strategy` in imbalanced-learn; try the
    # current keyword first and fall back for releases that predate it.
    try:
        sm = SMOTE(random_state=12, sampling_strategy=1.0)
    except TypeError:
        sm = SMOTE(random_state=12, ratio = 1.0)
    # Likewise `fit_sample` was superseded by `fit_resample`.
    run_smote = getattr(sm, 'fit_resample', None) or sm.fit_sample

    # Plain float arrays go in so that arrays come back on every release
    # (DataFrame input can yield DataFrame output, which breaks positional
    # slicing) and interpolated values reach np.round as fractions.
    # NOTE(review): some releases cast synthetic rows back to the input dtype;
    # the float cast guards against truncation -- confirm on the installed version.
    x_res, y_res = run_smote(features.values.astype(float), data['virus'].values)

    # Build the frame in one step instead of inserting columns one at a time.
    new_data = pd.DataFrame(np.round(x_res), columns=list(features.columns))
    new_data['virus'] = y_res
    print("New data shape : ",new_data.shape)
    target_distribution(new_data)  # called for its printed summary only
    return new_data

In [18]:
# SMOTE-balance the nyumc data, write it to store_directory as nyumc.csv
# (without the index), and preview the first rows.
balanced_nyumc = balance_smote(data_nyumc)
balanced_nyumc.to_csv(store_directory+'nyumc.csv',index=False)
balanced_nyumc.head()

Original number of examples :  21907




New data shape :  (25032, 25)
Number of total samples :  25032
Number of positive examples :  12516
Number of negative examples :  12516
Proportion of positive :  0.5
25032


Unnamed: 0,fever,sorethroat,cough,muscle,headache,fatigue,vomit,nausea,diarrhea,chills,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1


In [17]:
# SMOTE-balance the goviral data, write it to store_directory as goviral.csv
# (without the index), and preview the first rows.
balanced_gv = balance_smote(data_gv)
balanced_gv.to_csv(store_directory+'goviral.csv',index=False)
balanced_gv.head()

Original number of examples :  520
New data shape :  (528, 25)
Number of total samples :  528
Number of positive examples :  264
Number of negative examples :  264
Proportion of positive :  0.5
528




Unnamed: 0,fever,sorethroat,cough,muscle,headache,fatigue,vomit,nausea,diarrhea,chills,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [14]:
# SMOTE-balance the fluwatch data, write it to store_directory as fluwatch.csv
# (without the index), print the resulting shape, and preview the first rows.
balanced_fw = balance_smote(data_fw)
balanced_fw.to_csv(store_directory+'fluwatch.csv',index=False)
print(balanced_fw.shape)
balanced_fw.head()


Original number of examples :  915
New data shape :  (932, 25)
Number of total samples :  932
Number of positive examples :  466
Number of negative examples :  466
Proportion of positive :  0.5
932
(932, 25)




Unnamed: 0,fever,sorethroat,cough,muscle,headache,fatigue,vomit,nausea,diarrhea,chills,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0


In [15]:
# SMOTE-balance the hongkong data, write it to store_directory as hongkong.csv
# (without the index), and preview the first rows.
balanced_hk = balance_smote(data_hk)
balanced_hk.to_csv(store_directory+'hongkong.csv',index=False)
balanced_hk.head()

Original number of examples :  4954
New data shape :  (6884, 25)
Number of total samples :  6884
Number of positive examples :  3442
Number of negative examples :  3442
Proportion of positive :  0.5
6884




Unnamed: 0,fever,sorethroat,cough,muscle,headache,fatigue,vomit,nausea,diarrhea,chills,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [16]:
# SMOTE-balance the hutterite data, write it to store_directory as hutterite.csv
# (without the index), and preview the first rows.
balanced_ht = balance_smote(data_ht)
balanced_ht.to_csv(store_directory+'hutterite.csv',index=False)
balanced_ht.head()

Original number of examples :  1281
New data shape :  (1440, 25)
Number of total samples :  1440
Number of positive examples :  720
Number of negative examples :  720
Proportion of positive :  0.5
1440




Unnamed: 0,fever,sorethroat,cough,muscle,headache,fatigue,vomit,nausea,diarrhea,chills,...,leg pain,runnynose,age 0-4,age 5-15,age 16-44,age 45-64,age 65+,male,female,virus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
