**Update the target variable to only keep the most predictive symptoms**

In [1]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
from collections import defaultdict

In [2]:
def read_file(filename):
    """Load a CSV file into a pandas DataFrame.

    ``filename`` is passed unchanged to ``pd.read_csv`` using its default
    parsing options.
    """
    frame = pd.read_csv(filename)
    return frame

In [3]:
def get_positive_negative(dataframe,target):
    """Split the rows of ``dataframe`` by the value of the ``target`` column.

    Returns a ``(positive, negative)`` tuple: the rows whose target equals 1
    and the rows whose target equals 0. Rows holding any other value end up
    in neither part.
    """
    labels = dataframe[target]
    is_positive = labels == 1
    is_negative = labels == 0
    return dataframe.loc[is_positive], dataframe.loc[is_negative]

In [4]:
def create_unique_column(dataframe,target):
    """Add a 'set' column holding one string key per row.

    Every column except ``target`` is converted to ``str`` and the pieces are
    concatenated, in column order, without a separator. The column is added
    to ``dataframe`` in place and the same object is returned.

    NOTE(review): with no separator, values wider than one character could
    yield identical keys for different rows -- confirm inputs are single
    digits.
    """
    feature_names = [name for name in dataframe.columns if name != target]
    # Stringify the feature values, then glue them together row by row.
    as_text = dataframe[feature_names].astype(str)
    dataframe['set'] = as_text.apply(''.join, axis=1)
    return dataframe


In [5]:
def get_unique(df,target):
    """Return ``df`` with duplicate rows removed, ignoring ``target``.

    Two rows are duplicates when they agree on every column other than
    ``target``; the first occurrence is kept. The row counts before and after
    are printed.
    """
    feature_names = [name for name in df.columns if name != target]
    print("Number of rows before removing the duplicates : ",df.shape[0])
    deduplicated = df.drop_duplicates(subset=feature_names, keep='first')
    print("Number of unique rows : ",deduplicated.shape[0])
    return deduplicated

In [6]:
def get_count(dataframe,groupby_name='set',count_name='virus'):
    """Count the non-null ``count_name`` values within each ``groupby_name`` group.

    Returns a dict mapping each group key to its count.
    """
    grouped = dataframe.groupby(groupby_name)
    per_group = grouped[count_name].count()
    return per_group.to_dict()

In [7]:
def set_counts(dataframe,count_dict,name='set'):
    """Look up a count for every row of ``dataframe``.

    For each value in column ``name`` (in row order) the matching entry of
    ``count_dict`` is returned, or 0 when the value is not a key of the dict.

    Returns a list with one count per row.
    """
    # One dict lookup per row instead of rebuilding and scanning a list of
    # all keys for every row.
    return [count_dict.get(key, 0) for key in dataframe[name]]
        

In [8]:
def update_dataframe(dataframe,positive,negative,target='virus'):
    """Attach per-row counts to ``dataframe`` and recompute the target column.

    Adds the columns 'positive', 'negative', 'sum' (positive + negative) and
    'ratio' (positive / sum) in place. The printed threshold is the total of
    ``positive`` divided by the total of 'sum'. ``target`` is then
    overwritten with 1 where 'ratio' >= threshold and 0 elsewhere. The same
    (mutated) dataframe is returned.
    """
    dataframe['positive'] = positive
    dataframe['negative'] = negative
    per_row_total = dataframe[['positive','negative']].sum(axis=1)
    dataframe['sum'] = per_row_total
    dataframe['ratio'] = dataframe['positive'] / per_row_total
    # A row with no counted samples yields 0/0 = NaN; treat that as ratio 0.
    # NOTE(review): this fills NaN in *every* column, not only 'ratio'.
    dataframe.fillna(0,inplace=True)
    total_positive = sum(positive)
    total_samples = sum(dataframe['sum'].tolist())
    threshold = total_positive / total_samples
    print("Threshold : ",threshold)
    dataframe[target] = np.where(dataframe['ratio'] >= threshold, 1, 0)
    return dataframe


In [9]:
def duplicate_rows(dataframe,columns_to_consider=['fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue',
       'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze',
       'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain',
       'runnynose', 'virus']):
    """Expand each row of ``dataframe`` into ``row['sum']`` copies.

    The expanded rows are shuffled (``sample(frac=1)``), re-indexed from 0 and
    restricted to ``columns_to_consider``. Rows whose 'sum' is 0 are dropped.

    Parameters:
        dataframe: frame holding a 'sum' column with the repeat count per row.
        columns_to_consider: columns (and their order) kept in the result.

    Returns:
        A new DataFrame; ``dataframe`` itself is not modified.
    """
    # ``DataFrame.append`` was removed in pandas 2.0 and appending row by row
    # is quadratic, so all copies are built with one positional repeat.
    repeat_counts = dataframe['sum'].clip(lower=0).astype(int).to_numpy()
    row_positions = np.repeat(np.arange(len(dataframe)), repeat_counts)
    expanded = dataframe.iloc[row_positions]
    expanded = expanded.sample(frac=1).reset_index(drop=True)
    # Copy the column list so the shared default is never aliased.
    return expanded[list(columns_to_consider)]
    

In [10]:
def save_file(filename,data):
    """Write ``data`` to ``filename`` as CSV, leaving out the index column."""
    data.to_csv(path_or_buf=filename, index=False)

In [11]:
def complete_transformation(filename,columns_to_consider=[]):
    """Run the whole target-rewriting pipeline on one CSV file.

    Steps, each announced with a progress message:
      1. read the file and add the per-row 'set' key;
      2. count, per key, how many rows have 'virus' == 1 and 'virus' == 0;
      3. keep one row per distinct feature combination;
      4. recompute 'virus' for those rows from the positive/negative counts;
      5. expand every row back to its original multiplicity.

    ``columns_to_consider`` is forwarded to ``duplicate_rows`` and selects the
    columns of the returned DataFrame.
    """
    frame = read_file(filename)
    print("File read!")

    frame = create_unique_column(frame,'virus')
    print("Unique column created!")

    positive_rows,negative_rows = get_positive_negative(frame,'virus')
    print("Got the positive and negative samples!")

    positive_by_key = get_count(positive_rows,'set','virus')
    negative_by_key = get_count(negative_rows,'set','virus')
    print("Got the counts of the positive and the negative!")

    distinct_rows = get_unique(frame,'virus')
    print("Got the unique rows in the original dataset!")

    positive_counts = set_counts(distinct_rows,positive_by_key)
    negative_counts = set_counts(distinct_rows,negative_by_key)
    print("Got the counts for each distinct row!")

    distinct_rows = update_dataframe(distinct_rows,positive_counts,negative_counts)
    print("Updated the columns")

    return duplicate_rows(distinct_rows,columns_to_consider)

In [12]:
def process(directory,store_directory,filename,columns_to_consider):
    """Transform one dataset file and save the result under the same name.

    Parameters:
        directory: folder containing the input CSV.
        store_directory: folder the transformed CSV is written to.
        filename: name of the CSV file inside both folders.
        columns_to_consider: columns kept in the saved file.

    The first rows of the transformed data are printed before saving.
    """
    # os.path.join copes with a directory given with or without a trailing
    # separator; plain string concatenation silently built a wrong path.
    source_path = os.path.join(directory, filename)
    target_path = os.path.join(store_directory, filename)

    data = complete_transformation(source_path,columns_to_consider)
    print(data.head())

    save_file(target_path,data)

In [13]:
# Input folder, output folder and kept columns for the symptom-only
# "Total" files.
directory = "../Data/Symptoms/Total/"
store_directory = "../Data/With_Improved_Target/Symptoms/Total/"
columns_to_consider = [
    'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue',
    'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze',
    'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain',
    'runnynose',
    'virus',  # target column
]

In [14]:
# Dataset files processed by the loops below.
filenames = [
    'nyumc.csv',
    'goviral.csv',
    'fluwatch.csv',
    'hutterite.csv',
    'hongkong.csv',
]

In [78]:
# Rewrite the target of every dataset, printing its name framed by blank
# lines before the progress messages.
for csv_name in filenames:
    print("\n", csv_name, "\n", sep="\n")
    process(directory, store_directory, csv_name, columns_to_consider)



nyumc.csv


File read!
Unique column created!
Got the positive and negative samples!
Got the counts of the positive and the negative!
Number of rows before removing the duplicates :  21907
Number of unique rows :  74
Got the unique rows in the original dataset!
Got the counts for each distinct row!
Threshold :  0.0266124982882
Updated the columns
   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      0           1      0       0         0        0      0       0   
1      1           0      0       0         0        0      0       0   
2      1           0      0       0         0        0      0       0   
3      1           0      0       0         0        0      0       0   
4      1           0      0       0         0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0       0            0   
1         0       0       0                    0       0        

#### For demographics

In [15]:
# Input folder, output folder and kept columns for the files that also carry
# the age-group and sex columns.
directory = "../Data/With_Demographics/"
store_directory = "../Data/With_Improved_Target/With_Demographics/"
columns_to_consider = [
    'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue',
    'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze',
    'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain',
    'runnynose',
    'age 0-4', 'age 5-15', 'age 16-44', 'age 45-64', 'age 65+',
    'male', 'female',
    'virus',  # target column
]

In [16]:
# Same run as for the symptom-only data, now on the demographics files.
for csv_name in filenames:
    print("\n", csv_name, "\n", sep="\n")
    process(directory, store_directory, csv_name, columns_to_consider)



nyumc.csv


File read!
Unique column created!
Got the positive and negative samples!
Got the counts of the positive and the negative!
Number of rows before removing the duplicates :  21907
Number of unique rows :  319
Got the unique rows in the original dataset!
Got the counts for each distinct row!
Threshold :  0.026612498288218378
Updated the columns
   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      1           0      0       0         0        0      0       0   
1      0           1      1       0         0        0      0       0   
2      1           0      0       0         0        0      0       0   
3      0           0      1       0         0        0      0       0   
4      1           0      0       0         0        0      0       0   

   diarrhea  chills  ...    leg pain  runnynose  age 0-4  age 5-15  age 16-44  \
0         0       0  ...           0          0        0         0          1   
1         0       0  ...           0      

#### Train

In [43]:
# Input folder, output folder and kept columns for the symptom-only
# "Train" files.
directory = "../Data/Symptoms/Train/"
store_directory = "../Data/With_Improved_Target/Symptoms/Train/"
columns_to_consider = [
    'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue',
    'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze',
    'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain',
    'runnynose',
    'virus',  # target column
]

In [44]:
# Dataset files processed for the train split.
filenames = [
    'nyumc.csv',
    'goviral.csv',
    'fluwatch.csv',
    'hongkong.csv',
    'hutterite.csv',
]

In [45]:
# Rewrite the target of every train file.
for csv_name in filenames:
    print("\n", csv_name, "\n", sep="\n")
    process(directory, store_directory, csv_name, columns_to_consider)



nyumc.csv


File read!
Unique column created!
Got the positive and negative samples!
Got the counts of the positive and the negative!
Number of rows before removing the duplicates :  17526
Number of unique rows :  66
Got the unique rows in the original dataset!
Got the counts for each distinct row!
Threshold :  0.0269884742668
Updated the columns
   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      1           0      0       0         0        0      1       0   
1      1           0      0       0         0        0      0       0   
2      0           0      1       0         0        0      0       0   
3      1           0      0       0         0        0      0       1   
4      1           0      0       0         0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         1       0       0                    0       0            0   
1         0       0       0                    0       0        

#### Test

In [38]:
# Input folder, output folder and kept columns for the symptom-only
# "Test" files.
directory = "../Data/Symptoms/Test/"
store_directory = "../Data/With_Improved_Target/Symptoms/Test/"
columns_to_consider = [
    'fever', 'sorethroat', 'cough', 'muscle', 'headache', 'fatigue',
    'vomit', 'nausea', 'diarrhea', 'chills', 'sneeze',
    'shortness of breath', 'phlegm', 'blockednose', 'earache', 'leg pain',
    'runnynose',
    'virus',  # target column
]

In [39]:
# Dataset files processed for the test split.
filenames = [
    'nyumc.csv',
    'goviral.csv',
    'fluwatch.csv',
    'hongkong.csv',
    'hutterite.csv',
]

In [41]:
# Rewrite the target of every test file.
for csv_name in filenames:
    print("\n", csv_name, "\n", sep="\n")
    process(directory, store_directory, csv_name, columns_to_consider)



nyumc.csv


File read!
Unique column created!
Got the positive and negative samples!
Got the counts of the positive and the negative!
Number of rows before removing the duplicates :  4381
Number of unique rows :  50
Got the unique rows in the original dataset!
Got the counts for each distinct row!
Threshold :  0.0251084227345
Updated the columns
   fever  sorethroat  cough  muscle  headache  fatigue  vomit  nausea  \
0      1           0      0       0         0        0      0       0   
1      1           0      0       0         0        0      0       0   
2      1           0      0       0         0        0      0       0   
3      1           0      0       0         0        0      0       0   
4      1           0      0       0         0        0      0       0   

   diarrhea  chills  sneeze  shortness of breath  phlegm  blockednose  \
0         0       0       0                    0       0            0   
1         0       0       0                    0       0         