In [1]:
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn import linear_model
from sklearn.metrics import accuracy_score,f1_score,recall_score,roc_curve
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

Flat Domain adaptation, one dataset is held out, the model is trained on the remaining four datasets. Prediction is done using the parameters of the dataset that is closest to the held out one.

In [2]:
symptoms = ['fever','cough','muscle','sorethroat','virus']

In [3]:
TRAIN_DIRECTORY = "../../../Data/Symptoms_Demo/Goviral/Train/"
TEST_DIRECTORY = "../../../Data/Symptoms_Demo/Goviral/Test/"
for fn in os.listdir(TRAIN_DIRECTORY):
    print(fn)

.DS_Store
hongkong.csv
loeb.csv
fluwatch.csv
hutterite.csv
goviral.csv


In [4]:
# TRAIN_DIRECTORY = "../../../Data/Symptoms_Demo/Goviral/Train/"
# TEST_DIRECTORY = "../../../Data/Symptoms_Demo/Goviral/Test/"
coefficients = defaultdict()

In [5]:
def read_file(filename):
    data = pd.read_csv(filename)
#     data = data[symptoms]
#     print(data.head())
    return data

In [6]:
def get_training_data(files_):
    data = defaultdict()
    columns = defaultdict()
    for i in files_:
        name = i
        name = name.replace('.csv','')
        data[name] = read_file(TRAIN_DIRECTORY+i)
        columns[name] = list(data[name].columns)
        columns[name].remove('virus')
    return data,columns

In [7]:
def overlap_columns(columns_):
    all_columns = list(columns_.values())
    overlap = list(set(all_columns[0]) & set(all_columns[1]) & set(all_columns[2]) & set(all_columns[3]))
    return overlap
    

In [8]:
def create_columns(columns_):
    overlap = overlap_columns(columns_)
    new_columns = []
    temp = []
    for i in columns_.keys():
        x = [i.replace('.csv','')+'_'+j for j in columns_[i]]
        temp.append(x)
    t = [val for sublist in temp for val in sublist]
    new_columns = t + overlap
    new_columns.append('virus')
    return new_columns

In [9]:
def create_new_dataframe(data,columns):
    new_columns = create_columns(columns)
    new_dataset = defaultdict()
    for i,name in enumerate(data.keys()):
        new_data = pd.DataFrame(columns=new_columns)
        dataset = data[name]
        for j in columns[name]:
            new_data[name+'_'+j] = dataset[j]
            new_data[j] = dataset[j]
        new_data['virus'] = dataset['virus']
        new_data.fillna(0,inplace=True)
        new_dataset[name] = new_data
    #concatenate all the dataframe
    new_dataset = pd.concat(new_dataset.values())
    return new_dataset

In [10]:
def ml_model(dataset):
    lm = linear_model.LogisticRegression()
    x_train = dataset.drop(['virus'],axis = 1)
    y_train = dataset['virus']
    x = lm.fit(x_train,y_train)
    coeff = x.coef_.tolist()[0]
    return lm,coeff

#### Heldout dataset : NYUMC

In [11]:
# files_nyumc = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# data_nyumc,columns_nyumc = get_training_data(files_nyumc)

# #create the dataframe for domain adaptation
# new_dataset_nyumc = create_new_dataframe(data_nyumc,columns_nyumc)

# coeff_without_nyumc = ml_model(new_dataset_nyumc)
# coefficients['nyumc'] = coeff_without_nyumc

#### Heldout dataset : Goviral

In [12]:
files_goviral = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']


data_goviral,columns_goviral = get_training_data(files_goviral)
new_dataset_goviral = create_new_dataframe(data_goviral,columns_goviral)

coeff_without_goviral = ml_model(new_dataset_goviral)
coefficients['goviral'] = coeff_without_goviral

#### Heldout dataset : FluWatch

In [13]:
# files_fluwatch = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# data_fluwatch,columns_fluwatch = get_training_data(files_fluwatch)
# new_dataset_fluwatch = create_new_dataframe(data_fluwatch,columns_fluwatch)

# coeff_without_fluwatch = ml_model(new_dataset_fluwatch)
# coefficients['fluwatch'] = coeff_without_fluwatch

#### Heldout dataset : HongKong

In [14]:
# files_hongkong = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# data_hongkong,columns_hongkong = get_training_data(files_hongkong)
# new_dataset_hongkong = create_new_dataframe(data_hongkong,columns_hongkong)

# coeff_without_hongkong = ml_model(new_dataset_hongkong)
# coefficients['hongkong'] = coeff_without_hongkong

#### Heldout dataset : Hutterite

In [15]:
# files_hutterite = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
# data_hutterite,columns_hutterite = get_training_data(files_hutterite)
# new_dataset_hutterite = create_new_dataframe(data_hutterite,columns_hutterite)

# coeff_without_hutterite = ml_model(new_dataset_hutterite)
# coefficients['hutterite'] = coeff_without_hutterite

In [16]:
# remove the target variable
def remove_target(data_dict):
    for i in data_dict.keys():
        data_dict[i].drop(['virus'],axis = 1,inplace = True)
    return data_dict

In [17]:
#get the entire training data
files_ = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv','loeb.csv']
data_,columns_ = get_training_data(files_)
print(data_.keys())
data_ = remove_target(data_)

dict_keys(['loeb', 'goviral', 'hongkong', 'fluwatch', 'hutterite'])


#### Model for testing the performance

In [18]:
def test_model(train_data,test_data):
    lm,coeff = ml_model(train_data)
    train = test_data.drop(['virus'],axis = 1)
    test = test_data['virus']
    y_pred = lm.predict(train)
    acc = accuracy_score(test,y_pred)
    fpr,tpr,threshold = roc_curve(test,y_pred)
    auc_score = metrics.auc(fpr,tpr)
    return acc,auc_score
    

#### Prepare data for testing

In [19]:
data_,columns = get_training_data(files_)

In [20]:
def create_data_for_testing(data,name,columns_):
    new_data = pd.DataFrame(columns = columns_)
    columns_for_data = list(data.columns)
    col = [x for x in columns_for_data if x != 'virus']
    for i in col:
        new_data[name+'_'+i] = data[i]
        new_data[i] = data[i]
    new_data['virus'] = data['virus']
    new_data.fillna(0,inplace = True)
    return new_data

In [21]:
def test_against_all(dataset_name,to_be_tested_names,data_,original_data,store_):
    columns = list(original_data.columns)
    data = data_[dataset_name]
    for i in to_be_tested_names:
        temp_data = create_data_for_testing(data,i,columns)
        acc,auc_score = test_model(original_data,temp_data)
        print("Comparing against ",i)
        print("Accuracy : ",acc)
        print("Auc Score : ",auc_score)
        print("____________________________")
        store_[i] = auc_score
    return store_

#### Test NYUMC

In [22]:
# store_nyumc = defaultdict()
# print("Testing NYUMC Data!\n")
# store_nyumc = test_against_all('nyumc',['goviral','fluwatch','hongkong','hutterite'],data_,new_dataset_nyumc,store_nyumc)

In [23]:
# store_nyumc

#### Test Goviral

In [24]:
store_gv = defaultdict()
print("Testing Goviral data!\n")
store_gv = test_against_all('hutterite',['goviral','fluwatch','hongkong','hutterite'],data_,new_dataset_goviral,store_gv)

Testing Goviral data!

Comparing against  goviral
Accuracy :  0.7447306791569087
Auc Score :  0.7543104083286272
____________________________
Comparing against  fluwatch
Accuracy :  0.615144418423107
Auc Score :  0.6288641441795368
____________________________
Comparing against  hongkong
Accuracy :  0.7080405932864949
Auc Score :  0.7262516266850002
____________________________
Comparing against  hutterite
Accuracy :  0.8930523028883685
Auc Score :  0.8920715986937412
____________________________


In [25]:
store_gv

defaultdict(None,
            {'fluwatch': 0.6288641441795368,
             'goviral': 0.7543104083286272,
             'hongkong': 0.7262516266850002,
             'hutterite': 0.8920715986937412})

#### Test Fluwatch

In [26]:
store_fw = defaultdict()
print("Testing FluWatch data!\n")
store_fw = test_against_all('fluwatch',['fluwatch','goviral','hongkong','hutterite'],data_,new_dataset_fluwatch,store_fw)

Testing FluWatch data!



NameError: name 'new_dataset_fluwatch' is not defined

In [None]:
store_fw

#### Test Hongkong

In [None]:
store_hk = defaultdict()
print("Testing Hongkong!\n")
store_hk = test_against_all('hongkong',['hongkong','goviral','fluwatch','hutterite'],data_,new_dataset_hongkong,store_hk)

In [None]:
store_hk

#### Test hutterite

In [None]:
store_ht = defaultdict()
print("Testing hutterite!\n")
store_ht = test_against_all('hutterite',['hutterite','goviral','fluwatch','hongkong'],data_,new_dataset_hutterite,store_ht)

In [None]:
store_ht