In [1]:
import pandas as pd
import numpy as np
import os
import time
from collections import defaultdict
from sklearn import linear_model
from sklearn.metrics import accuracy_score,f1_score,recall_score,roc_curve
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

Flat Domain adaptation, one dataset is held out, the model is trained on the remaining four datasets. Prediction is done using the parameters of the dataset that is closest to the held out one.

In [2]:
TRAIN_DIRECTORY = "../../../Data/Combined/"
# TEST_DIRECTORY = "../../../Data/Symptoms/Test/"
coefficients = defaultdict()

In [3]:
def read_file(filename):
    return pd.read_csv(filename)

def get_training_data(files_):
    data = defaultdict()
    columns = defaultdict()
    for i in files_:
        name = i
        name = name.replace('.csv','')
        data[name] = read_file(TRAIN_DIRECTORY+i)
        columns[name] = list(data[name].columns)
        columns[name].remove('virus')
    return data,columns

In [4]:
def overlap_columns(columns_):
    all_columns = list(columns_.values())
    overlap = list(set(all_columns[0]) & set(all_columns[1]) & set(all_columns[2]) & set(all_columns[3]))
    return overlap
    

In [5]:
def create_columns(columns_):
    overlap = overlap_columns(columns_)
    new_columns = []
    temp = []
    for i in columns_.keys():
        x = [i.replace('.csv','')+'_'+j for j in columns_[i]]
        temp.append(x)
    t = [val for sublist in temp for val in sublist]
    new_columns = t + overlap
    new_columns.append('virus')
    return new_columns

In [6]:
def create_new_dataframe(data,columns):
    new_columns = create_columns(columns)
    new_dataset = defaultdict()
    for i,name in enumerate(data.keys()):
        new_data = pd.DataFrame(columns=new_columns)
        dataset = data[name]
        for j in columns[name]:
            new_data[name+'_'+j] = dataset[j]
            new_data[j] = dataset[j]
        new_data['virus'] = dataset['virus']
        new_data.fillna(0,inplace=True)
        new_dataset[name] = new_data
    #concatenate all the dataframe
    new_dataset = pd.concat(new_dataset.values())
    return new_dataset

In [7]:
def ml_model(dataset):
    lm = linear_model.LogisticRegression()
    x_train = dataset.drop(['virus'],axis = 1)
    y_train = dataset['virus']
    x = lm.fit(x_train,y_train)
    coeff = x.coef_.tolist()[0]
    return lm,coeff

#### Heldout dataset : NYUMC

In [8]:
files_nyumc = ['goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
data_nyumc,columns_nyumc = get_training_data(files_nyumc)

#create the dataframe for domain adaptation
new_dataset_nyumc = create_new_dataframe(data_nyumc,columns_nyumc)

coeff_without_nyumc = ml_model(new_dataset_nyumc)
coefficients['nyumc'] = coeff_without_nyumc

#### Heldout dataset : Goviral

In [9]:
files_goviral = ['nyumc.csv','fluwatch.csv','hongkong.csv','hutterite.csv']
data_goviral,columns_goviral = get_training_data(files_goviral)
new_dataset_goviral = create_new_dataframe(data_goviral,columns_goviral)

coeff_without_goviral = ml_model(new_dataset_goviral)
coefficients['goviral'] = coeff_without_goviral

#### Heldout dataset : FluWatch

In [10]:
files_fluwatch = ['nyumc.csv','goviral.csv','hongkong.csv','hutterite.csv']
data_fluwatch,columns_fluwatch = get_training_data(files_fluwatch)
new_dataset_fluwatch = create_new_dataframe(data_fluwatch,columns_fluwatch)

coeff_without_fluwatch = ml_model(new_dataset_fluwatch)
coefficients['fluwatch'] = coeff_without_fluwatch

#### Heldout dataset : HongKong

In [11]:
files_hongkong = ['nyumc.csv','goviral.csv','fluwatch.csv','hutterite.csv']
data_hongkong,columns_hongkong = get_training_data(files_hongkong)
new_dataset_hongkong = create_new_dataframe(data_hongkong,columns_hongkong)

coeff_without_hongkong = ml_model(new_dataset_hongkong)
coefficients['hongkong'] = coeff_without_hongkong

#### Heldout dataset : Hutterite

In [12]:
files_hutterite = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv']
data_hutterite,columns_hutterite = get_training_data(files_hutterite)
new_dataset_hutterite = create_new_dataframe(data_hutterite,columns_hutterite)

coeff_without_hutterite = ml_model(new_dataset_hutterite)
coefficients['hutterite'] = coeff_without_hutterite

#### Model for testing the performance

In [15]:
def test_model(train_data,test_data):
    lm,coeff = ml_model(train_data)
    train = test_data.drop(['virus'],axis = 1)
    test = test_data['virus']
    y_pred = lm.predict(train)
    acc = accuracy_score(test,y_pred)
    fpr,tpr,threshold = roc_curve(test,y_pred)
    auc_score = metrics.auc(fpr,tpr)
    return acc,auc_score
    

#### Prepare data for testing

In [16]:
files_ = ['nyumc.csv','goviral.csv','fluwatch.csv','hongkong.csv','hutterite.csv']

data_,columns = get_training_data(files_)

In [17]:
def create_data_for_testing(data,name,columns_):
    new_data = pd.DataFrame(columns = columns_)
    columns_for_data = list(data.columns)
    col = [x for x in columns_for_data if x != 'virus']
    for i in col:
        new_data[name+'_'+i] = data[i]
        new_data[i] = data[i]
    new_data['virus'] = data['virus']
    new_data.fillna(0,inplace = True)
    return new_data

In [18]:
def test_against_all(dataset_name,to_be_tested_names,data_,original_data,store_):
    columns = list(original_data.columns)
    data = data_[dataset_name]
    for i in to_be_tested_names:
        temp_data = create_data_for_testing(data,i,columns)
        acc,auc_score = test_model(original_data,temp_data)
        print("Comparing against ",i)
        print("Accuracy : ",acc)
        print("Auc Score : ",auc_score)
        print("____________________________")
        store_[i] = auc_score
    return store_

#### Test NYUMC

In [19]:
store_nyumc = defaultdict()
print("Testing NYUMC Data!\n")
store_nyumc = test_against_all('nyumc',['goviral','fluwatch','hongkong','hutterite'],data_,new_dataset_nyumc,store_nyumc)

Testing NYUMC Data!

Comparing against  goviral
Accuracy :  0.930341899849
Auc Score :  0.509587800473
____________________________
Comparing against  fluwatch
Accuracy :  0.930341899849
Auc Score :  0.509587800473
____________________________
Comparing against  hongkong
Accuracy :  0.802939699639
Auc Score :  0.537573846362
____________________________
Comparing against  hutterite
Accuracy :  0.94919432145
Auc Score :  0.505924761895
____________________________


In [20]:
store_nyumc

defaultdict(None,
            {'fluwatch': 0.5095878004731702,
             'goviral': 0.5095878004731702,
             'hongkong': 0.53757384636224315,
             'hutterite': 0.50592476189464963})

#### Test Goviral

In [21]:
store_gv = defaultdict()
print("Testing Goviral data!\n")
store_gv = test_against_all('goviral',['nyumc','fluwatch','hongkong','hutterite'],data_,new_dataset_goviral,store_gv)

Testing Goviral data!

Comparing against  nyumc
Accuracy :  0.428846153846
Auc Score :  0.5
____________________________
Comparing against  fluwatch
Accuracy :  0.640384615385
Auc Score :  0.64328637647
____________________________
Comparing against  hongkong
Accuracy :  0.517307692308
Auc Score :  0.557888300041
____________________________
Comparing against  hutterite
Accuracy :  0.513461538462
Auc Score :  0.561783756851
____________________________


In [22]:
store_gv

defaultdict(None,
            {'fluwatch': 0.64328637647023279,
             'hongkong': 0.55788830004076639,
             'hutterite': 0.56178375685102155,
             'nyumc': 0.5})

#### Test Fluwatch

In [23]:
store_fw = defaultdict()
print("Testing FluWatch data!\n")
store_fw = test_against_all('fluwatch',['nyumc','goviral','hongkong','hutterite'],data_,new_dataset_fluwatch,store_fw)

Testing FluWatch data!

Comparing against  nyumc
Accuracy :  0.428846153846
Auc Score :  0.5
____________________________
Comparing against  goviral
Accuracy :  0.640384615385
Auc Score :  0.64328637647
____________________________
Comparing against  hongkong
Accuracy :  0.517307692308
Auc Score :  0.557888300041
____________________________
Comparing against  hutterite
Accuracy :  0.513461538462
Auc Score :  0.561783756851
____________________________


In [24]:
store_fw

defaultdict(None,
            {'goviral': 0.64328637647023279,
             'hongkong': 0.55788830004076639,
             'hutterite': 0.56178375685102155,
             'nyumc': 0.5})

#### Test Hongkong

In [25]:
store_hk = defaultdict()
print("Testing Hongkong!\n")
store_hk = test_against_all('hongkong',['nyumc','goviral','fluwatch','hutterite'],data_,new_dataset_hongkong,store_hk)

Testing Hongkong!

Comparing against  nyumc
Accuracy :  0.768873637465
Auc Score :  0.498952056589
____________________________
Comparing against  goviral
Accuracy :  0.834679047235
Auc Score :  0.744820364573
____________________________
Comparing against  fluwatch
Accuracy :  0.834679047235
Auc Score :  0.744820364573
____________________________
Comparing against  hutterite
Accuracy :  0.839927331449
Auc Score :  0.725069119794
____________________________


In [26]:
store_hk

defaultdict(None,
            {'fluwatch': 0.74482036457278444,
             'goviral': 0.74482036457278444,
             'hutterite': 0.72506911979435607,
             'nyumc': 0.4989520565889442})

#### Test hutterite

In [27]:
store_ht = defaultdict()
print("Testing hutterite!\n")
store_ht = test_against_all('hutterite',['nyumc','goviral','fluwatch','hongkong'],data_,new_dataset_hutterite,store_ht)

Testing hutterite!

Comparing against  nyumc
Accuracy :  0.518345042935
Auc Score :  0.499248120301
____________________________
Comparing against  goviral
Accuracy :  0.574551131928
Auc Score :  0.575093984962
____________________________
Comparing against  fluwatch
Accuracy :  0.574551131928
Auc Score :  0.575093984962
____________________________
Comparing against  hongkong
Accuracy :  0.592505854801
Auc Score :  0.586525974026
____________________________


In [28]:
store_ht

defaultdict(None,
            {'fluwatch': 0.57509398496240605,
             'goviral': 0.57509398496240605,
             'hongkong': 0.58652597402597406,
             'nyumc': 0.49924812030075189})