#### Explore the data distribution
* How does the data look?
* Is the distribution similar in all the population/domains?
* How does the data distribution help in defining the transport over observational data?


In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [2]:
# Relative path of the directory holding the dataset CSV files; it is
# prepended to each file name when the datasets are read.
data_dir = "../../Data/"

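Since we are interested in estimating $P(symptom|Flu)$, the analysis is meant to use only the positive flu cases. Note that the corresponding filter in `read_data` below is currently disabled, so all rows are loaded.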

In [14]:
def read_data(filename):
    """Load one dataset from a CSV file.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Every row of the file. The ``virus == 1`` filter that would keep
        only positive cases is not applied here.
    """
    return pd.read_csv(filename)

The symptom variables take two values: {0, 1}. Hence each symptom can be modelled as a Bernoulli distribution, shown here as a histogram denoting the probability of occurrence of the symptom.

In [15]:
# Names of the symptom columns; the plotting and regression helpers below
# select these columns from each dataset.
symptoms = ['fever','cough','muscle','sorethroat']

In [28]:
def plot_dist(df, title, kd=False):
    """Save one distribution plot per symptom for a single dataset.

    For every column named in the module-level ``symptoms`` list, the
    values of ``df[column]`` are drawn with ``sns.distplot`` and the figure
    is written to ``../without_KDE/<symptom><title>.pdf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing one column per entry of ``symptoms``.
    title : str
        Text appended to the symptom name in both the plot title and the
        output file name.
    kd : bool, optional
        Forwarded to ``sns.distplot`` as ``kde``. When true, the kernel
        density estimate is drawn and the y axis is labelled as a
        normalized frequency.

    Returns
    -------
    None
        The plots are written to disk; nothing is shown or returned.
    """
    # The y-axis label depends only on ``kd``, so choose it once.
    y_label = 'Normalized frequency' if kd else 'Frequency'
    sns.set()
    for symptom in symptoms:
        # NOTE(review): ``sns.distplot`` is deprecated in newer seaborn
        # releases -- confirm against the installed version.
        ax = sns.distplot(df[symptom], kde=kd)
        ax.set(xlabel='Symptom value', ylabel=y_label)
        ax.set_xlim(0, 1)
        plt.title(symptom + " " + title)
        # ``bbox_inches='tight'`` is the savefig option that crops the
        # output to the drawn content; ``bounds`` is not a savefig keyword.
        plt.savefig("../without_KDE/" + symptom + title + ".pdf",
                    bbox_inches='tight')
        # Close the figure so the next symptom starts from a clean canvas.
        plt.close()

In [29]:
def plot_bernoulli(df1, df2, df3, df4, title, symptom='sorethroat'):
    """Plot one symptom's histogram for four datasets on shared axes.

    The four inputs are indexed by column name and drawn side by side in a
    single histogram. The legend labels them, in argument order, as
    GoViral, Fluwatch, Hongkong and Hutterite. The figure is saved to
    ``Combined_<symptom>.pdf`` in the working directory and then shown.

    Parameters
    ----------
    df1, df2, df3, df4 : pandas.DataFrame
        Datasets that each contain the ``symptom`` column.
    title : str
        Currently unused; kept so existing calls keep working.
    symptom : str, optional
        Column to plot. Defaults to ``'sorethroat'``, which reproduces the
        original title and output file name.

    Returns
    -------
    None
    """
    colors = ['b', 'g', 'r', 'y']
    # Create the figure and axes in one call so the requested size applies
    # to the figure that is actually drawn and saved. A separate
    # ``plt.figure`` call would only leave an extra empty figure open.
    fig, ax1 = plt.subplots(figsize=(14, 10))
    ax1.hist([df1[symptom], df2[symptom], df3[symptom], df4[symptom]],
             alpha=0.5, color=colors,
             label=['GoViral', 'Fluwatch', 'Hongkong', 'Hutterite'])
    ax1.set_xlim(0, 1)
    ax1.set_ylabel("Count", fontsize=12)
    ax1.set_xlabel("Symptom value", fontsize=12)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.title(symptom.capitalize() + " distribution across different datasets")
    plt.legend(loc='best', fontsize=12)
    # ``bbox_inches='tight'`` is the savefig option that crops the output
    # to the drawn content; ``bounds`` is not a savefig keyword.
    plt.savefig("Combined_" + symptom + ".pdf", bbox_inches='tight')
    plt.show()

#     sns.distplot(df1['fever'],hist = True,kde = False)
#     sns.distplot(df2['fever'],hist = True,kde= False)
#     sns.distplot(df3['fever'],hist = True,kde= False)
#     ax = sns.distplot(df4['fever'],hist = True, kde = False)
#     ax.set(xlabel = 'Symptom value',ylabel = 'Normalized Frequency')
#     ax.set_xlim(0,1)
#     plt.show()
#     for i in symptoms:
#         sns.set()
#         ax = sns.distplot(df[i],kde = True)
#         ax = sns.distplot(df[i],kde = True)
        
#         ax.set(xlabel = 'Symptom value',ylabel = 'Normalized Frequency')
#         ax.set_xlim(0,1)
#         plt.title(i+" " + title)
# #         plt.savefig("../Fig/"+i+title+".pdf",bounds='tight')
#         plt.close()
        


Get the p-values of the predictors.

In [30]:
def p_value(data):
    """Regress ``virus`` on the symptom columns and print the OLS summary.

    An intercept column is added to the predictors, an ordinary least
    squares model is fitted, and the statsmodels summary table (which
    includes the per-coefficient p-values) is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing a ``virus`` column and every column listed in
        the module-level ``symptoms`` list.

    Returns
    -------
    None
        The summary is printed; nothing is returned.
    """
    # Imported locally because statsmodels is only needed by this function.
    import statsmodels.api as sm

    # Pass the pandas objects directly rather than converting to
    # ``np.matrix``: the matrix class is discouraged by numpy, and keeping
    # the column names lets the summary label each coefficient by symptom.
    predictors = sm.add_constant(data[symptoms])
    fitted = sm.OLS(data['virus'], predictors).fit()
    print(fitted.summary())

#### The study includes 4 datasets : GoViral, Fluwatch, HongKong, Hutterite

In [32]:
def _load_and_plot(csv_name, caption):
    """Read one dataset from ``data_dir`` and save its symptom plots."""
    frame = read_data(data_dir + csv_name)
    plot_dist(frame, caption)
    return frame


if __name__ == '__main__':
    # Each dataset is loaded and plotted in turn. The frames keep their
    # short names so they remain available after this cell has run.
    gv = _load_and_plot("goviral.csv", 'distribution for Goviral.')
    print("_________________________________")
    fw = _load_and_plot("fluwatch.csv", 'distribution for FluWatch.')
    hk = _load_and_plot("hongkong.csv", 'distribution for Hongkong.')
    ht = _load_and_plot("hutterite.csv", 'distribution for Hutterite.')

    # Only the Fluwatch regression is printed; pass gv, hk or ht instead
    # to inspect another dataset.
    p_value(fw)
    
    


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


_________________________________


  from pandas.core import datetools


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.053
Method:                 Least Squares   F-statistic:                     13.74
Date:                Thu, 14 Feb 2019   Prob (F-statistic):           6.95e-11
Time:                        15:13:25   Log-Likelihood:                -636.16
No. Observations:                 915   AIC:                             1282.
Df Residuals:                     910   BIC:                             1306.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6391      0.033     19.446      0.0