#### How do the distributions look for the different datasets, and how do they change as we move higher up the hierarchy?

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# Directory prefix (relative path) prepended to every CSV filename passed to read_data
data_dir = "../../Data/"

In [14]:
def read_data(filename):
    """Load a CSV file and keep only the rows whose ``virus`` column equals 1.

    The analysis is about P(x | Y=1), so every other row is dropped right
    after loading. The original row index of the kept rows is preserved.
    """
    frame = pd.read_csv(filename)
    is_positive = frame['virus'] == 1
    return frame.loc[is_positive]

In [15]:
# Symptom column names; the main script computes one get_prob value per symptom and dataset
symptoms = ['fever','cough','muscle','sorethroat']

In [16]:
def dist_dict(dict_):
    """Return ``[value_for_0, value_for_1]`` taken from ``dict_``.

    ``dict_`` is any mapping-like object exposing ``keys()`` and item lookup
    (a plain dict or a ``value_counts()`` Series). A key that is absent
    contributes 0, so the result always has exactly two entries.
    """
    present = dict_.keys()
    return [dict_[value] if value in present else 0 for value in (0, 1)]

In [17]:
def gender_prob(df,symp):
    """Count the 0/1 values of one symptom inside each demographic group.

    For every group indicator column (gender and age bands) the rows where
    that indicator equals 1 are selected, and the occurrences of 0 and 1 in
    column ``symp`` are counted via ``dist_dict``.

    Note: despite the function name, the values are raw counts, not
    probabilities -- no normalisation by the group size is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``symp`` and all group indicator columns listed below.
    symp : str
        Name of the symptom column.

    Returns
    -------
    pandas.DataFrame
        One column per group (in the order listed below) and two rows:
        row 0 is the count of ``symp == 0``, row 1 the count of ``symp == 1``.

    Raises
    ------
    KeyError
        If ``symp`` or any group column is missing from ``df``.
    """
    groups = ('male', 'female', 'age 0-4', 'age 5-15',
              'age 16-44', 'age 45-64', 'age 65+')

    # a plain dict keeps insertion order, which fixes the column order
    dist = {
        group: dist_dict(df.loc[df[group] == 1][symp].value_counts())
        for group in groups
    }
    return pd.DataFrame.from_dict(dist)

In [18]:
def prob(data):
    """Return the empirical value probabilities of each symptom column.

    For each of the columns 'fever', 'cough', 'muscle' and 'sorethroat' the
    rows are grouped by the column value and the group sizes are divided by
    the total number of rows in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the four symptom columns named above.

    Returns
    -------
    list of pandas.Series
        One Series per symptom, in the order fever, cough, muscle,
        sorethroat; each is indexed by the values observed in that column.

    Raises
    ------
    KeyError
        If one of the symptom columns is missing.
    """
    columns = ('fever', 'cough', 'muscle', 'sorethroat')
    total_rows = len(data)
    # previously each vector overwrote the last and nothing was returned
    return [data.groupby(column).size().div(total_rows) for column in columns]

In [19]:
def plot_dist_demo(list_,name):
    """Save one grouped bar chart per symptom comparing males and females.

    Parameters
    ----------
    list_ : list of tuple
        Each item is ``(symptom_name, dist)`` where ``dist`` is a DataFrame
        with 'male' and 'female' columns holding two values each (for
        symptom value 0 and 1). Only these two columns are plotted.
    name : str
        Sub-directory of ``../Distributions/`` the PDFs are written to;
        each file is named ``<symptom_name>.pdf``.

    Notes
    -----
    A single figure is created per item and always closed afterwards.
    The earlier version opened an extra, never-closed figure on every
    iteration, so the requested 14x10 size never reached the saved plot.
    """
    index = np.arange(2)
    bar_width = 0.35
    opacity = 0.6

    for item in list_:
        symptom, dist = item[0], item[1]
        male_values = list(dist['male'])
        female_values = list(dist['female'])

        fig, ax = plt.subplots(figsize=(14, 10))
        try:
            ax.bar(index, male_values, bar_width,
                   alpha=opacity, color='b', label='Male')
            # shift the second series by one bar width so the bars sit side by side
            ax.bar(index + bar_width, female_values, bar_width,
                   alpha=opacity, color='g', label='Female')

            ax.set_xlabel('Symptom Value', fontsize=12)
            ax.set_ylabel('Frequency', fontsize=12)
            ax.set_title('Distribution of ' + symptom + " across gender", fontsize=12)
            # centre each tick between the two bars of its group
            ax.set_xticks(index + bar_width / 2)
            ax.set_xticklabels(('0', '1'), fontsize=12)
            ax.legend()

            # bbox_inches (not "bounds") is the savefig option that trims the margins
            fig.savefig("../Distributions/" + name + "/" + symptom + ".pdf",
                        bbox_inches="tight")
        finally:
            plt.close(fig)
        

In [48]:
def get_prob(df,symp):
    """Return the absolute gap between the shares of 0 and 1 in a column.

    The relative frequency of each value in ``df[symp]`` is computed over
    the non-null entries; the result is ``|P(symp = 0) - P(symp = 1)|``.
    A value that never occurs counts as probability 0, so a column holding
    only 0s (or only 1s) yields that value's share.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing column ``symp``.
    symp : str
        Name of the symptom column.

    Returns
    -------
    float
        The absolute difference, or NaN when the column contains neither a
        0 nor a 1 (e.g. it is empty), since no difference is defined then.

    Raises
    ------
    KeyError
        If ``symp`` is not a column of ``df``.
    """
    probs = (df[symp].value_counts()/df[symp].count()).to_dict()

    # nothing to compare: report "undefined" instead of failing on a missing key
    if 0 not in probs and 1 not in probs:
        return np.nan

    # a missing value contributes probability 0
    return np.abs(probs.get(0, 0.0) - probs.get(1, 0.0))
  

In [61]:
if __name__ == '__main__':

    # (row label, CSV file) for every node of the hierarchy, in output order:
    # the total, the gender and age splits, the collection modes and the
    # individual studies
    sources = [
        ('Total', 'total.csv'),
        ('Male', 'male.csv'),
        ('Female', 'female.csv'),
        ('Age1', 'age1.csv'),
        ('Age2', 'age2.csv'),
        ('Age3', 'age3.csv'),
        ('Age4', 'age4.csv'),
        ('Age5', 'age5.csv'),
        ('Indi_repo', 'individually_reported.csv'),
        ('Healthworker_facilitated', 'health_worker_facilitated.csv'),
        ('Goviral', 'goviral.csv'),
        ('Fluwatch', 'fluwatch.csv'),
        ('Hongkong', 'hongkong.csv'),
        ('Hutterite', 'hutterite.csv'),
    ]

    # load every dataset (virus-positive rows only, see read_data)
    frames = [read_data(data_dir + csv_name) for _, csv_name in sources]

    # keep the individual datasets available under their usual names
    (total, male, female, age1, age2, age3, age4, age5,
     ic, hw, gv, fw, hk, ht) = frames

    #get the difference between the conditional probabilities
    #for all the levels
    final_dict = {}
    for symp in symptoms:
        final_dict[symp] = [get_prob(frame, symp) for frame in frames]

    # one row per node, one column per symptom
    final_dataframe = pd.DataFrame.from_dict(final_dict)
    final_dataframe['Nodes'] = [label for label, _ in sources]
    final_dataframe.set_index('Nodes',inplace = True)

    print(final_dataframe)
    final_dataframe.to_csv("../Distributions/conditional_probability.csv")

                             cough     fever    muscle  sorethroat
Nodes                                                             
Total                     0.606455  0.443375  0.544168    0.001133
Male                      0.599778  0.475264  0.577543    0.077265
Female                    0.623154  0.418783  0.519197    0.075015
Age1                      0.463087  0.248322  0.740492    0.199105
Age2                      0.737775  0.462792  0.669738    0.060241
Age3                      0.580786  0.548035  0.268559    0.251092
Age4                      0.622449  0.469388  0.321429    0.107143
Age5                      0.509804  0.176471  0.274510    0.098039
Indi_repo                 0.384828  0.326897  0.268966    0.136552
Healthworker_facilitated  0.663698  0.473459  0.615248    0.036694
Goviral                   0.482759  0.089655  0.082759    0.524138
Fluwatch                  0.319540  0.485057  0.393103    0.121839
Hongkong                  0.629916  0.466151  0.522888    0.17

In [9]:
# if __name__ == '__main__':
#     fw = read_data(data_dir+"hutterite.csv")
#     dist_df = pd.DataFrame(columns = ['male','female','age 0-4','age 5-15'\
#                                      'age 16-44','age 45-64','age 65+'])
#     temp = []

#     for symp in symptoms: 
#         dist_symp = gender_prob(fw,symp)
#         temp.append((symp,dist_symp))
#     plot_dist_demo(temp,'Hutterite')