In [1]:
from db import database as D
from db import compilation as C
import pandas as pd
import numpy as np
import itertools
from bokeh.charts import Bar, Scatter
from bokeh.io import output_notebook, show

you are now accessing the db named HBNL3


In [2]:
# configure Bokeh so that show() renders charts inline in this notebook
output_notebook()

In [3]:
def remove_non_coga_ids(df):
    '''Drop every row whose ID belongs to a non-COGA subject.

       df must be un-indexed (plain columns) and contain an 'ID' and a 'POP' column.
       An ID counts as non-COGA when it starts with 'a', 'c', 'h' or 'p', or when it
       appears on any row whose POP is 'Relia' or 'Alc-chal' (all rows sharing such
       an ID are removed, whatever their own POP is).
       Returns the remaining rows of df.
       TODO: add to utilities module'''

    # IDs excluded because of their leading letter
    excluded_ids = {sub_id for sub_id in df['ID']
                    if sub_id.startswith(('a', 'c', 'h', 'p'))}

    # IDs excluded because of the population they were recorded under
    for population in ('Relia', 'Alc-chal'):
        excluded_ids.update(df.loc[df['POP'] == population, 'ID'].tolist())

    # keep only the rows whose ID was never excluded
    keep_rows = ~df['ID'].isin(list(excluded_ids))
    return df[keep_rows]

In [4]:
def filters(col, lower_range, upper_range):
    '''Flag whether a value lies inside [lower_range, upper_range] (inclusive).

       Returns the string '1' when it does and '0' otherwise.
       Use with consec_erp_sessions function.'''
    in_range = col >= lower_range and col <= upper_range
    return '1' if in_range else '0'
    
def above_or_below(col, lower, upper):
    '''Return col when lower <= col <= upper (both bounds inclusive), otherwise 0.

       Used by consec_erp_sessions (date_range branch) to keep only the
       between-session gaps that fall inside the requested range, so that the
       kept gaps can be summed per subject.'''
    if col >= lower and col <= upper:
        return col
    return 0


def consec_erp_sessions(lower_range, upper_range, 
                   consecutive_runs=int, date_range=(),
                   print_matching_sessions=False, plot=False): 
    '''Find COGA subjects by the spacing (in days) between their successive ERP sessions.

       Pulls the a-date ... i-date fields (plus ID, site, POP) from the 'sessions'
       collection, drops non-COGA subjects, computes the day difference between each
       pair of successive session letters (b-a, c-b, ..., i-h), flags every
       difference that lies in [lower_range, upper_range] and stores the number of
       flagged differences per row in 'total_consec'.

       lower_range, upper_range -- inclusive bounds, in days, for a gap to be counted
       consecutive_runs -- keep only rows whose 'total_consec' equals this value
                           NOTE(review): the default is the builtin type `int`, not a
                           number; it is truthy, so the branch still runs, and the
                           equality test presumably matches no rows -- confirm
                           whether None was intended
       date_range -- (low, high) in years; keeps rows whose 'years' value (sum of the
                     in-range gaps / 365) is strictly between low and high
       print_matching_sessions -- only used together with date_range; returns a dict
                     {gap label such as 'b-a': [IDs whose gap is in range]} instead of
                     a DataFrame
       plot -- show a stacked Bar chart of 'total_consec' by site and return show(p);
               this branch returns before consecutive_runs / date_range are looked at

       Returns show(p), a dict, or a DataFrame depending on the flags above; falls
       through to an implicit None when plot and consecutive_runs are both falsy.

       NOTE(review): 'total_consec' is a count of in-range gaps; nothing checks that
       the in-range gaps are adjacent to each other -- confirm this is the intended
       meaning of "consecutive".'''

    # query for all test dates in sessions 
    docs = D.Mdb['sessions'].find({}, {'a-date':1, 'b-date':1, 'c-date':1, 'd-date':1, 
                                       'e-date':1, 'f-date':1, 'g-date':1,'h-date':1,
                                       'i-date':1, 'ID':1, 'site':1, 'POP':1, '_id':0})
    df = C.buildframe_fromdocs(docs).reset_index()

    # remove non-coga subs
    only_coga = remove_non_coga_ids(df)

    # prepare df for itertools combinations 
    # (reset_index() also adds the old index as a column; it is carried through the
    #  next steps and discarded by the differences_of_interest filter below)
    df_date = only_coga[[i for i in only_coga.columns if i.endswith('date')]].reset_index()

    # h-date & i-date dtype = objects 
    df_date[[i for i in df_date.columns]] = df_date[[i for i in df_date.columns]].apply(pd.to_datetime)

    # create new df for combinations 
    iter_df = pd.DataFrame(index=df_date.index)

    # one column per pair of columns, named '<later>_<earlier>' and holding later - earlier
    for a,b in itertools.combinations(df_date.columns, 2):
        iter_df['{}_{}'.format(b,a)] = df_date[b] - df_date[a]


    # filter by columns of interest (differences between successive sessions only)
    differences_of_interest = ['b-date_a-date', 'c-date_b-date',
                               'd-date_c-date', 'e-date_d-date',
                               'f-date_e-date', 'g-date_f-date',
                               'h-date_g-date', 'i-date_h-date']

    iter_df1 = iter_df[[i for i in iter_df.columns if i in differences_of_interest]]

    # get days as an int & remove all nan values 
    # NOTE(review): missing dates apparently come out of astype(int) as the int64
    # minimum; that sentinel is masked and replaced by 0 on the next line. This looks
    # dependent on the installed pandas/numpy version -- TODO confirm
    df1 = (iter_df1 / np.timedelta64(1, 'D')).astype(int)
    df2 = df1[df1[[i for i in df1.columns]] != -9223372036854775808].fillna(value=0)

    # add site and ID back to df
    # (assigned positionally from lists, so this relies on df2 rows being in the same
    #  order as only_coga rows)
    df2['ID']  = only_coga['ID'].tolist()
    df2['site'] = only_coga['site'].tolist()
    df2[differences_of_interest] = df2[differences_of_interest].astype(int)
    df3 = df2.drop_duplicates()

    # create columns of 0 and 1 to indicate whether run difference fell between date range 
    # (a missing session shows up as a 0-day gap, so it only counts as "in range"
    #  when lower_range <= 0)
    df4 = df3.copy()
    df4['b-a_in_range'] = df4['b-date_a-date'].apply(filters, args=(lower_range, upper_range))
    df4['c-b_in_range'] = df4['c-date_b-date'].apply(filters, args=(lower_range, upper_range))
    df4['d-c_in_range'] = df4['d-date_c-date'].apply(filters, args=(lower_range, upper_range))
    df4['e-d_in_range'] = df4['e-date_d-date'].apply(filters, args=(lower_range, upper_range))
    df4['f-e_in_range'] = df4['f-date_e-date'].apply(filters, args=(lower_range, upper_range))
    df4['g-f_in_range'] = df4['g-date_f-date'].apply(filters, args=(lower_range, upper_range))
    df4['h-g_in_range'] = df4['h-date_g-date'].apply(filters, args=(lower_range, upper_range))
    df4['i-h_in_range'] = df4['i-date_h-date'].apply(filters, args=(lower_range, upper_range))

    # change dtype of columns created in filter function from obj to int
    df4[[i for i in df4.columns if i.endswith('in_range')]] = df4[[i for i in df4.columns if i.endswith('in_range')]].astype(int)

    # sum rows that were created from filter function & select consecutive runs
    concat_cols = [i for i in df4.columns if i.endswith('in_range')]
    df4['total_consec'] = df4[concat_cols].sum(axis=1)
    
        
    if plot:
        # stacked bar graph of total # of consecutive sessions by site with no defined date range b/t first & last session
        # (rows with no in-range gap at all are left out of the chart)
        df8 = df4[df4['total_consec'] !=0]
        p = Bar(df8, label='total_consec',stack='site',
        title='Number of consecutive ERP sessions between ' + str(lower_range) + ' days and ' + str(upper_range) + ' days', 
        xlabel = "Consecutive Sessions", ylabel="Individuals", legend='top_right')
        return show(p)

    
    if consecutive_runs:
        # returns df of consecutive runs only
        df5 = df4[df4['total_consec'] == consecutive_runs]
        if date_range: 
            # returns df of consec runs whose first and last run fall within user defined date range 
            df6 = df5.copy()
            # if date differences column fall within date range... sum them & turn # days into years 
            # (each *_consec_runs column holds the gap in days when it is in range, else 0)
            df6['b-a_in_range_consec_runs'] = df6['b-date_a-date'].apply(above_or_below, args=(lower_range, upper_range))
            df6['c-b_in_range_consec_runs'] = df6['c-date_b-date'].apply(above_or_below, args=(lower_range, upper_range))
            df6['d-c_in_range_consec_runs'] = df6['d-date_c-date'].apply(above_or_below, args=(lower_range, upper_range))
            df6['e-d_in_range_consec_runs'] = df6['e-date_d-date'].apply(above_or_below, args=(lower_range, upper_range))
            df6['f-e_in_range_consec_runs'] = df6['f-date_e-date'].apply(above_or_below, args=(lower_range, upper_range))
            df6['g-f_in_range_consec_runs'] = df6['g-date_f-date'].apply(above_or_below, args=(lower_range, upper_range))
            df6['h-g_in_range_consec_runs'] = df6['h-date_g-date'].apply(above_or_below, args=(lower_range, upper_range))
            df6['i-h_in_range_consec_runs'] = df6['i-date_h-date'].apply(above_or_below, args=(lower_range, upper_range))
            
            # total days covered by the in-range gaps, then the same span in years (365-day years)
            df6['consec_run_diff_days'] = df6[[i for i in df6.columns if i.endswith('runs')]].sum(axis=1)
            df6['years'] = df6['consec_run_diff_days'].apply(lambda x: int(x)/365)
            
        
            if print_matching_sessions:
                # date_range as a dictionary 
                # {ID: {'<gap>_in_range_consec_runs': days}}; to_dict(orient='index')
                # presumably needs the IDs to be unique here -- verify
                df7 = df6[[i for i in df6.columns if i.endswith(('ID', 'runs'))]].set_index('ID')
                df_dict = df7.to_dict(orient='index')

                # filter out consec runs that fall within range
                # key = first three characters of the column name (e.g. 'b-a'), value = list of IDs
                consec_dict = {}
                for k,v in df_dict.items():
                    for k1,v1 in v.items():
                        if v1 > 0:
                            consec_dict.setdefault(k1[:3], []).append(k)
                            #dic.setdefault(k, {})[k1[:3]] = v1         
                # NOTE(review): this returns before the 'years' / date_range filter on
                # the next return is applied, so the dict is not limited to date_range
                return consec_dict  # print_matching...
            return df6[(df6['years'] > date_range[0]) & (df6['years'] < date_range[1])] # date_range...
        return df5 # consecutive_runs...

In [5]:
# set variables
lower_range = 20   # shortest gap between successive sessions to count, in days
upper_range = 730  # longest gap between successive sessions to count, in days
dates = (3,4)      # (min, max) in years, passed as date_range below

In [6]:
# returns a dict {gap label ('b-a', 'c-b', ...): [IDs]} for subjects with 2 session gaps
# that are each at least 20 days and at most 730 days long
# NOTE(review): with print_matching_sessions=True, consec_erp_sessions returns the dict before
# its 3-4 year date_range filter is applied -- confirm the IDs are meant to be unfiltered
ids_dict = consec_erp_sessions(lower_range, upper_range,date_range=dates, consecutive_runs=2, print_matching_sessions=True)


In [7]:
# df version of the cell above, restricted to rows whose summed in-range gaps
# span more than 3 and less than 4 years; only the first rows are displayed
consec_erp_sessions(lower_range, upper_range,date_range=dates, consecutive_runs=2).head()

Unnamed: 0,b-date_a-date,c-date_b-date,d-date_c-date,e-date_d-date,f-date_e-date,g-date_f-date,h-date_g-date,i-date_h-date,ID,site,...,b-a_in_range_consec_runs,c-b_in_range_consec_runs,d-c_in_range_consec_runs,e-d_in_range_consec_runs,f-e_in_range_consec_runs,g-f_in_range_consec_runs,h-g_in_range_consec_runs,i-h_in_range_consec_runs,consec_run_diff_days,years
49,2757,502,651,742,839,954,0,0,10006013,uconn,...,0,502,651,0,0,0,0,0,1153,3.158904
145,721,654,782,1842,0,0,0,0,10012039,uconn,...,721,654,0,0,0,0,0,0,1375,3.767123
323,724,672,0,0,0,0,0,0,10021014,uconn,...,724,672,0,0,0,0,0,0,1396,3.824658
360,852,671,722,870,0,0,0,0,10024008,uconn,...,0,671,722,0,0,0,0,0,1393,3.816438
387,721,1572,683,0,0,0,0,0,10024066,uconn,...,721,0,683,0,0,0,0,0,1404,3.846575


In [8]:
# stacked bar chart, by site, of how many between-session gaps per subject are
# at least 1 year (365 days) and at most 2 years (730 days) long
consec_erp_sessions(365, 730, plot=True)

    > if sessions a-b are < 2 years apart and f-g are < 2 years apart -- calculate the date range
    > if sessions a through d are less than 2.5 years apart -- make sure a-testdate and d-testdate are less than 9 years 1 month...

# GENERAL PURPOSE FUNCTION TO QUERY DB AND RETURN STACKED BAR GRAPH

    > reduce the number of arguments used to create stacked bar graph 

In [9]:
def plot_stacked(collection_keys_lst, remove_non_coga=False,
                 stacked_axes_tup=False, stack_labels_dict=False):
    '''Query the 'sessions' collection and show a stacked bar graph.

       collection_keys_lst -- keys to project from each document ('_id' is always
                              excluded); must include 'ID' and 'POP' when
                              remove_non_coga is True
       remove_non_coga -- when True, non-COGA subjects are filtered out first
       stacked_axes_tup -- (label column, stack column) for the Bar chart
       stack_labels_dict -- dict with the keys 'x', 'y', 'title' and 'legend'

       Returns show(chart). When remove_non_coga is False and no
       stack_labels_dict is given, nothing is plotted and None is returned.'''

    # projection: requested keys switched on, '_id' switched off
    projection = dict.fromkeys(collection_keys_lst, 1)
    projection['_id'] = 0

    cursor = D.Mdb['sessions'].find({}, projection)
    frame = C.buildframe_fromdocs(cursor).reset_index()

    if remove_non_coga:
        frame = remove_non_coga_ids(frame)
    elif not stack_labels_dict:
        # unfiltered data without labels: nothing to draw
        return None

    chart = Bar(frame,
                label=stacked_axes_tup[0], stack=stacked_axes_tup[1],
                xlabel=stack_labels_dict['x'], ylabel=stack_labels_dict['y'],
                title=stack_labels_dict['title'], legend=stack_labels_dict['legend'])
    return show(chart)

In [10]:
keys_r = ['ID', 'POP', 'site','alc_dep_dx', 'alc_dep_ons', 'handedness', 'sex']

# axis labels, title and legend position handed to the Bar chart;
# the x axis carries the sites, so it is labelled 'University' and the
# chart title describes what is plotted
stack_dict = {'x': 'University',
              'y' : 'Count', 
              'title': 'Sex Distribution by Site',
              'legend': 'top_left'}

# plots distribution of sex by site: one bar per site, stacked by sex
stack_tup = ('site', 'sex')
plot_stacked(keys_r, remove_non_coga=True,
            stacked_axes_tup=stack_tup, stack_labels_dict=stack_dict)

In [11]:
def get_record_keys(collection_name):
    '''Return the sorted keys of one document from the given collection.

       The keys are read from a single document (find_one), so keys that only
       exist on other documents are not reported.
       Returns an empty list when the collection holds no document (find_one
       gives None), instead of failing on None.keys().'''

    sample_doc = D.Mdb[collection_name].find_one()
    if sample_doc is None:
        return []
    return sorted(sample_doc)

def check_record_keys(collection_name, lst_of_keys):
    '''Report which of the requested keys are not found in a collection.

       collection_name -- collection whose keys are looked up with get_record_keys
       lst_of_keys -- keys to check; duplicates are only reported once

       Prints one line listing the missing keys (in the order they were
       requested); prints nothing when every key exists. Returns None.'''

    existing_keys = set(get_record_keys(collection_name))

    # requested keys that are absent, de-duplicated, kept in request order
    missing = []
    for key in lst_of_keys:
        if key not in existing_keys and key not in missing:
            missing.append(key)

    if missing:
        missing_str = ', '.join(repr(key) for key in missing)
        print(missing_str, 'is not a key in', collection_name, 'collection')

In [12]:
# keys to look for in the 'neuropsych' collection; missing ones are printed
keys = ['ID', 'POP', 'site']

check_record_keys('neuropsych', keys)

'POP' is not a key in neuropsych collection


# OTHER

In [13]:
def dob_bins(col):
    '''Map a birth year to its bin label; used with plot_decades_by_site().

       1900-1949 share the single bin '1900-1949'; 1950-2017 are binned by
       decade ('1950s' ... '2010s'). Years outside 1900-2017 give None.'''

    year = int(col)
    if 1900 <= year <= 1949:
        return '1900-1949'
    if 1950 <= year <= 2017:
        # label is the first year of the decade followed by 's'
        return '{}s'.format(year // 10 * 10)
    return None

def plot_decades_by_site():
    '''Show a stacked bar chart of COGA subjects' birth-decade bins, stacked by site.

       Reads site, ID, DOB and POP from the 'sessions' collection, drops rows
       with any missing value, removes non-COGA subjects, bins each birth year
       with dob_bins and returns show(chart).
       NOTE(review): the chart title says "Ages" although the bars are
       birth-decade bins -- confirm the wording.'''

    # query DB for DOB, site and the columns needed by the COGA filter
    projection = {'site' : 1, 'ID':1, 'DOB':1, 'POP':1, '_id': 0}
    cursor = D.Mdb['sessions'].find({}, projection)
    sessions = C.buildframe_fromdocs(cursor).reset_index().dropna()

    # work on a copy of the COGA-only rows so new columns can be added freely
    coga = remove_non_coga_ids(sessions).copy()

    # birth year as an int column
    birth_years = [dob.year for dob in coga.DOB]
    coga['year'] = birth_years
    coga['year'] = coga['year'].astype(int)

    # reduce the years to a handful of bins
    coga['dob-bins'] = coga['year'].apply(dob_bins)

    # an earlier draft passed a custom colour list through Bar's palette= argument
    chart = Bar(coga, label='dob-bins', stack='site', title='Subject Ages By Site')
    return show(chart)

In [14]:
# draw the stacked bar chart of birth-decade bins by site
plot_decades_by_site()