**How to monitor memory usage while this performance test runs**

As this process takes a while to run, it can be useful to record its memory usage over time.

First install psrecord:

`pip install psrecord`

Next, find your process PID and substitute it into the following command (replace PID with the actual integer value):

`psrecord PID --interval 10 --plot plot1.png`

The above command will monitor the designated PID every 10 seconds until Ctrl-C is pressed.

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats
import pylab as plt
from collections import Counter
import datetime

# files and kungfauxpandas loading require reference from one directory level up,
# so the working directory is moved to the parent before anything else is loaded.
# NOTE(review): every relative path used later (e.g. '../../data/...') is
# resolved from this new working directory.
import os
os.chdir('..')

# IPython line magic selecting matplotlib's interactive notebook backend;
# while not currently plotting, would like to add this feature
%matplotlib notebook
# let pandas use up to 110 characters per line when printing frames
pd.set_option('display.width', 110)

# flag selecting which database the benchmark data is read from:
# 'sqlite3' (local sample file) or 'psycopg2' (PostgreSQL on localhost)
mode = 'psycopg2'

# NOTE: the number of runs per test (for mean/std dev) is configured through
# default_repetitions, next to the timing helpers.

if mode == 'sqlite3':
    import sqlite3
    conn = sqlite3.connect("../../data/sample_data.db")
    cursor = conn.cursor()
elif mode == 'psycopg2':  # alternatively use postgresql
    import psycopg2
    connect_str = "dbname='sepsis' user='sepsis' host='localhost' " + \
                  "password='sepsis'"
    conn = psycopg2.connect(connect_str)
    cursor = conn.cursor()
else:
    # fail here, with a clear message, instead of with a NameError on `conn`
    # the first time a later cell runs a query
    raise ValueError("mode must be 'sqlite3' or 'psycopg2', got %r" % (mode,))

# SQLite database holding the kfp_log table; the timing helpers read their
# query/fauxify timestamps through this connection regardless of `mode`
qlog_conn = sqlite3.connect('../../data/kfp_log.db')
q_cursor = qlog_conn.cursor()

# timestamp taken when this cell runs; only the commented-out timing snippet
# after the query refers to it
start = datetime.datetime.now()
# Because the column names were created as case-sensitive in postgres, they
# must be double-quoted in the query... should probably fix that.
# NOTE(review): the unquoted aliases (MaxScore, MinScore, NumLoggedScores) are
# presumably folded to lower case by PostgreSQL -- confirm they still match the
# column names used when configuring plugins in later cells.
#
# The statement deliberately ends with a bare `limit`: callers append the row
# count as a string (sql + '100') to choose the dataset size.
sql = '''
SELECT d."SubjectId",
    d."EncounterId",
    d."Source",
    -- d.StartDate,
    d."Code",
    d."Type",
    MAX("FlowsheetValue") AS MaxScore,
    -- AVG("FlowsheetValue") AS MeanScore,
    MIN("FlowsheetValue") AS MinScore,
    COUNT("FlowsheetValue") AS NumLoggedScores
 FROM diagnoses d
 LEFT JOIN flowsheet f
 ON d."EncounterId" = f."EncounterId"
 GROUP BY d."SubjectId", d."EncounterId", d."Source", d."Code", d."Type"
 ORDER BY NumLoggedScores DESC
 limit
'''
# timing this query on databases

#start = datetime.datetime.now()
#df = pd.read_sql(sql,conn)
#print((datetime.datetime.now() - start).total_seconds())
# w/no limit - medium sepsis database
#   sqlite - 80 to 160 seconds
#   postgres - 30 seconds

#sql = 'SELECT subjectid, encounterid, source, code, type FROM "diagnoses" limit 100'


In [None]:
# query results already fetched, keyed by the LIMIT value (a string)
store = {}

def prefetch_query(n):
    """Return the result of the benchmark query limited to ``n`` rows.

    ``n`` is the row limit as a string; it is appended to the module-level
    ``sql`` text.  The query runs against ``conn`` only the first time a
    given ``n`` is requested; later calls return the frame kept in ``store``.
    """
    if n in store:
        return store[n]
    store[n] = pd.read_sql(sql + n, conn)
    return store[n]

In [None]:
# dataset sizes to evaluate, kept as strings so each one can be appended to
# the query's trailing LIMIT: '10', '100', '1000', '10000', '100000'
patient_population = [str(10 ** exponent) for exponent in range(1, 6)]
# number of timed runs per size (for mean/std dev) when the caller gives none
default_repetitions = 1

def show_timings(df):
    """Print the timings of the most recent kfp_log entry and the size of ``df``.

    The most recent entry is the last row of kfp_log when ordered by
    ``fauxify_end``.  Its ``faux_method`` is printed together with the
    query and fauxify durations in seconds, followed by ``len(df)``.
    """
    log = pd.read_sql("SELECT * FROM kfp_log order by fauxify_end",qlog_conn)
    newest = log.tail(1)

    def elapsed_seconds(begin_column, finish_column):
        # both columns hold timestamps; subtract and convert to float seconds
        span = pd.to_datetime(newest[finish_column]) - pd.to_datetime(newest[begin_column])
        return span.iloc[0].total_seconds()

    print('Method used     :', newest['faux_method'].iloc[0])
    print('Time for query  :', elapsed_seconds('query_start', 'query_end'))
    print('Time for fauxify:', elapsed_seconds('fauxify_start', 'fauxify_end'))
    print('Size of dataset :', len(df), 'rows')

# With rerun_query=True both timings are taken from the kfp_log table; with
# rerun_query=False only the fauxify step is timed (locally), because the
# query result comes from the prefetch cache.
def _latest_logged_timings():
    """Return ``(query_seconds, fauxify_seconds)`` for the newest kfp_log row.

    "Newest" is the last row when the table is ordered by ``fauxify_end``.
    """
    newest = pd.read_sql("SELECT * FROM kfp_log order by fauxify_end", qlog_conn).tail(1)

    def seconds_between(begin_column, finish_column):
        span = pd.to_datetime(newest[finish_column]) - pd.to_datetime(newest[begin_column])
        return span.iloc[0].total_seconds()

    return (seconds_between('query_start', 'query_end'),
            seconds_between('fauxify_start', 'fauxify_end'))


def time_method(kfpd, repetitions = default_repetitions, verbose = True, rerun_query = True):
    """Time the plugin currently set on ``kfpd`` for each size in ``patient_population``.

    Parameters
    ----------
    kfpd : object providing ``read_sql(sql, conn)`` and ``plugin.fauxify(df)``.
    repetitions : number of timed runs per population size; must be >= 1.
    verbose : when true, print the details of every individual run.
    rerun_query : when true, each run calls ``kfpd.read_sql`` and takes the
        query and fauxify durations from the newest kfp_log row.  When false,
        the query result is taken from ``prefetch_query`` and only
        ``kfpd.plugin.fauxify`` is timed, directly around the call.

    Returns
    -------
    The frame produced by the last run for the last population size, or
    ``None`` when ``patient_population`` is empty.

    Raises
    ------
    ValueError
        If ``repetitions`` is less than 1 (there would be nothing to report).
    """
    # imported here because the notebook's import cell does not bring in `time`
    import time

    if repetitions < 1:
        raise ValueError('repetitions must be at least 1, got %r' % (repetitions,))

    fdf = None
    for n in patient_population:
        # track each run for calculations
        query_timings = []
        fauxify_timings = []
        for i in range(1, repetitions + 1):
            if rerun_query:
                fdf = kfpd.read_sql(sql + n, conn)
                # NOTE(review): assumes kfpd.read_sql appended a kfp_log row for
                # this call before returning -- confirm against kungfauxpandas
                query_seconds, fauxify_seconds = _latest_logged_timings()
                query_timings.append(query_seconds)
                fauxify_timings.append(fauxify_seconds)
            else:
                # cached query result, so only the fauxify step is measured
                df = prefetch_query(n)
                # perf_counter is monotonic, so the measurement is not
                # disturbed by system clock adjustments
                started = time.perf_counter()
                fdf = kfpd.plugin.fauxify(df)
                fauxify_timings.append(time.perf_counter() - started)
            if verbose:
                print('Iteration ', i, 'of ', repetitions)
                print('Method used              :', type(kfpd.plugin).__name__)
                print('Size of dataset returned :', len(fdf), 'rows')
                if rerun_query:
                    print('Time for query           :', query_timings[-1])
                print('Time for fauxify         :', fauxify_timings[-1])
        # per-size summary across all repetitions
        print('Method used             :', type(kfpd.plugin).__name__)
        print('Size of dataset returned:', len(fdf), 'rows')
        print('    Fauxify Mean   :', np.mean(fauxify_timings))
        print('    Fauxify Std Dev:', np.std(fauxify_timings))
        if rerun_query:
            print('    Query Mean   :', np.mean(query_timings))
            print('    Query Std Dev:', np.std(query_timings))
        else:
            print('    See previous run for query timings')
    return fdf

In [None]:
from importlib import reload
from kungfauxpandas import KungFauxPandas, TrivialPlugin, DataSynthesizerPlugin, KDEPlugin, KFP_DataDescriber
# single KungFauxPandas instance shared by the benchmark cells below; each
# cell selects the method under test by assigning kfpd.plugin
kfpd = KungFauxPandas()

In [None]:
#kfpd.plugin = TrivialPlugin()
#fdf = time_method(kfpd, verbose = False, repetitions = 10)
#fdf.head()

In [None]:
# TrivialPlugin: 10 timed fauxify runs per population size, reusing cached
# query results instead of re-running the query
kfpd.plugin = TrivialPlugin()
fdf = time_method(kfpd, repetitions=10, verbose=False, rerun_query=False)
fdf.head()

### Kernel Density Estimator Plugin testing

In [None]:
# KDEPlugin in independent_attribute_mode: 10 timed fauxify runs per
# population size on cached query results
kfpd.plugin = KDEPlugin(mode='independent_attribute_mode', verbose=False)
fdf = time_method(kfpd, repetitions=10, verbose=False, rerun_query=False)
fdf.head()

In [None]:
# KDEPlugin in correlated_attribute_mode: 10 timed fauxify runs per
# population size on cached query results
kfpd.plugin = KDEPlugin(mode='correlated_attribute_mode', verbose=False)
fdf = time_method(kfpd, repetitions=10, verbose=False, rerun_query=False)
fdf.head()

### DataSynthesizer, two different methods with no configuration

In [None]:
#kfpd.plugin = DataSynthesizerPlugin(mode='correlated_attribute_mode')
#for n in ['10', '100', '1000', '10000', '100000']:
#    fdf=kfpd.read_sql(sql + n,conn)
#    show_timings(fdf)

# DataSynthesizerPlugin, correlated mode, no further configuration:
# 10 timed fauxify runs per population size on cached query results
kfpd.plugin = DataSynthesizerPlugin(mode='correlated_attribute_mode')
fdf = time_method(kfpd, repetitions=10, verbose=False, rerun_query=False)
fdf.head()

In [None]:
#kfpd.plugin = DataSynthesizerPlugin(mode='independent_attribute_mode')
#for n in ['10', '100', '1000', '10000', '100000']:
#    fdf=kfpd.read_sql(sql + n,conn)
#    show_timings(fdf)

# DataSynthesizerPlugin, independent mode, no further configuration:
# 10 timed fauxify runs per population size on cached query results
kfpd.plugin = DataSynthesizerPlugin(mode='independent_attribute_mode')
fdf = time_method(kfpd, repetitions=10, verbose=False, rerun_query=False)
fdf.head()

### Now try DataSynthesizerPlugin with some manual configuration

In [None]:
# DataSynthesizerPlugin, correlated mode, with the candidate keys and the
# categorical/non-categorical columns declared explicitly
kfpd.plugin = DataSynthesizerPlugin(
    mode='correlated_attribute_mode',
    candidate_keys={'SubjectId': True, 'EncounterId': True},
    categorical_attributes={
        'Source': True,
        'Code': True,
        'Type': True,
        'MaxScore': False,
        'MinScore': False,
        'NumLoggedScores': False,
    },
)
fdf = time_method(kfpd, repetitions=10, verbose=False, rerun_query=False)
fdf.head()

In [None]:
# DataSynthesizerPlugin, independent mode, with the candidate keys and the
# categorical/non-categorical columns declared explicitly
kfpd.plugin = DataSynthesizerPlugin(
    mode='independent_attribute_mode',
    candidate_keys={'SubjectId': True, 'EncounterId': True},
    categorical_attributes={
        'Source': True,
        'Code': True,
        'Type': True,
        'MaxScore': False,
        'MinScore': False,
        'NumLoggedScores': False,
    },
)
fdf = time_method(kfpd, repetitions=10, verbose=False, rerun_query=False)
fdf.head()

In [None]:
# testing changes to degree_of_bayesian_network: same explicit configuration
# as the correlated-mode run, but with degree 3 (default is 2)
kfpd.plugin = DataSynthesizerPlugin(
    mode='correlated_attribute_mode',
    candidate_keys={'SubjectId': True, 'EncounterId': True},
    categorical_attributes={
        'Source': True,
        'Code': True,
        'Type': True,
        'MaxScore': False,
        'MinScore': False,
        'NumLoggedScores': False,
    },
    degree_of_bayesian_network=3,
)
fdf = time_method(kfpd, repetitions=10, verbose=False, rerun_query=False)
fdf.head()

In [None]:
# Small hand-built frame (10 rows) with an id column, an alphanumeric code, a
# constant column, a categorical column and an integer score; the datetime and
# float columns are left commented out.
test_df = pd.DataFrame({
    'unique_id': [40552133, 83299697, 96360391, 43551783, 92110570,
                  87411981, 26772988, 87390284, 34538374, 13208258],
    #'datetime': ['2017-11-09 02:26:13', '2017-07-20 20:35:41', '2017-12-23 22:48:30', '2017-10-04 05:19:36', '2017-10-15 04:03:31', '2017-08-12 11:35:34', '2017-08-07 12:57:29', '2017-09-20 12:17:48', '2017-08-23 12:39:54', '2017-06-29 07:59:25'],
    'alpha_numeric_code': ['A4152', 'A414', 'A400', 'A392', 'A4151',
                           'A392', 'A4181', 'P369', 'B377', 'R6521'],
    'constant': ['constant_value'] * 10,
    'categorical': ['category1', 'category2', 'category1', 'category1', 'category2',
                    'category1', 'category2', 'category1', 'category2', 'category3'],
    #'float_score': [30.80887770115334, 31.647178703213896, 33.23121156661242, 33.64713140102367, 33.07404123596502, 34.206309535666364, 34.90974444556692, 39.06948372169004, 35.94952085309618, 29.5140595543271],
    'int_score': [294, 286, 278, 272, 256, 242, 216, 210, 208, 190],
})

# run the frame through each plugin in turn and show the first rows produced
kfpd.plugin = TrivialPlugin()
fdf = kfpd.plugin.fauxify(test_df)
print(fdf.head())

kfpd.plugin = KDEPlugin(verbose=False)
fdf = kfpd.plugin.fauxify(test_df)
print(fdf.head())

kfpd.plugin = DataSynthesizerPlugin(mode="independent_attribute_mode")
fdf = kfpd.plugin.fauxify(test_df)
print(fdf.head())

# keep the input frame on disk (without the row index)
test_df.to_csv('sample_data_no_dates.csv', index=False)