In [None]:
import sqlite3
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats
import pylab as plt
from collections import Counter
import datetime

%matplotlib notebook

# Flag selecting which backend the sepsis data is read from:
# 'sqlite3' (local database file) or 'psycopg2' (PostgreSQL server).
mode = 'psycopg2'

if mode == 'sqlite3':
    import sqlite3
    conn = sqlite3.connect("../../data/sepsis.db")
    cursor = conn.cursor()
elif mode == 'psycopg2': # alternatively use postgresql
    import os
    import psycopg2
    # Connection settings may be overridden through SEPSIS_DB_* environment
    # variables so real credentials do not have to live in the notebook.
    # The fallbacks reproduce the previously hard-coded connection string.
    connect_str = "dbname='{}' user='{}' host='{}' password='{}'".format(
        os.environ.get('SEPSIS_DB_NAME', 'sepsis'),
        os.environ.get('SEPSIS_DB_USER', 'sepsis'),
        os.environ.get('SEPSIS_DB_HOST', 'localhost'),
        os.environ.get('SEPSIS_DB_PASSWORD', 'sepsis'),
    )
    conn = psycopg2.connect(connect_str)
    cursor = conn.cursor()
else:
    # Fail here rather than with a NameError on `conn` in a later cell.
    raise ValueError(
        "Unsupported mode {!r}; expected 'sqlite3' or 'psycopg2'".format(mode)
    )

# Separate SQLite database holding the kfp_log table that show_timings reads.
qlog_conn = sqlite3.connect('../../data/kfp_log.db')
q_cursor = qlog_conn.cursor()

# Timestamp taken when this setup cell is executed.
start = datetime.datetime.now()
# Column names were created as case-sensitive in postgres, so they must be
# double-quoted in the query below.
# TODO: consider recreating the tables so the quoting is no longer needed.
# NOTE: the template deliberately ends with a bare "limit"; the benchmark
# cells append the row count (sql + n) to form the complete statement.
sql = '''
SELECT d."SubjectId",
    d."EncounterId",
    d."Source",
    -- d.StartDate,
    d."Code",
    d."Type",
    MAX("FlowsheetValue") AS MaxScore,
    -- AVG("FlowsheetValue") AS MeanScore,
    MIN("FlowsheetValue") AS MinScore,
    COUNT("FlowsheetValue") AS NumLoggedScores
 FROM diagnoses d
 LEFT JOIN flowsheet f
 ON d."EncounterId" = f."EncounterId"
 GROUP BY d."SubjectId", d."EncounterId", d."Source", d."Code", d."Type"
 ORDER BY NumLoggedScores DESC
 limit
'''
# Timings recorded for this query on each database
# (NOTE(review): presumably measured with the read_sql call below and an
# explicit row limit or no limit clause -- confirm):
# df = pd.read_sql(sql,conn)
# sqlite - 42 to 60 seconds
# postgres - 30 seconds

# Alternative minimal query (disabled):
#sql = 'SELECT subjectid, encounterid, source, code, type FROM "diagnoses" limit 100'


In [None]:
def show_timings(df, log_conn=None):
    """Print the method and timings of the most recent kfp_log entry.

    The kfp_log table is read ordered by ``fauxify_end`` and only its last
    row is reported, together with the number of rows in ``df``.

    Parameters
    ----------
    df : sized object (e.g. a DataFrame)
        Result of the query being reported; only ``len(df)`` is used.
    log_conn : DB connection, optional
        Connection holding the ``kfp_log`` table. Defaults to the
        notebook-level ``qlog_conn``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    if log_conn is None:
        log_conn = qlog_conn
    q = pd.read_sql("SELECT * FROM kfp_log order by fauxify_end", log_conn)
    if q.empty:
        # Previously this raised IndexError from .iloc[0] on an empty frame.
        print('No entries found in kfp_log')
        print('Size of dataset :', len(df), 'rows')
        return

    # Take the last row once instead of recomputing q.tail(1) per field.
    latest = q.iloc[-1]
    query_time = pd.to_datetime(latest['query_end']) - pd.to_datetime(latest['query_start'])
    fauxify_time = pd.to_datetime(latest['fauxify_end']) - pd.to_datetime(latest['fauxify_start'])
    print('Method used     :', latest['faux_method'])
    print('Time for query  :', query_time)
    print('Time for fauxify:', fauxify_time)
    print('Size of dataset :', len(df), 'rows')

In [None]:
# NOTE(review): `reload` is not used in any of the visible cells.
from importlib import reload
from kungfauxpandas import KungFauxPandas, TrivialPlugin, DataSynthesizerPlugin, KDEPlugin
# Single KungFauxPandas instance shared by the benchmark cells: each cell
# assigns a plugin to kfpd.plugin and then runs the query via kfpd.read_sql.
kfpd = KungFauxPandas()

In [None]:
# Run the query through TrivialPlugin at increasing row limits and report
# the logged timings after each run.
kfpd.plugin = TrivialPlugin()
for n in map(str, (10, 100, 1000, 10000, 100000)):
    # The SQL template ends with "limit", so appending n completes it.
    fdf = kfpd.read_sql(sql + n, conn)
    show_timings(fdf)

# Preview the result of the last (largest) run.
fdf.head()

In [None]:
# Same benchmark with KDEPlugin: one run per row limit, timings printed
# after each run.
kfpd.plugin = KDEPlugin()
for n in map(str, (10, 100, 1000, 10000, 100000)):
    # The SQL template ends with "limit", so appending n completes it.
    fdf = kfpd.read_sql(sql + n, conn)
    show_timings(fdf)

In [None]:
# DataSynthesizerPlugin constructed with no arguments. No query is run with
# this instance in this cell; the next code cell assigns kfpd.plugin again
# before calling read_sql.
kfpd.plugin = DataSynthesizerPlugin()


### DataSynthesizer, two different methods with no configuration

In [None]:
# DataSynthesizerPlugin in correlated-attribute mode, no further options:
# one run per row limit, timings printed after each run.
kfpd.plugin = DataSynthesizerPlugin(mode='correlated_attribute_mode')
for n in map(str, (10, 100, 1000, 10000, 100000)):
    # The SQL template ends with "limit", so appending n completes it.
    fdf = kfpd.read_sql(sql + n, conn)
    show_timings(fdf)

# Preview the result of the last (largest) run.
fdf.head()

In [None]:
# DataSynthesizerPlugin in independent-attribute mode, no further options:
# one run per row limit, timings printed after each run.
kfpd.plugin = DataSynthesizerPlugin(mode='independent_attribute_mode')
for n in map(str, (10, 100, 1000, 10000, 100000)):
    # The SQL template ends with "limit", so appending n completes it.
    fdf = kfpd.read_sql(sql + n, conn)
    show_timings(fdf)

# Preview the result of the last (largest) run.
fdf.head()

### Now try DataSynthesizerPlugin with some manual configuration

In [None]:
# Correlated-attribute mode with manual hints: the id columns are flagged as
# candidate keys and the coded columns as categorical attributes.
kfpd.plugin = DataSynthesizerPlugin(
    mode='correlated_attribute_mode',
    candidate_keys=dict.fromkeys(('SubjectId', 'EncounterId'), True),
    categorical_attributes=dict.fromkeys(('Source', 'Code', 'Type'), True),
)
# Only the 10-row limit is active; 100, 1000, 10000 and 100000 are switched off.
for n in map(str, (10,)):
    fdf = kfpd.read_sql(sql + n, conn)
    show_timings(fdf)

# Preview the result of the last run.
fdf.head()

In [None]:
# Independent-attribute mode with manual hints: the id columns are flagged as
# candidate keys and the coded columns as categorical attributes.
kfpd.plugin = DataSynthesizerPlugin(
    mode='independent_attribute_mode',
    candidate_keys=dict.fromkeys(('SubjectId', 'EncounterId'), True),
    categorical_attributes=dict.fromkeys(('Source', 'Code', 'Type'), True),
)
for n in map(str, (10, 100, 1000, 10000, 100000)):
    # The SQL template ends with "limit", so appending n completes it.
    fdf = kfpd.read_sql(sql + n, conn)
    show_timings(fdf)

# Preview the result of the last (largest) run.
fdf.head()

In [None]:
# Correlated-attribute mode with the same manual hints as before, but with
# degree_of_bayesian_network raised from its default.
# (A stray bare `degree_of_bayesian_network` expression that raised NameError
# on a fresh kernel has been removed from the top of this cell.)
kfpd.plugin = DataSynthesizerPlugin(mode='correlated_attribute_mode',
                                    candidate_keys = {'SubjectId': True, 'EncounterId': True},
                                    categorical_attributes = {'Source': True, 'Code': True, 'Type': True},
                                    degree_of_bayesian_network = 3 # default is 2
                                   )
# Only the 10-row limit is active; the larger limits are commented out.
for n in ['10']: #, '100', '1000', '10000', '100000']:
    fdf=kfpd.read_sql(sql + n,conn)
    show_timings(fdf)

# Preview the result of the last run.
fdf.head()