In [18]:
import sqlite3
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats
import pylab as plt
from collections import Counter
import datetime

%matplotlib notebook

# Backend that holds the sepsis data: 'sqlite3' or 'psycopg2'.
mode = 'sqlite3'

# Open the data connection for the selected backend.
if mode == 'sqlite3':
    # sqlite3 is already imported at the top of this cell.
    conn = sqlite3.connect("../../data/sepsis.db")
elif mode == 'psycopg2': # alternatively use postgresql
    import psycopg2  # imported lazily so sqlite-only setups do not need it
    # NOTE(review): credentials are hard-coded here; consider reading them from the environment.
    connect_str = "dbname='sepsis' user='sepsis' host='localhost' " + \
                  "password='sepsis'"
    conn = psycopg2.connect(connect_str)
else:
    # Fail here with a clear message instead of a NameError on `conn` below.
    raise ValueError("Unsupported mode %r; expected 'sqlite3' or 'psycopg2'" % (mode,))
cursor = conn.cursor()

# Separate SQLite database holding the kfp_log table that show_timings reads.
qlog_conn = sqlite3.connect('../../data/kfp_log.db')
q_cursor = qlog_conn.cursor()

start = datetime.datetime.now()
# Because names were created as case-sensitive in postgres, they must be quoted...
# should probably fix that...
# The statement deliberately ends with a bare "limit": later cells append the
# row count (e.g. sql + '10') to complete it.
sql = '''
SELECT d."SubjectId",
    d."EncounterId",
    d."Source",
    -- d.StartDate,
    d."Code",
    d."Type",
     MAX("FlowsheetValue") AS MaxScore,
     AVG("FlowsheetValue") AS MeanScore,
     MIN("FlowsheetValue") AS MinScore,
     COUNT("FlowsheetValue") AS NumLoggedScores
 FROM diagnoses d
 LEFT JOIN flowsheet f
 ON d."EncounterId" = f."EncounterId"
 GROUP BY d."SubjectId", d."EncounterId", d."Source", d."Code", d."Type"
 ORDER BY NumLoggedScores DESC
 limit
'''

#sql = 'SELECT subjectid, encounterid, source, code, type FROM "diagnoses" limit 100'


#df = pd.read_sql(sql,conn, index_col=['SubjectId', 'EncounterId'])
#   df = pd.read_sql(sql,conn)
#df['StartDate'] = df['StartDate'].astype('datetime64')
# print(df.dtypes)
# print('Elapsed time:', datetime.datetime.now() - start)
# df.head()

# sqlite - 42 to 60 seconds
# postgres - 29 seconds

In [14]:
def show_timings(df):
    """Print the method and timings of the latest kfp_log entry, plus len(df).

    Reads the whole ``kfp_log`` table through the module-level ``qlog_conn``
    connection, ordered by ``fauxify_end``, and reports on the last row.
    When the log is empty a notice is printed instead of raising IndexError.

    Parameters
    ----------
    df : sized
        Data set whose row count is reported; only ``len(df)`` is used.
    """
    q = pd.read_sql("SELECT * FROM kfp_log order by fauxify_end",qlog_conn)
    if q.empty:
        print('No entries found in kfp_log; timings unavailable')
    else:
        # Pull the last row once instead of re-slicing the frame for every field.
        last = q.iloc[-1]
        query_time = pd.to_datetime(last['query_end']) - pd.to_datetime(last['query_start'])
        fauxify_time = pd.to_datetime(last['fauxify_end']) - pd.to_datetime(last['fauxify_start'])
        print('Method used     :', last['faux_method'])
        print('Time for query  :', query_time)
        print('Time for fauxify:', fauxify_time)
    print('Size of dataset :', len(df), 'rows')

In [15]:
from importlib import reload
from kungfauxpandas import (
    KungFauxPandas,
    TrivialPlugin,
    DataSynthesizerPlugin,
    KDEPlugin,
)

# Shared KungFauxPandas instance; later cells set its .plugin before calling read_sql.
kfpd = KungFauxPandas()

In [20]:
# Time the TrivialPlugin at increasing row limits.
kfpd.plugin = TrivialPlugin()
for row_limit in (10, 100, 1000, 10000):
    # sql ends with a bare "limit", so appending the number completes the statement
    fdf = kfpd.read_sql(sql + str(row_limit), conn)
    show_timings(fdf)

# Preview the frame returned by the last (largest) request
fdf.head()

Method used     : TrivialPlugin
Time for query  : 0 days 00:00:00.670517
Time for fauxify: 0 days 00:00:00.000025
Size of dataset : 10 rows
Method used     : TrivialPlugin
Time for query  : 0 days 00:00:00.723215
Time for fauxify: 0 days 00:00:00.000007
Size of dataset : 100 rows
Method used     : TrivialPlugin
Time for query  : 0 days 00:00:00.668223
Time for fauxify: 0 days 00:00:00.000007
Size of dataset : 1000 rows
Method used     : TrivialPlugin
Time for query  : 0 days 00:00:00.695884
Time for fauxify: 0 days 00:00:00.000008
Size of dataset : 5351 rows


Unnamed: 0,SubjectId,EncounterId,Source,Code,Type,MaxScore,MeanScore,MinScore,NumLoggedScores
0,40552133,288,Encounter,A4152,ICD-10-CM,100.0,30.808878,0.0,294
1,83299697,625,Encounter,A414,ICD-10-CM,100.0,31.647179,0.0,286
2,96360391,985,Encounter,A400,ICD-10-CM,100.0,33.231212,0.0,278
3,43551783,984,Encounter,A392,ICD-10-CM,98.0,33.647131,0.0,272
4,92110570,934,Encounter,A4151,ICD-10-CM,99.0,33.074041,0.0,256


In [22]:
# Fauxify the 10-row query result with the DataSynthesizer plugin.
# NOTE(review): the recorded output of this cell ends with
# "ValueError: Length of values does not match length of index".
kfpd.plugin = DataSynthesizerPlugin()
fdf = kfpd.read_sql(sql + ' 10', conn)

# Variants tried earlier, kept for reference:
#fdf=kfpd.read_sql(sql,conn, index_col=['SubjectId', 'EncounterId'])
#fdf=kfpd.read_sql(sql,conn, fauxify = {'categorical_attributes' : {'Source' : True}})
#df=pd.read_sql(sql + ' 10',conn)
#fdf = kfpd.plugin.fauxify(df)#, categorical_attributes = {'Source' : True, 'Code': True, 'Type': True})

# Only the last expression of a cell is displayed, so a single head() is enough.
fdf.head()

Skipping read from csv and returing the input data frame
Looking for next attribute-parents pair.
    Considering attribute MinScore
    Considering attribute MaxScore
    Considering attribute Type
    Considering attribute Source
Looking for next attribute-parents pair.
    Considering attribute MinScore
    Considering attribute MaxScore
    Considering attribute Type
Looking for next attribute-parents pair.
    Considering attribute MinScore
    Considering attribute MaxScore
Looking for next attribute-parents pair.
    Considering attribute MaxScore
kungfauxpandas.read_sql() exception while attempting to fauxify data: Length of values does not match length of index


ValueError: Length of values does not match length of index

In [7]:
# Time the KDEPlugin at increasing row limits.
kfpd.plugin = KDEPlugin()
for limit_text in map(str, (10, 100, 1000, 10000)):
    # the limit is appended to the query text, which ends with "limit"
    fdf = kfpd.read_sql(sql + limit_text, conn)
    show_timings(fdf)

Processing column SubjectId as a int64
Processing column EncounterId as a int64
Processing column Source as a object
Processing column Code as a object
Processing column Type as a object
Processing column maxscore as a float64
Processing column minscore as a object
Processing column numloggedscores as a int64
Method used     : KDEPlugin
Time for query  : 0 days 00:00:29.382304
Time for fauxify: 0 days 00:00:00.314669
Size of dataset : 10 rows
Processing column SubjectId as a int64
Processing column EncounterId as a int64
Processing column Source as a object
Processing column Code as a object
Processing column Type as a object
Processing column maxscore as a object
Processing column minscore as a object
Processing column numloggedscores as a int64
Method used     : KDEPlugin
Time for query  : 0 days 00:00:31.511094
Time for fauxify: 0 days 00:00:00.006992
Size of dataset : 100 rows
Processing column SubjectId as a int64
Processing column EncounterId as a int64
Processing column Source a

In [8]:
# Display the frame most recently returned by kfpd.read_sql.
fdf

Unnamed: 0,SubjectId,EncounterId,Source,Code,Type,maxscore,minscore,numloggedscores
0,47813182,60086,Encounter,A4152,ICD-10-CM,100.000000,0.0,198
1,76765512,71737,Encounter,A413,ICD-10-CM,99.646631,0.0,220
2,94573700,94551,Encounter,K8592,ICD-10-CM,99.000000,0.0,222
3,95611412,55704,Encounter,A4150,ICD-10-CM,96.000000,0.0,218
4,72305258,90022,Encounter,A408,ICD-10-CM,100.000000,0.0,212
5,15933287,29783,Encounter,P364,ICD-10-CM,100.000000,0.0,262
6,27924017,91407,Encounter,P368,ICD-10-CM,100.000000,0.0,218
7,79250704,75877,Encounter,P360,ICD-10-CM,100.000000,0.0,254
8,74752671,25205,Encounter,A427,ICD-10-CM,97.000000,0.0,210
9,80799905,90734,Encounter,R6521,ICD-10-CM,99.000000,0.0,222


In [None]:
# Build a synthetic 'Code' column by drawing the observed codes with
# probability equal to their relative frequency in df.
col = 'Code'
out_dict = dict()

# colfact[0]: integer code per row; colfact[1]: the unique values those codes index.
colfact = df[col].factorize()
cc=Counter(colfact[0])

# convert from counts to proportions
for key in cc:
    cc[key] = cc[key] / len(df)

# Candidate codes and their sampling weights, in matching order.
# (These names and `choice` were previously used without being defined.)
elements = list(cc.keys())
weights = list(cc.values())

fakes = np.random.choice(elements, p=weights, replace=True, size=len(df))

# Map the sampled integer codes back to the original values.
out_dict[col] = [colfact[1][xx] for xx in fakes]


In [None]:
# Number of entries in cc, number of rows in df, and the ratio of the two.
len(cc.values()), len(df), len(cc)/len(df)

In [None]:
# Same frequency-weighted resampling of the 'Code' column as the earlier cell.
# NOTE(review): this cell duplicates that one; consider keeping only one copy.
col = 'Code'
out_dict = dict()

# colfact[0]: integer code per row; colfact[1]: the unique values those codes index.
colfact = df[col].factorize()
cc=Counter(colfact[0])

# convert from counts to proportions
for key in cc:
    cc[key] = cc[key] / len(df)

# Candidate codes and their sampling weights, in matching order.
# (These names and `choice` were previously used without being defined.)
elements = list(cc.keys())
weights = list(cc.values())

fakes = np.random.choice(elements, p=weights, replace=True, size=len(df))

# Map the sampled integer codes back to the original values.
out_dict[col] = [colfact[1][xx] for xx in fakes]
#out_dict

In [None]:
# Fit a Gaussian KDE (Silverman bandwidth) to the SubjectId column and store
# one resampled draw, cast to int64, in out_dict.
# NOTE(review): `kd` is rebound by a later `import ... as kd` cell.
col = 'SubjectId'
kd = stats.gaussian_kde(df[col], bw_method='silverman')
out_dict[col]=np.int64(kd.resample()[0])


In [None]:
# Preview the first rows of df.
# NOTE(review): df is not assigned in any visible cell (the read_sql calls in
# the first cell are commented out) — confirm where it is loaded.
df.head()

In [None]:
# Cross-tabulate Code against squishcode (fixed the `df.Codeode` attribute typo).
# NOTE(review): squishcode is not created in any visible cell — confirm its origin.
pd.crosstab(df.Code, df.squishcode)

In [None]:
# Correlation coefficient matrix of the Code and squishcode columns.
# NOTE(review): np.corrcoef needs numeric input; if Code holds strings such as
# 'A4152' (as in the query output above) it must be factorized first — confirm.
np.corrcoef(df.Code, df.squishcode)

In [None]:
# Work on a random sample of at most 50000 rows. The size is capped at len(df)
# because DataFrame.sample raises ValueError when asked for more rows than exist.
sdf = df.sample(min(50000, len(df)))
# Replace every object-typed column with its integer factor codes.
for thiscol in sdf.columns:
    if sdf[thiscol].dtype=='object':
        print('Converting column ', thiscol)
        sdf[thiscol] = sdf[thiscol].factorize()[0]

#np.cov(sdf)

In [None]:
# Correlation matrix across the columns of sdf (transposed so each column is
# one variable), drawn as a heat map with a colour bar.
# NOTE(review): this rebinds `cc`, which earlier cells use for a Counter.
cc = np.corrcoef(sdf.transpose())
#cc = np.cov(sdf.transpose())
#cc[5,1]
plt.imshow(cc,cmap='inferno')
plt.colorbar()

In [None]:
#sdf.head()
#help(np.correlate)
# Show the row at position 3 of df.
df.iloc[3]

In [None]:
from statsmodels.nonparametric import kernel_density as kd

In [None]:
# Fit a multivariate KDE on the sdf columns at positions 2, 4 and 9, each
# declared with var_type 'u'.
# NOTE(review): `kd` must be the statsmodels kernel_density module here; an
# earlier cell binds the same name to a scipy gaussian_kde object.
woo = kd.KDEMultivariate(np.array(sdf.iloc[:,[2,4,9]]), var_type=3*'u')
#help(kd.KDEMultivariate)

In [None]:
# 2000 sampled rows of the three KDE columns as a plain array.
# np.array takes its input as the first positional argument; it has no `data` keyword.
np.array(sdf.sample(2000).iloc[:,[2,4,9]])

In [None]:
# Imported here because itertools is otherwise only imported in a later cell.
import itertools

# All (i, j, k) triples with each index in 0..39.
xx = range(40)
bb = list(itertools.product(xx,xx,xx))

In [None]:
# Shape of the sdf row at position 2 once converted to an array.
np.array(sdf.iloc[2]).shape

In [None]:
from scipy.optimize import fsolve
import statsmodels.api as sm
import numpy as np

# fit
# Reuse the KDE already fitted as `woo`.
kde = woo#sm.nonparametric.KDEMultivariate()  # ... you already did this

# sample
# One uniform draw in [0, 1): the target CDF level.
u = np.random.random()

# 1-d root-finding
def func(x):
    """Return ``kde.cdf([x]) - u``; a root is a point where the fitted CDF equals ``u``."""
    return kde.cdf([x]) - u
#sample_x = brentq(func, -99999999, 99999999)  # read brentq-docs about these constants
                                              # constants need to be sign-changing for the function


In [None]:
#u = np.random.random()
#u
#sample_x = brentq(func, -99999999, 99999999)

In [None]:
# NOTE(review): same definition of func as in the earlier root-finding cell.
def func(x):
    """Return ``kde.cdf([x]) - u`` using the module-level ``kde`` and ``u``."""
    return kde.cdf([x]) - u

# Ten-element starting point passed to the optimiser in the next cell.
x0=[92,4,5,3,6,7,8,9,10,11]


In [None]:
from scipy.optimize import minimize
# Minimise func starting from x0 and print the optimiser's result object.
# NOTE(review): func was written as a root-finding residual (cdf - u);
# minimising it does not look for the zero crossing — confirm intent.
darf = minimize(func,np.array(x0))
print(darf)

In [None]:
# Show the starting point alongside func evaluated there.
x0, func(x0)


In [None]:
# Evaluate func at a hand-picked 10-element point.
func([0,0,0,0,0,3,0,0,0,0])

In [None]:
# Dense integer grid over 0..9 on three axes; the result has shape (3, 10, 10, 10).
bork = np.mgrid[0:10,0:10, 0:10]

In [None]:
import itertools

# Every 10-tuple with entries drawn from 0..3 (4**10 points).
xx = range(4)
ins = list(itertools.product(xx, repeat=10))

# Evaluate func point by point on a 1000-element window, then once on the whole window.
vals = list(map(func, ins[1004:2004]))
func(ins[1004:2004])

In [None]:
# NOTE(review): bork comes from np.mgrid[0:10,0:10,0:10], whose first axis has
# length 3, so index 32532 is out of range — confirm what was meant here.
func(bork[32532])

In [None]:
# Show the current uniform draw used by func.
u


In [None]:
#kde.cdf(bork[9000:10000])
# Evaluate func at the starting point x0.
func(x0)

In [None]:
# List the sub-arrays along the first axis of bork's first component.
list(bork[0])

In [None]:
# Show the starting point.
x0

In [None]:
import statsmodels.api as sm
# Two columns of 300 seeded normal draws (second one with mean 2, sd 1).
# NOTE(review): c1/c2 are only referenced by the commented-out call below.
nobs = 300
np.random.seed(1234)  # Seed random generator
c1 = np.random.normal(size=(nobs,1))
c2 = np.random.normal(2, 1, size=(nobs,1))
  
#Estimate a bivariate distribution and display the bandwidth found:
   
#dens_u = sm.nonparametric.KDEMultivariate(data=[c1,c2], var_type='cc', bw='normal_reference')
#dens_u.bw

# Refit `woo` on the sdf columns at positions 2, 4 and 9, each with var_type 'u'.
woo = sm.nonparametric.KDEMultivariate(data=sdf.iloc[:,[2,4,9]], var_type=3*'u')



In [None]:
# Evaluate the fitted CDF with no explicit prediction points.
woo.cdf()

In [None]:
# Number of rows in the sampled frame.
len(sdf)

In [None]:
# Number of distinct values in the sdf column at position 9.
len(set(sdf.iloc[:,9]))

In [None]:
# Correlation between the sdf columns at positions 2 and 9.
# rowvar=False makes each column a variable. With the default, every row is
# treated as a variable and the result is a len(sdf) x len(sdf) matrix.
np.corrcoef(sdf.iloc[:,[2,9]], rowvar=False)