In [1]:
import json
import os
import pandas as pd
import time

from datetime import date as d
import argparse
import csv
import itertools as it
from collections import OrderedDict

import sys
import yaml
import pandas.io.sql as sqlio
import psycopg2

import hcc_risk_models as hrm

In [2]:
def small_test():
    """Score two hard-coded patients with the 'V2216_79_O2' model.

    Builds a two-row demographics frame and a 39-row diagnoses frame
    (30 codes for patient '07358519', 9 codes for patient '07098439')
    and hands both to ``hrm.evaluate_model``.

    Returns:
        Whatever ``hrm.evaluate_model`` returns for these inputs.
    """
    # Alternative model: 'V2217_79_O1' (accepts icd10 only).
    model_name = 'V2216_79_O2'

    first_pt, second_pt = '07358519', '07098439'

    first_pt_codes = [
        'S0990XA', 'M1990', 'M79642', 'I8310', 'M79674', 'I480',
        'M79606', 'M1711', 'G629', 'I872', 'I2510', 'M25432',
        'W19XXXA', 'S5002XA', 'M79675', 'I10', 'G92', 'J449',
        'S61213A', 'Z5181', 'R55', 'B351', 'M79672', 'M7989',
        'Z6828', 'I4891', 'M25422', 'M25532', 'Z01810', 'Z86718',
    ]
    second_pt_codes = [
        'I10', 'V722', 'E785', 'Z0120', 'Z01818', 'M542', 'E119', 'M4712', 'M4722',
    ]

    # Every code carries diag_type 0 except 'V722', which carries 9
    # (presumably the ICD-9 marker -- confirm against hcc_risk_models).
    first_pt_types = [0] * len(first_pt_codes)
    second_pt_types = [9 if code == 'V722' else 0 for code in second_pt_codes]

    demographics = pd.DataFrame({
        'pt_id': [first_pt, second_pt],
        'sex': [1, 1],
        'dob': ['1944-6-6', '1946-1-1'],
        'ltimcaid': [0, 0],
        'nemcaid': [0, 0],
        'orec': [0, 0],
    })

    diagnoses = pd.DataFrame({
        'pt_id': [first_pt] * len(first_pt_codes) + [second_pt] * len(second_pt_codes),
        'diag_code': first_pt_codes + second_pt_codes,
        'diag_type': first_pt_types + second_pt_types,
    })

    return hrm.evaluate_model(model_name, demographics, diagnoses)

In [160]:
def test2():

    #model = 'V2217_79_O1'  # accepts icd10 only
    model = 'V2216_79_O2'

    demographics = pd.DataFrame({
        'pt_id': ['10040682'],
        'sex': [1],
        'dob': ['1944-6-6'],
        'ltimcaid': [0],
        'nemcaid': [0],
        'orec': [0],
    })

    diagnoses = pd.DataFrame({
        'pt_id': ['10040682'] * 34,
        'diag_code': ['E785', 'R252', 'Z86010', 'E519', 'E538', 
                      'F329', 'Z01818', 'Z01818', 'K580', 'R252', 
                      'M75110', 'Z7902', 'M25519', 'I2510', 'M19011', 
                      'I2510', 'I2510', 'Z800', 'N390', 'R9082', 
                      'M25511', 'M25519', 'R9082', 'E559', 'M75121', 
                      'E785', 'M24111', 'I2510', 'G8918', 'M94211', 
                      'M75101', 'M75111', 'I2510', 'G8918'],
        'diag_type': [0] * 34,
    })

    result = hrm.evaluate_model(model, demographics, diagnoses)
    return result

In [254]:
def big_test(episode_id='09791223'):
    """Score episodes pulled from the warehouse with the 'V2216_79_O2' model.

    Reads distinct (episode, diagnosis code, ICD version) rows from
    ``data_science.bcop_dx_episode``, builds the demographics and diagnoses
    frames expected by ``hrm.evaluate_model`` and times the evaluation.
    Sex, date of birth and the Medicaid/OREC flags are constants supplied by
    the query / this function, not real patient attributes.

    Args:
        episode_id: Episode to score. Defaults to '09791223' (the episode the
            query was originally hard-coded to). Pass ``None`` to score every
            episode in the table.

    Returns:
        Whatever ``hrm.evaluate_model`` returns for the selected episodes.

    Raises:
        RuntimeError: If the ``DW_DEV_CONN`` environment variable is not set.
    """
    model = 'V2216_79_O2'
    redshift_schema = 'data_science'
    long_form_dx_table = 'bcop_dx_episode'

    conn_string = os.environ.get("DW_DEV_CONN")
    if not conn_string:
        raise RuntimeError(
            "Environment variable DW_DEV_CONN is not set; cannot connect to the warehouse")

    sql = """ SELECT DISTINCT episode_id, diagnosis_code, icd_version, '1' as sex_cd,
    '1944-06-06' as date_of_birth,  '0' as mcaid_cd FROM {}.{}""".format(
        redshift_schema, long_form_dx_table)
    params = None
    if episode_id is not None:
        # Bind the id as a query parameter instead of splicing it into the SQL text.
        sql += " where episode_id=%s"
        params = (episode_id,)

    conn = psycopg2.connect(conn_string)
    try:
        with conn.cursor() as cur:
            cur.execute("SET search_path TO {}".format(redshift_schema))
        episode_dx = sqlio.read_sql_query(sql, conn, params=params)
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    print ("Number of episodes: {}".format(len(episode_dx['episode_id'].unique())))
    print ("Number of dx codes: {}".format(len(episode_dx)))

    # One demographics row per distinct (episode, sex, dob) combination.
    dem = episode_dx[['episode_id', 'sex_cd', 'date_of_birth']].drop_duplicates()
    demographics = dem.rename(columns={
        'episode_id': 'pt_id',
        'sex_cd': 'sex',
        'date_of_birth': 'dob',
    }).assign(ltimcaid=0, nemcaid=0, orec=0)

    diagnoses = pd.DataFrame({
        'pt_id': episode_dx['episode_id'],
        'diag_code': episode_dx['diagnosis_code'],
        'diag_type': episode_dx['icd_version'].astype(int),
    })

    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    # for measuring elapsed time.
    start = time.perf_counter()
    result = hrm.evaluate_model(model, demographics, diagnoses)
    print ("Execution time: {}".format(time.perf_counter() - start))
    return result
    

In [255]:
# Bind `results` up front so the name exists even when the guarded call below
# is skipped or fails.
results = None
# Run the warehouse-backed evaluation only when this code executes as the
# main module.
if __name__ == '__main__':
    results = big_test()

Number of episodes: 1
Number of dx codes: 15
Execution time: 0.01575300000001789


In [256]:
# Display the full object returned by big_test().
results

{'model_info': {'model_name': 'V2216_79_O2',
  'model_description': 'CMS-HCC 2017 Initial Model, 79 HCC Variables',
  'model_segments': {'CNA': 'Community NonDual Aged',
   'CND': 'Community NonDual Disabled',
   'CFA': 'Community Full Benefit Dual Aged',
   'CFD': 'Community Full Benefit Dual Disabled',
   'CPA': 'Community Partial Benefit Dual Aged',
   'CPD': 'Community Partial Benefit Dual Disabled',
   'INS': 'Long Term Institutional',
   'NE': 'New Enrollees',
   'SNPNE': 'SNP New Enrollees'},
  'model_coefficients': {'description': 'coefficients for 9 regression models developed using CY2013/2014 data and CMS denominator 9,185.29 (3/6/2016)',
   'cms_denominator': 9185.29}},
 'patients': [{'pt_id': '09791223',
   'demographic_data': {'dob': '1944-06-06',
    'sex': 1,
    'ltimcaid': 0,
    'nemcaid': 0,
    'orec': 0,
    'age': 74},
   'diagnoses_to_hccs': [{'diag_code': 'C50919',
     'diag_type': 0,
     'cc': 12,
     'assign_type': 'primary',
     'hcc': 12,
     'cc_descr

In [183]:
# Collect, per patient, the distinct HCC numbers assigned by the open-source
# model. Rows are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
hcc_records = []
for p in results['patients']:
    diags = p['diagnoses_to_hccs']
    diags_df = pd.DataFrame(diags)
    hcc_list = []
    if 'hcc' in diags_df.columns:
        # dropna(): diagnoses without an 'hcc' entry become NaN in the frame
        # and cannot be converted to int. int() works for numpy and plain
        # Python numbers alike.
        hcc_list = [int(item) for item in diags_df['hcc'].dropna().unique()]
    hcc_records.append({'episode_id': p['pt_id'], 'opensource_hcc_list': hcc_list})

# Explicit columns keep the expected schema even when there are no patients.
opensource_hcc = pd.DataFrame(hcc_records, columns=['episode_id', 'opensource_hcc_list'])

In [193]:
# Load data_science.bcop_episode_hcc_risk into `clarify_hcc`.
conn_string = os.environ.get("DW_DEV_CONN")
if not conn_string:
    raise RuntimeError(
        "Environment variable DW_DEV_CONN is not set; cannot connect to the warehouse")
conn = psycopg2.connect(conn_string)
try:
    with conn.cursor() as cur:
        cur.execute("SET search_path TO data_science")
    sql = """SELECT * FROM data_science.bcop_episode_hcc_risk"""
    clarify_hcc = sqlio.read_sql_query(sql, conn)
finally:
    # Always release the connection, even if the query fails.
    conn.close()

In [194]:
# Every column after the first (presumably episode_id) is treated as an HCC
# flag column named 'hcc<N>'. The derived list column is excluded so that
# re-running this cell does not feed its own output back in as a flag column.
cols = clarify_hcc.columns[1:].drop('clarify_hcc_list', errors='ignore')
bool_df = clarify_hcc[cols] > 0

# Parse the HCC number out of each column name once, not once per row.
hcc_numbers = [int(item.replace('hcc', '')) for item in cols]
clarify_hcc['clarify_hcc_list'] = pd.Series(
    [[number for number, flagged in zip(hcc_numbers, row) if flagged]
     for row in bool_df.to_numpy()],
    index=clarify_hcc.index,
    dtype=object,
)
clarify_hcc[['episode_id','clarify_hcc_list']].head(2)

Unnamed: 0,episode_id,clarify_hcc_list
0,7011705,[12]
1,7117267,[]


In [188]:
# Print the (rows, columns) shape of each frame on its own line.
print(opensource_hcc.shape, clarify_hcc.shape, sep='\n')

(3128, 2)
(3128, 81)


In [250]:
# Join the two HCC lists per episode and keep the episodes where they disagree.
compare = pd.merge(
    opensource_hcc,
    clarify_hcc[['episode_id', 'clarify_hcc_list']],
    on='episode_id',
)

# HCC 0 entries in the open-source list are ignored; order is irrelevant.
# A mask is built and applied once: DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every call.
mismatch = pd.Series(
    [sorted(cfy) != sorted(item for item in ops if item != 0)
     for cfy, ops in zip(compare['clarify_hcc_list'], compare['opensource_hcc_list'])],
    index=compare.index,
    dtype=bool,
)
diffs = compare.loc[mismatch]
print ("Number of differences: {}".format(len(diffs)))

In [253]:
# Display the episodes whose two HCC lists differ.
diffs

Unnamed: 0,clarify_hcc_list,episode_id,opensource_hcc_list
73,"[18, 40, 58]",7399433,"[18, 40, 0, 12, 58]"
93,"[18, 58, 108, 111]",11966837,"[108, 111, 0, 18, 12, 58]"
139,[],8172857,[176]
178,[],9791223,[12]
190,[],8906311,[12]
226,[23],9329648,"[12, 23]"
312,[],8861857,[12]
366,[],13448854,[12]
464,[78],9171220,"[78, 12]"
484,[],9971925,[12]


#### examples from bcop data of differences
* The Clarify module appears to be missing many HCC 12 assignments, which are linked to the ICD-10 code C50919
* Episode ID with a cancellation: 12272518 (diabetes with chronic complications cancels diabetes without complications)