In [1]:
!git clone https://github.com/gulraizchoudhary/predict_next_diagnosis_using_markovchain.git

Cloning into 'predict_next_diagnosis_using_markovchain'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 22 (delta 5), reused 13 (delta 2), pack-reused 0[K
Unpacking objects: 100% (22/22), done.


In [2]:
def confusionMatrix(total_classes, gt, predicted):
    total = total_classes-len(gt)
    TP = len(list(set(predicted).intersection(set(gt))))
    FP = len(predicted)-TP
    FN = len(gt)-TP
    TN = total_classes-(TP+FP+FN)
    Accuracy = (TP+TN)/total
    MissclassificationRate = (FP+FN)/total
    TPR = TP/(FN+TP) #Sensitivity, recall
    FPR = FP/(TN+FP) 
    TNR= TN/(TN+FP) #Specificity, selectivity
    FNR= FN/(FN+TP) #miss rate
    Precision=0
    Fscore =0
    LikelihoodRatio=0
    
    if (FP+ TP) !=0:
        Precision = TP/(FP+ TP)
    
    AUC = (TP+(0.5*FP))/len(predicted)
    
    if (Precision+TPR) !=0:
        Fscore = 2*((Precision*TPR)/(Precision+TPR))
    
    if (1-TPR) !=0:
        LikelihoodRatio = Precision/(1-TPR)
    
    return TP, FP, FN, TN, Accuracy, MissclassificationRate, TPR, FPR, FNR, TNR, Precision, AUC, Fscore, LikelihoodRatio


def getCM(predicted, t_classes):
    cm =[]
    for list in predicted:
        if len(list[1])>0:
            cm.append(confusionMatrix(t_classes, list[0], list[1]))
    
    return cm

def printStat(cm):
    confusion_map=[list(i) for i in zip(*cm)]
    onfusion_mean =[np.mean(k) for k in confusion_map]
    onfusion_std =[np.std(k) for k in confusion_map]
    print("TP: "+str(onfusion_mean[0]))
    print("FP: "+str(onfusion_mean[1]))
    print("FN: "+str(onfusion_mean[2]))
    print("TN: "+str(onfusion_mean[3]))
    print("Accuracy: "+str(onfusion_mean[4]))
    print("MissclassificationRate: "+str(onfusion_mean[5]))
    print("TPR: "+str(onfusion_mean[6]))
    print("FPR: "+str(onfusion_mean[7]))
    print("FNR: "+str(onfusion_mean[8]))
    print("TNR: "+str(onfusion_mean[9]))
    print("Precision: "+str(onfusion_mean[10]))
    print("AUC: "+str(onfusion_mean[11]))
    print("AUC STD: "+str(onfusion_std[11]))
    print("Fscore: "+str(onfusion_mean[12]))
    print("LikelihoodRatio: "+str(onfusion_mean[13]))
    

In [3]:

import numpy as np
from sklearn.model_selection import train_test_split


# Markov chain stored as adjacency list.
markov = {}

def update_markov(current : str, next : str) -> None:
    """Add item to the markov.
    Args:
        current (three digit ICD-10 codes is str): Input ICD-10 code.
        next (ICD-10 code as str): Output ICD-10 code.
    """

    # Add the input ICD-10 code to the lexicon if it in there yet.
    if current not in markov:
        markov.update({current: {next: 1} })
        return

    # Retrieve the probabilties of the input ICD-10 code.
    options = markov[current]

    # Check if the output ICD-10 codes that is in the propability list.
    if next not in options:
        options.update({next : 1})
    else:
        options.update({next : options[next] + 1})

    # Update the markov
    markov[current] = options

def normalize() -> None:
    """normalize the frequencies to a 0-1 float"""
    for code, transition in markov.items():
        transition = dict((key, value / sum(transition.values())) for key, value in transition.items())
        markov[code] = transition
    

def predict(code : str) -> str:
    """Attempt to predict the next ICD-10 code in the markov chain.
    Args:
        ICD-10 code (str): Last ICD-10 known code from patient history.
    Returns:
        str: Next ICD-10 code.
        None: current ICD-10 is not in the markov chain.
    """
    if code not in markov:
        return None

    options = markov[code]
    print("markov options")
    print(options)
    return np.random.choice(list(options.keys()), p=list(options.values()))
    

def train_markov(train):
    """update the markov chain using train"""
    
    for line in train:
        # Update markov chain.
        code = line.strip().split(' ')
        for i in range(len(code) - 1):
            update_markov(code[i], code[i+1])

def predict_next(test):
    """Predict the next ICD-10 code from a given"""
    pred = []
    gt = []
    for line in test:
        codes = line.strip().split(' ')
        print(codes)
        #Divide the sequence into two equals portion as a training and testing
        t_head, t_tail = codes[:len(codes)//2], codes[len(codes)//2:]
        print(t_head[-1])
        print("tail")
        print(t_tail)
    
        # Select the last ICD-10 code from the training sequence of the given set.
        p_code = predict(t_head[-1])
        print("p_code ".format(p_code))
        
        #book keeping the total number of the classes
        gt.append(t_tail[0])
        gt.append(p_code)
        
        pred.append((str(p_code),t_head[-1]))
        
    
    return pred, len(set(gt))


def load_dataset():
    """1- Load the dataset from the file
       2- Split dataset into train and test
    """
    data = tuple(open("./predict_next_diagnosis_using_markovchain/dataset.txt", 'r'))
    
    #split data into training and testing    
    train, test = train_test_split(data,test_size=0.2)
    
    return train, test


In [4]:
#Split data into train and test
train, test = load_dataset()

In [5]:
#update the markov model
train_markov(train)



In [6]:
#Normalize the markov model
normalize()



In [7]:
try:
  #predict only the next diagnosis
  predicted, classes  =  predict_next(test)
  cm = getCM(predicted, classes)
  printStat(cm)
except (KeyboardInterrupt, EOFError):
    pass

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['K00', 'K55']
markov options
{'K00': 0.25573549257759787, 'M70': 0.08906882591093117, 'M15': 0.09581646423751687, 'K55': 0.058704453441295545, 'M50': 0.10728744939271255, 'J30': 0.011470985155195682, 'S00': 0.012145748987854251, 'J40': 0.150472334682861, 'N30': 0.01349527665317139, 'M20': 0.03171390013495277, 'M45': 0.018893387314439947, 'S20': 0.0020242914979757085, 'L80': 0.010121457489878543, 'S70': 0.002699055330634278, 'N80': 0.002699055330634278, 'L20': 0.016194331983805668, 'N40': 0.004723346828609987, 'K20': 0.03643724696356275, 'N10': 0.004048582995951417, 'S40': 0.002699055330634278, 'M05': 0.017543859649122806, 'N60': 0.0006747638326585695, 'M65': 0.001349527665317139, 'K40': 0.004048582995951417, 'S30': 0.0006747638326585695, 'L00': 0.019568151147098516, 'N70': 0.0006747638326585695, 'K80': 0.001349527665317139, 'K90': 0.001349527665317139, 'S60': 0.005398110661268556, 'S80': 0.007422402159244264, 'S50': 0.00

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
p_code 
['E00', 'E10', 'I30']
E00
tail
['E10', 'I30']
markov options
{'E10': 0.06593406593406594, 'K00': 0.14285714285714285, 'M50': 0.03296703296703297, 'M15': 0.04395604395604396, 'L20': 0.02197802197802198, 'J00': 0.06593406593406594, 'J20': 0.01098901098901099, 'E70': 0.02197802197802198, 'K55': 0.03296703296703297, 'G40': 0.03296703296703297, 'K20': 0.03296703296703297, 'J09': 0.01098901098901099, 'S80': 0.01098901098901099, 'H00': 0.02197802197802198, 'L00': 0.01098901098901099, 'I10': 0.08791208791208792, 'I30': 0.054945054945054944, 'H90': 0.054945054945054944, 'J30': 0.01098901098901099, 'I80': 0.03296703296703297, 'J40': 0.01098901098901099, 'H40': 0.01098901098901099, 'S00': 0.01098901098901099, 'I20': 0.054945054945054944, 'K40': 0.01098901098901099, 'G50': 0.01098901098901099, 'S60': 0.01098901098901099, 'M20': 0.01098901098901099, 'N30': 0.01098901098901099, 'H25': 0.01098901098901099, 'F30': 0.0109890109890

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
predict("H90")

markov options
{'I10': 0.12079701120797011, 'J00': 0.07098381070983811, 'I30': 0.15753424657534246, 'M15': 0.0547945205479452, 'K00': 0.1307596513075965, 'I60': 0.016811955168119553, 'J20': 0.024283935242839352, 'N30': 0.00933997509339975, 'M70': 0.0323785803237858, 'M50': 0.045454545454545456, 'I20': 0.10149439601494396, 'K55': 0.024906600249066, 'S20': 0.0018679950186799503, 'M05': 0.011830635118306352, 'K80': 0.0018679950186799503, 'K50': 0.0012453300124533001, 'M45': 0.00684931506849315, 'J09': 0.021170610211706103, 'M30': 0.0006226650062266501, 'N17': 0.0006226650062266501, 'S60': 0.0018679950186799503, 'L00': 0.007471980074719801, 'I80': 0.0361145703611457, 'K40': 0.0043586550435865505, 'L80': 0.0043586550435865505, 'S00': 0.0112079701120797, 'J40': 0.021793275217932753, 'M20': 0.013075965130759652, 'S80': 0.0024906600249066002, 'K20': 0.009962640099626401, 'I95': 0.0012453300124533001, 'I70': 0.008717310087173101, 'L60': 0.00311332503113325, 'M65': 0.0012453300124533001, 'J30': 

'I20'

In [9]:
!pip install icd10-cm


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting icd10-cm
  Downloading icd10_cm-0.0.4-py2.py3-none-any.whl (675 kB)
[K     |████████████████████████████████| 675 kB 4.5 MB/s 
[?25hInstalling collected packages: icd10-cm
Successfully installed icd10-cm-0.0.4


In [10]:
data = tuple(open("./predict_next_diagnosis_using_markovchain/dataset.txt", 'r'))

In [11]:
data[0]

'H90 I10 K00 M15 M50 M70\n'

In [12]:
import icd10
code = icd10.find("H90")
print(code.description)         # Acute bronchitis due to Mycoplasma pneumoniae
if code.billable:
    print(code, "is billable")  # J20.0 is billable

print(code.chapter)             # X
print(code.block)               # J00-J99
print(code.block_description)   # Diseases of the respiratory system

Conductive and sensorineural hearing loss
VIII
H60-H95
Diseases of the ear and mastoid process


In [13]:
for line in data:
  for code in line.strip().split(' '):
    print(icd10.find(code).description)
    print('\t')
  print('\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Type 1 diabetes mellitus
	
Acute nasopharyngitis [common cold]
	
Acute bronchitis
	
Bronchitis, not specified as acute or chronic
	
Cervical disc disorders
	
Soft tissue disorders related to use, overuse and pressure
	


Type 1 diabetes mellitus
	
Acute nasopharyngitis [common cold]
	
Bronchitis, not specified as acute or chronic
	
Disorders of tooth development and eruption
	
Polyosteoarthritis
	
Soft tissue disorders related to use, overuse and pressure
	


Type 1 diabetes mellitus
	
Acute nasopharyngitis [common cold]
	
Bronchitis, not specified as acute or chronic
	
Disorders of tooth development and eruption
	
Cervical disc disorders
	
Soft tissue disorders related to use, overuse and pressure
	


Type 1 diabetes mellitus
	
Acute nasopharyngitis [common cold]
	
Bronchitis, not specified as acute or chronic
	
Polyosteoarthritis
	
Cervical disc disorders
	
Soft tissue disorders related to use, overuse and pressure
	


AttributeError: ignored