In [3]:
import pandas as pd 
import numpy as np

In [4]:
basefile = pd.read_csv("data/2020_Claims_SingleSource_v1.4.csv",dtype=str)
newfile = pd.read_csv("data/2021_Claims_SingleSource_v1.3.csv",dtype=str)

basefile.columns = basefile.columns.str.replace(' ', '_')
newfile.columns = newfile.columns.str.replace(' ', '_')
# Version to capture 
basefile["VERSION"] = "BASE"
newfile["VERSION"] = "NEW"
# Const to capture the whole file
basefile["CONST"]="CONST"
newfile["CONST"]="CONST"

### Count comparison

In [5]:
basefile.count()

Measure_ID           19434
DATA_ELEMENT_NAME    19434
CODING_SYSTEM        19434
CODE                 19434
MODIFIER               883
PLACE_OF_SERVICE       724
AGE                  19434
GENDER               19434
VERSION              19434
CONST                19434
dtype: int64

In [6]:
newfile.count()

Measure_ID           21380
DATA_ELEMENT_NAME    21380
CODING_SYSTEM        21380
CODE                 21380
MODIFIER               370
PLACE_OF_SERVICE       355
AGE                  21380
GENDER               21380
VERSION              21380
CONST                21380
dtype: int64

### Joining the two files 

In [7]:
key=newfile.columns.values.tolist()

In [8]:
key.remove("VERSION")

In [9]:
joint=newfile.merge(basefile,on=key,how="outer",suffixes=["_new","_base"])

In [10]:
def getJoinMeta(row):
    val = "NONE"
    if (row["VERSION_new"] == "NEW") & (row ["VERSION_base"] == "BASE"):
        val = "COMMON"
    elif row["VERSION_new"] == "NEW":
        val = "NEW"
    else: 
        val = "BASE" 
    return val

In [11]:
joint["VERSION"]= joint.apply(getJoinMeta,axis=1)
joint=joint.drop(["VERSION_new","VERSION_base"],axis=1)

In [12]:
joint[joint.VERSION=="NONE"]

Unnamed: 0,Measure_ID,DATA_ELEMENT_NAME,CODING_SYSTEM,CODE,MODIFIER,PLACE_OF_SERVICE,AGE,GENDER,CONST,VERSION


In [13]:
joint

Unnamed: 0,Measure_ID,DATA_ELEMENT_NAME,CODING_SYSTEM,CODE,MODIFIER,PLACE_OF_SERVICE,AGE,GENDER,CONST,VERSION
0,1,ENCOUNTER_CODE,C4,99202,,,18 - 75,"M, F",CONST,COMMON
1,1,ENCOUNTER_CODE,C4,99203,,,18 - 75,"M, F",CONST,COMMON
2,1,ENCOUNTER_CODE,C4,99204,,,18 - 75,"M, F",CONST,COMMON
3,1,ENCOUNTER_CODE,C4,99205,,,18 - 75,"M, F",CONST,COMMON
4,1,ENCOUNTER_CODE,C4,99212,,,18 - 75,"M, F",CONST,COMMON
...,...,...,...,...,...,...,...,...,...,...
25968,437,G_CODE_PD,HCPCS,G9641,,,≥0,"M, F",CONST,BASE
25969,437,G_CODE_PN,HCPCS,G9641,,,≥0,"M, F",CONST,BASE
25970,437,G_CODE_PD_Exl,HCPCS,G9640,,,≥0,"M, F",CONST,BASE
25971,437,G_CODE_PD,HCPCS,G9639,,,≥0,"M, F",CONST,BASE


In [14]:
onlybase=joint[joint.VERSION=="BASE"]
len(onlybase)

4560

In [15]:
onlynew=joint[joint.VERSION=="NEW"]
len(onlynew)

6513

In [16]:
intersection=joint[joint.VERSION=="COMMON"]

In [17]:
len(intersection)

14900

# New against Base comparison

## Defining functions

In [18]:
def columnToSet(df,column:str):
    return set(df[column].unique().tolist())

In [19]:
def analyze_difference(joint,partcolumn,partid,subcol,debug=False):
    subset=joint[joint[partcolumn]==partid]
    basedf=subset[(subset.VERSION=="BASE") | (subset.VERSION=="COMMON")]
    newdf=subset[(subset.VERSION=="NEW")|(subset.VERSION=="COMMON")]
    
    newids=columnToSet(newdf,subcol)
    baseids=columnToSet(basedf,subcol)
    
    added=newids-baseids
    removed=baseids-newids
    
    #get all the changed IDs 
    changedids=columnToSet(subset[subset.VERSION!="COMMON"],subcol)
    
    
    changed=changedids-added-removed
    
    if debug==True:
        print("Added")
        print(added)
        print("Removed")
        print(removed)
        print("Changed")
        print(changed)
    
    return (subset,added|removed|changed) 
    

In [20]:
(df,changed)=analyze_difference(joint,"Measure_ID","93","CODE")

### Iterative process

In [21]:
(df,changed)=analyze_difference(joint,"CONST","CONST","Measure_ID")

In [22]:
(dfL1,changed)=analyze_difference(df,"Measure_ID","93","DATA_ELEMENT_NAME")

In [23]:
(dfL2,changed)=analyze_difference(dfL1,"DATA_ELEMENT_NAME","CPT_II_PD_Exe","CODE")

In [24]:
dfL2[dfL2.CODE=="4131F"]

Unnamed: 0,Measure_ID,DATA_ELEMENT_NAME,CODING_SYSTEM,CODE,MODIFIER,PLACE_OF_SERVICE,AGE,GENDER,CONST,VERSION
9983,93,CPT_II_PD_Exe,CPT_II,4131F,1P,,≥2,"M, F",CONST,NEW
23101,93,CPT_II_PD_Exe,CPT_II,4131F,"1P, ≠ 2P, 3P, 8P",,≥2,"M, F",CONST,BASE


### Recursion

In [25]:
(df,changed_mid)=analyze_difference(joint,"CONST","CONST","Measure_ID")

In [26]:
mid={}
changed_den={}
for i in changed_mid:
    print("===for Measure ID "+i+"=== ")
    (mid[i],changed_den[i])=analyze_difference(df,"Measure_ID",i,"DATA_ELEMENT_NAME",True)
    

===for Measure ID 50=== 
Added
set()
Removed
set()
Changed
{'CPT_II_PD', 'CPT_II_PN_X', 'CPT_II_PN', 'ENCOUNTER_CODE'}
===for Measure ID 419=== 
Added
set()
Removed
{'G_CODE_5_PN_X', 'DX_CODE', 'G_CODE_2_PN', 'G_CODE_3_PD_Exe', 'G_CODE_5_PD', 'ENCOUNTER_CODE', 'G_CODE_2_PD', 'G_CODE_4_PD_Exe', 'G_CODE_1_PD_Exl'}
Changed
set()
===for Measure ID 39=== 
Added
{'DX_CODE'}
Removed
{'DX_CODE_X'}
Changed
{'ENCOUNTER_CODE'}
===for Measure ID 128=== 
Added
set()
Removed
set()
Changed
{'G_CODE_PD_Exe', 'ENCOUNTER_CODE'}
===for Measure ID 21=== 
Added
set()
Removed
set()
Changed
{'PROC_CODE'}
===for Measure ID 317=== 
Added
set()
Removed
set()
Changed
{'ENCOUNTER_CODE'}
===for Measure ID 1=== 
Added
{'PROC_CODE_PD_Exl', 'DX_CODE_PD_Exl'}
Removed
{'G_CODE_PN_X'}
Changed
{'CPT_II_PN_X', 'ENCOUNTER_CODE', 'CPT_II_PD', 'CPT_II_PN', 'G_CODE_PD_Exl'}
===for Measure ID 47=== 
Added
set()
Removed
set()
Changed
{'CPT_II_PD', 'CPT_II_PN_X', 'CPT_II_PN', 'ENCOUNTER_CODE'}
===for Measure ID 155=== 
Added
set