In [82]:
import pydicom as pydcm
import matplotlib.pyplot as plt
import pandas as pd
import sys
from pydicom.datadict import dictionary_VR
from pydicom import dcmread,Dataset
from pydicom.filewriter import dcmwrite
import numpy as np
#import cv2
import random
import time
import datetime


from google.cloud import storage
from io import BytesIO
# Storage client reused by the download and upload cells further down.
client = storage.Client()

In [83]:
def blob_to_image(bucket, blob_name):
    """Download one blob from a bucket and parse it as a DICOM dataset.

    Parameters
    ----------
    bucket : object exposing ``get_blob(name)`` (a GCS bucket in this notebook)
    blob_name : str
        Name of the blob to fetch.

    Returns
    -------
    The dataset produced by ``dcmread`` from the blob's bytes.

    Raises
    ------
    FileNotFoundError
        If ``bucket.get_blob`` returns ``None`` for ``blob_name``.
    """
    blob = bucket.get_blob(blob_name)
    if blob is None:
        # Fail with the blob name instead of an AttributeError on None.
        raise FileNotFoundError(f"blob {blob_name!r} was not found in the bucket")
    # The bytes are parsed from memory; nothing is written to local disk.
    return dcmread(BytesIO(blob.download_as_string()))

def img_diff(raw, deid):
    """Return how many entries of ``raw.pixel_array - deid.pixel_array`` are non-zero."""
    delta = np.subtract(raw.pixel_array, deid.pixel_array)
    return np.count_nonzero(delta)

def find(list,thing):
    """Return every index at which `thing` occurs in `list` (compared with ==)."""
    positions = []
    for index, item in enumerate(list):
        if item == thing:
            positions.append(index)
    return positions


def blob_to_df(filelist, bucket):
    """Load every blob named in `filelist` and tabulate the datasets with their IDs.

    Each name is fetched with ``blob_to_image(bucket, name)``. The returned
    DataFrame has one row per name and the columns 'dicom' (the dataset),
    'patientID', 'studyID', 'seriesID' and 'SOPID', in that order.
    """
    columns = {'dicom': [], 'patientID': [], 'studyID': [], 'seriesID': [], 'SOPID': []}
    for blob_name in filelist:
        dataset = blob_to_image(bucket, blob_name)
        columns['dicom'].append(dataset)
        columns['patientID'].append(dataset.PatientID)
        columns['studyID'].append(dataset.StudyInstanceUID)
        columns['seriesID'].append(dataset.SeriesInstanceUID)
        columns['SOPID'].append(dataset.SOPInstanceUID)
    return pd.DataFrame(columns)


In [97]:
# Source bucket holding the files to post-process.
bucket_in = client.get_bucket('midi-tests')

# Imports DICOM files that have already been through GCP de-id.
# That GCP de-id script ignores patient ID and doesn't shift dates.
blobs_in = bucket_in.list_blobs(prefix='runtests/ds1_1-23-7-21-sc7-20')
filelist_in = [item.name for item in blobs_in] # extracting file names from blobs

# The first listed name is skipped -- presumably the folder placeholder; TODO confirm.
# Row k of df therefore corresponds to filelist_in[k+1].
df = blob_to_df(filelist_in[1:], bucket_in)
#df_in = blob_to_df(filelist_in[1:], bucket_in)

In [100]:
#keep list, copied from GCP script keep list
#every entry is written as '(gggg, eeee)': lowercase hex, one space after the comma
keeplist = ['(0009, 1008)','(7005, 1008)','(7005, 100b)','(7005, 100e)','(7005, 100f)','(7005, 1012)','(7005, 1017)','(7005, 1018)','(7005, 1019)','(7005, 101a)','(7005, 101b)',
               '(7005, 101e)','(7005, 1020)','(7005, 1030)','(0019, 0014)','(0019, 10a3)','(0027, 1033)','(0043, 1035)','(0043, 1036)','(0043, 1037)','(0010, 0040)','(0010, 2203)','(0010, 1020)',
               '(0010, 1030)','(0010, 21c0)','(0010, 21a0)','(0008, 0018)','(0020, 000d)','(0020, 000e)','(0028, 1054)','(2010, 0010)','(0008, 0070)','(0010, 1010)','(0008, 0016)','(0008, 1150)',
               '(0002, 0002)','(0002, 0010)',
               '(0002, 0003)']

In [88]:
#function for finding dates
#for each regex date format, checks if the input text is a date, or if a subset of the input text is a date
#if it's a subset, the first character in the input string needs to be the first character of the date. So 202305105555 would be a date match, but 555520230510 wouldn't
#another thing to note is that months of format m and mm, and dates of format d and dd are matched for (so both 01 and 1 are a valid month format)
#if there is a match to the format, the month day and year are taken from the date and they are changed to match m d (so 1 instead of 01) and yyyy if they don't already
#These values and the end index of the date in the string are returned
#If there is no match, a match is attempted using the next format. This continues until either a match is found, or each format has been tried without a match, and then None is returned
# Regex fragments shared by every date layout tried in finddates().
_FD_MONTH = r'(0[1-9]|1[0-2]|[1-9])'
_FD_DAY = r'(0[1-9]|1[0-9]|2[0-9]|3[0-1]|[1-9])'
_FD_YEAR4 = r'(19[0-9][0-9]|20[0-2][0-9])'   # accepts 1900-2029; widen when needed
_FD_YEAR2 = r'([0-9][0-9])'
_FD_SEP = r'(\.|/| |-)'

# (pattern, year group, month group, day group, two-digit year?) in the order tried.
# The order is significant: the first layout that matches wins.
# TODO: yyyymmdd should probably be checked before the other undelimited layouts.
_FD_LAYOUTS = (
    (_FD_MONTH + _FD_DAY + _FD_YEAR4, 3, 1, 2, False),                        # mmddyyyy
    (_FD_DAY + _FD_MONTH + _FD_YEAR4, 3, 2, 1, False),                        # ddmmyyyy
    (_FD_YEAR4 + _FD_MONTH + _FD_DAY, 1, 2, 3, False),                        # yyyymmdd
    (_FD_YEAR4 + _FD_DAY + _FD_MONTH, 1, 3, 2, False),                        # yyyyddmm
    (_FD_MONTH + _FD_SEP + _FD_DAY + _FD_SEP + _FD_YEAR4, 5, 1, 3, False),    # mm_dd_yyyy
    (_FD_DAY + _FD_SEP + _FD_MONTH + _FD_SEP + _FD_YEAR4, 5, 3, 1, False),    # dd_mm_yyyy
    (_FD_YEAR4 + _FD_SEP + _FD_MONTH + _FD_SEP + _FD_DAY, 1, 3, 5, False),    # yyyy_mm_dd
    (_FD_YEAR4 + _FD_SEP + _FD_DAY + _FD_SEP + _FD_MONTH, 1, 5, 3, False),    # yyyy_dd_mm
    # With delimiters a two-digit year is also accepted (e.g. 10-23-98).
    (_FD_YEAR2 + _FD_SEP + _FD_MONTH + _FD_SEP + _FD_DAY, 1, 3, 5, True),     # y2_mm_dd
    (_FD_DAY + _FD_SEP + _FD_MONTH + _FD_SEP + _FD_YEAR2, 5, 3, 1, True),     # dd_mm_y2
    (_FD_YEAR2 + _FD_SEP + _FD_DAY + _FD_SEP + _FD_MONTH, 1, 5, 3, True),     # y2_dd_mm
    (_FD_MONTH + _FD_SEP + _FD_DAY + _FD_SEP + _FD_YEAR2, 5, 1, 3, True),     # mm_dd_y2
)


def finddates(text):
    """Find a date at the very start of `text`.

    Each layout in ``_FD_LAYOUTS`` is tried in order with ``re.match``, so the
    date must begin at the first character of `text` ('202305105555' matches,
    '555520230510' does not). Months and days may be written with or without a
    leading zero.

    Parameters
    ----------
    text : str

    Returns
    -------
    list or None
        ``[year, month, day, end]`` for the first layout that matches, where
        year is a 4-digit string, month and day are strings without a leading
        zero, and end is the index just past the date inside `text`.
        ``None`` when no layout matches.

    Notes
    -----
    A two-digit year is read as 20yy unless that lies after the current year,
    in which case 19yy is used (15 -> 2015, 45 -> 1945). No calendar check is
    done: a string such as '02/31/2020' is still reported as a match.
    """
    # Imported locally so the function does not depend on which notebook cell
    # happened to import `re` / `datetime` first.
    import re
    from datetime import date

    this_year = date.today().year

    for pattern, year_group, month_group, day_group, short_year in _FD_LAYOUTS:
        match = re.match(pattern, text)
        if match is None:
            continue

        month = match.group(month_group)
        day = match.group(day_group)
        # Normalise '01' -> '1'; compare against the character '0', not the int 0.
        if month.startswith('0'):
            month = month[1:]
        if day.startswith('0'):
            day = day[1:]

        year = match.group(year_group)
        if short_year:
            century = '20' if int('20' + year) <= this_year else '19'
            year = century + year

        return [year, month, day, match.end()]

    return None
    

    
    
    

In [1]:
import datefinder
from datetime import datetime, timedelta

In [99]:
# Whitelist of common radiology acronyms. The 'at XXX' / 'by XX' redaction rules
# leave a match alone when its captured acronym is in this list.
# Will probably need to be expanded.
whitelist = ['CT', 'MRI', 'MR', 'CTA', 'PA', 'AP','CXR','IV','LLQ','LUQ','PET','RLQ','RUQ','US','NM','RT']

In [108]:
import re
import random



file = 0
#vrs to look at
vrs = ['LO', 'SH', 'LT', 'ST', 'UT','IS']

#element names whose text is never scrubbed
skip_names = {'Patient ID', 'Code Value', 'Private Creator', 'Actual Frame Duration',
              'Primary (Prompts) Counts Accumulated'}

#minimum valid date is 1900-01-02
#(mindt / maxdt are not referenced in this cell; kept in case other cells use them)
mindt = datetime(1900,1,2)
maxdt = datetime.now()

#parallel audit lists, only used to build a df for double checking the results
orgtextlist = []
newtextlist = []
orglist = []
newlist = []
taglist = []
ilist = []

#regex rules. Finds 'at ' followed by 2-4 capital letters, and 'by ' followed by 2 capital letters
loc = re.compile('at ([A-Z]{2,4})')
dr = re.compile('by ([A-Z]{2})')


def _log_change(org, new, tag, org_text, new_text, idx):
    """Append one row to all six audit lists so that they always stay the same length."""
    orglist.append(org)
    newlist.append(new)
    taglist.append(tag)
    orgtextlist.append(org_text)
    newtextlist.append(new_text)
    ilist.append(idx)


def _redact_acronyms(pattern, text):
    """Replace each match of `pattern` whose captured acronym is not whitelisted with '1'.

    Every match is checked on its own, so a whitelisted acronym is kept even
    when another match in the same text is redacted (and vice versa).
    Returns ``(new_text, removed)`` where `removed` lists the replaced matches.
    """
    removed = []

    def _swap(match):
        if match.group(1) in whitelist:
            return match.group(0)
        removed.append(match.group(0))
        return '1'

    return pattern.sub(_swap, text), removed


def _shift_date_element(elem, text, fmt, days, idx):
    """Shift every date datefinder finds in `text` back by `days` and store it in `elem`.

    `fmt` is the strftime layout of the replacement ('%Y%m%d' for DA,
    '%Y%m%d%H%M%S' for DT), which keeps every field zero-padded.
    The element is only written back when at least one date was shifted.
    """
    original = str(elem.value)
    #whitespace is added around the value bc otherwise datefinder wont find the date
    padded = ' ' + text + ' '
    shifted_any = False
    found_dates = datefinder.find_dates(padded, source=True, index=False, strict=False, base_date=None)
    #loops through found dates (should just be 1 date)
    for found, source in found_dates:
        shifted = (found - timedelta(days=days)).strftime(fmt)
        padded = padded.replace(source, shifted)
        shifted_any = True
        _log_change(original, padded[1:-1], elem.name, source, shifted, idx)
    if shifted_any:
        #drop the two padding characters again
        elem.value = padded[1:-1]


#loops through each dicom file
for f in range(len(df)):
    #column 0 holds the dataset, column 3 the series id
    file = df.iloc[f, 0]

    #creates random int between 1 and 100 using the series id as a seed to use as dateshift
    random.seed(df.iloc[f, 3])
    randdelta = random.randint(1,100)

    #creates random number between 100000-999999 to use as patient id using the current patient ID as a seed
    #might need to change this method to make it more secure
    random.seed(file.PatientID)
    newpid = random.randint(100000,999999)
    file.PatientID = str(newpid)

    #loops through each element in the dicom file
    for elem in file.iterall():

        if elem.VR == 'DA':
            _shift_date_element(elem, str(elem.value), '%Y%m%d', randdelta, f)

        elif elem.VR == 'DT':
            str1 = str(elem.value)
            #anything from the first '.' on (fractional seconds) is dropped before parsing
            if '.' in str1:
                str1 = str1[:str1.index('.')]
            _shift_date_element(elem, str1, '%Y%m%d%H%M%S', randdelta, f)

        #ignores elements not of VR 'LO', 'SH', 'LT', 'ST', 'UT','IS' and the names in skip_names
        elif elem.VR in vrs and elem.name not in skip_names:
            #ignores tags in automatic keep list. The keep list holds '(gggg, eeee)'
            #strings, so the tag has to be compared through its string form.
            if str(elem.tag) in keeplist:
                continue

            str1 = str(elem.value)

            #location ('at ABC') and doctor initial ('by AB') rules
            for rule in (loc, dr):
                before = str1
                str1, removed = _redact_acronyms(rule, before)
                for hit in removed:
                    _log_change(before, str1, elem.name, hit, '1', f)
                if removed:
                    elem.value = str1

            #date/patient id finder
            #candidate strings start with a digit followed by 6-15 more characters from the class.
            #NOTE: ' -/' inside the class is a range (space through '/'), so it also admits
            #characters such as ',' '(' ')' and '+', not only ' ', '-', '.' and '/'.
            candidates = re.findall(r'[\d][\d -/\.]{6,15}', str1)
            for n in candidates:
                date = finddates(n)
                orgdate = None
                if date is not None:
                    try:
                        orgdate = datetime(int(date[0]), int(date[1]), int(date[2]))
                    except ValueError:
                        #regex-valid but not a real calendar date (e.g. 02/30): treat as "no date"
                        orgdate = None

                if orgdate is not None:
                    #shifts the date back 1-100 days using random number
                    newdate = orgdate - timedelta(days=randdelta)
                    #identifies original date string using the end index returned by finddates
                    orgstring = n[0:int(date[3])]
                    #new str is in format yyyy-mm-dd (hour/min/sec cut off)
                    newstr = str(newdate)[:10]
                    before = str1
                    str1 = str1.replace(orgstring, newstr)
                    if elem.VR == 'IS':
                        #an IS value that contains a date is replaced by '1' altogether
                        elem.value = '1'
                        _log_change(before, '1', elem.name, orgstring, '1', f)
                    else:
                        _log_change(before, str1, elem.name, orgstring, newstr, f)

                #if the candidate doesn't contain a date and the element isn't IS
                elif elem.VR != 'IS':
                    #replaces each digit-only run of length 6-15 in the element text with '1'
                    for orgint in re.findall('[0-9]{6,15}', str1):
                        before = str1
                        str1 = str1.replace(orgint, '1')
                        _log_change(before, str1, elem.name, orgint, '1', f)

            #after looping through the candidates, updates element with all changes made
            #only if VR isn't IS, as all IS elements with dates in them were already set to '1'
            if candidates and elem.VR != 'IS':
                elem.value = str1
                    

In [9]:
# Creates a df for checking results; not necessary for the de-id process.
# zip() stops at the shortest list, so the six audit lists must have equal
# length for every recorded change to show up here.
changed = pd.DataFrame(list(zip(orglist, newlist, taglist, orgtextlist, newtextlist, ilist)),
               columns =['org', 'new', 'tag', 'org_text', 'new_text', 'i'])
changed

NameError: name 'pd' is not defined

In [81]:
# Uploads the changed DICOM files to the midi-runs GCP bucket.
bucket = client.get_bucket('midi-runs')
# filelist_in[0] is skipped, so name number i pairs with row i-1 of df.
for i, blob_name in enumerate(filelist_in[1:], start=1):
    filepath = 'sc2023-07-20-ds1_1-23-2023-07-21/' + blob_name
    blob = bucket.blob(filepath)
    # Every dataset is written to the same local scratch file, then uploaded.
    df.iloc[i-1][0].save_as('test8.dcm')
    blob.upload_from_filename('test8.dcm')
    

In [63]:
# Scratch check of the location rule: prints the captured acronym when it is
# not whitelisted (the recorded output for this input is 'MGH').
str1 = 'ah at MGH'
loc = re.compile('at ([A-Z]{2,4})')
ex = loc.search(str1)
if ex is not None:
    if ex.group(1) not in whitelist:
        print(ex.group(1))


MGH


In [64]:
# Same acronym list as the whitelist defined earlier in the notebook (re-declared here).
whitelist = ['CT', 'MRI', 'MR', 'CTA', 'PA', 'AP','CXR','IV','LLQ','LUQ','PET','RLQ','RUQ','US','NM','RT']

In [None]:
# Scratch check: replace every match of the `loc` pattern in str1 with '*'.
str1 = 'at MGH'
strloc = loc.sub('*',str1)

In [24]:
#automatic code final
#Earlier version of the post-processing pass: masks 'at XXX' / 'by XX' and
#blacklisted site names, then reformats or removes long digit runs, while
#collecting every touched element in parallel lists for inspection.
import re

file = 0
#vrs to look at
vrs = ['LO', 'SH', 'LT', 'ST', 'UT','IS']
str1 = ''
x = []
x2 = []

#parallel lists, one entry per flagged element
patient = []
study = []
SOP = []
series = []
element = []
tag = []
value = []
vr = []
newvalue = []

#regex rules
loc = re.compile('at [A-Z]{2,4}')
dr = re.compile('by [A-Z]{2}')

#blacklist
blacklist = [' MGH', ' ALH']

for i in range(len(df)):
    file = df.iloc[i][0]
    for elem in file.iterall():
        if elem.VR in vrs and elem.name != 'Code Value' and elem.name != 'Private Creator' and elem.name != 'Actual Frame Duration' and elem.name != 'Primary (Prompts) Counts Accumulated':
            #the keep list holds '(gggg, eeee)' strings, so the tag is compared
            #through its string form (a tag object never equals those strings)
            if str(elem.tag) not in keeplist:
                str1 = str(elem.value)
                try:
                    #regex replacement
                    strloc = loc.sub('*',str1)
                    str1 = dr.sub('*',strloc)
                    for b in blacklist:
                        if b in str1:
                            str1 = str1.replace(b,'+')
                    elem.value = str1
                    #date/pid finder
                    x = re.findall('[0-9]{6,15}', str1)
                    if len(x) != 0:
                        #creating df to look into results
                        patient.append(df.iloc[i][1])
                        study.append(df.iloc[i][2])
                        series.append(df.iloc[i][3])
                        SOP.append(df.iloc[i][4])
                        element.append(str(elem))
                        tag.append(elem.name)
                        vr.append(elem.VR)
                        value.append(str(elem.value))
                        
                        #loop through each long str of integers
                        for n in x:
                            #re-read the value: earlier iterations may already have changed it
                            str1 = str(elem.value)
                            #date format change
                            if len(n) == 8 and elem.VR != 'IS':
                                #re.split with capturing groups returns the captured parts;
                                #a one-element result means the layout did not match
                                out1 = re.split('(19[0-9][0-9]|20[0-2][0-9])(0[1-9]|1[0-2])(0[1-9]|1[0-9]|2[0-9]|3[0-1])',n)
                                dt1 = ' '.join(out1)
                                
                                out2 = re.split('(0[1-9]|1[0-9]|2[0-9]|3[0-1])(0[1-9]|1[0-2])(19[0-9][0-9]|20[0-2][0-9])',dt1)
                                dt2 = ' '.join(out2)
                                
                                out3 = re.split('(0[1-9]|1[0-2])(0[1-9]|1[0-9]|2[0-9]|3[0-1])(19[0-9][0-9]|20[0-2][0-9])',dt2)
                                dt3 = ' '.join(out3)
                                #replace if the str wasn't changed
                                if len(out1) == 1 and len(out2)==1 and len(out3)==1:
                                    str2 = str1.replace(n,'')
                                    elem.value = str2
                                #replace w/ new format if the str was changed
                                else:
                                    str2 = str1.replace(n,dt3)
                                    elem.value = str2
                            else:
                                #replace non-len of 8 and IS
                                str2 = str1.replace(n,'')
                                elem.value = str2
                        #add new val to df
                        newvalue.append(str(elem.value))
                                
                except Exception as error:
                    #best effort: report the element and keep going with the next one
                    print(str(elem)+'the error:  '+str(error))
                    

In [25]:
# Spot-check: show AdditionalPatientHistory of the dataset stored in row 99 of df.
df.iloc[99][0].AdditionalPatientHistory

'Ordered for 675-36-2449 by Dr. Davis'

In [127]:
# Assemble the parallel inspection lists into one frame, one row per flagged element.
# pandas requires all lists to have the same length; NOTE(review): if the scrub
# loop hit an exception after appending to `value` but before `newvalue`,
# the lengths would differ -- confirm before relying on this cell.
d = {'patient':patient, 'study':study, 'series':series, 'SOP':SOP, 'tag':tag, 'vr':vr, 'element':element, 'value':value, 'newvalue':newvalue}
flagged = pd.DataFrame(d)

In [128]:
# Number of flagged rows per element name.
flagged.groupby(['tag'])['tag'].count()

tag
Accession Number                        353
Acquisition Comments                    245
Additional Patient History              498
Attenuation Correction Method           118
Clinical Trial Subject ID               263
Image Comments                           12
Imaging Service Request Comments          1
Medical Alerts                           95
Patient Comments                        495
Patient ID                              721
Private tag data                        200
Protocol Name                           190
Requested Procedure Comments            235
Requested Procedure Description         374
Scheduled Procedure Step Description     12
Series Description                      295
Series Number                           590
Study Comments                          174
Study Description                       326
Study ID                                407
Text Value                                2
Name: tag, dtype: int64

In [129]:
# Flagged rows whose element name is 'Accession Number' (compare `value` with `newvalue`).
flagged[flagged['tag']=='Accession Number']

Unnamed: 0,patient,study,series,SOP,tag,vr,element,value,newvalue
421,1326414678,2.2.379.0.1.5971236.3.509.2117853776506399936,2.2.379.0.1.5971236.3.509.6412913489296797476,2.2.379.0.1.5971236.3.509.1118646326684973163,Accession Number,SH,"(0008, 0050) Accession Number ...",20170304E698535,-2017-03-04-E
425,1507439060,2.2.874.0.0.8400978.5.486.2510732133849908984,2.2.874.0.0.8400978.5.486.1747132007124743752,2.2.874.0.0.8400978.5.486.2019300706932341803,Accession Number,SH,"(0008, 0050) Accession Number ...",20170726E609517,-2017-07-26-E
505,1814567196,1.5.566.0.3.5921028.1.523.6121491292105488144,1.5.566.0.3.5921028.1.523.5821238071079767946,1.5.566.0.3.5921028.1.523.6321575997177780656,Accession Number,SH,"(0008, 0050) Accession Number ...",20180805E673674,-2018-08-05-E
988,4953814140,1.3.241.0.0.3328322.7.935.2685937606730192613,1.3.241.0.0.3328322.7.935.9222594282780660977,1.3.241.0.0.3328322.7.935.1641820698690379306,Accession Number,SH,"(0008, 0050) Accession Number ...",20171114E506367,-2017-11-14-E
994,4953814140,1.3.241.0.0.3328322.7.935.2685937606730192613,1.3.241.0.0.3328322.7.935.9222594282780660977,1.3.241.0.0.3328322.7.935.3386585305478125613,Accession Number,SH,"(0008, 0050) Accession Number ...",20171114E506367,-2017-11-14-E
...,...,...,...,...,...,...,...,...,...
5561,9473050187,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.1220587225385277156,Accession Number,SH,"(0008, 0050) Accession Number ...",20140722E804773,-2014-07-22-E
5570,9473050187,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.3053293679157621944,Accession Number,SH,"(0008, 0050) Accession Number ...",20140722E804773,-2014-07-22-E
5579,9473050187,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.2941125927500826196,Accession Number,SH,"(0008, 0050) Accession Number ...",20140722E804773,-2014-07-22-E
5588,9473050187,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.3114818742644719069,Accession Number,SH,"(0008, 0050) Accession Number ...",20140722E804773,-2014-07-22-E


In [130]:
# Show the Patient ID element of the dataset stored in the first row of df.
df.iloc[0][0]['PatientID']

(0010, 0020) Patient ID                          LO: ''

In [136]:
# Rewritten values for flagged rows 90 through 299.
flagged['newvalue'][90:300]

90                 Ordered for  *
91     PET-CT SERIES -2010-12-03-
92                               
93                 Ordered for  *
94                               
                  ...            
295    PET-CT SERIES -2010-12-03-
296                              
297                Ordered for  *
298                              
299                              
Name: newvalue, Length: 210, dtype: object

In [137]:
# Number of blob names listed under the prefix (including the first one that is skipped when loading).
len(filelist_in)

722

In [9]:
# Uploads the processed DICOM files to the midi-import bucket.
bucket = client.get_bucket('midi-import')
# filelist_in[0] is skipped, so name number i pairs with row i-1 of df.
for i, blob_name in enumerate(filelist_in[1:], start=1):
    filepath = 'preprocessed/ds1_1-23preprocessed6-27/' + blob_name
    blob = bucket.blob(filepath)
    # Every dataset is written to the same local scratch file, then uploaded.
    df.iloc[i-1][0].save_as('test6.dcm')
    blob.upload_from_filename('test6.dcm')
    

In [7]:
# Scratch experiment: reformat an 8-digit date by splitting it on three
# candidate layouts in turn (yyyymmdd, ddmmyyyy, mmddyyyy).
str1 = 'Performed 20171126'

n = '20171126'

# re.split with capturing groups keeps the captured pieces, so a match yields
# ['', year, month, day, ''] and the join gives ' 2017 11 26 ' (see the output);
# a layout that does not match returns a one-element list.
out1 = re.split('(19[0-9][0-9]|20[0-2][0-9])(0[1-9]|1[0-2])(0[1-9]|1[0-9]|2[0-9]|3[0-1])',n)
dt1 = ' '.join(out1)

# The later layouts run on the already-joined text of the previous step.
out2 = re.split('(0[1-9]|1[0-9]|2[0-9]|3[0-1])(0[1-9]|1[0-2])(19[0-9][0-9]|20[0-2][0-9])',dt1)
dt2 = ' '.join(out2)

out3 = re.split('(0[1-9]|1[0-2])(0[1-9]|1[0-9]|2[0-9]|3[0-1])(19[0-9][0-9]|20[0-2][0-9])',dt2)
dt3 = ' '.join(out3)

# Bare expression in the middle of a cell: it is evaluated but not displayed.
dt3

# No layout matched -> drop the digits; otherwise substitute the reformatted date.
if len(out1) == 1 and len(out2)==1 and len(out3)==1:
    str2 = str1.replace(n,'')
    
else:
    str2 = str1.replace(n,dt3)

print(dt1)
# This might end up being more computationally expensive than the if statements.
#this might end up being more computational expensive than the if statements

 2017 11 26 


In [35]:
#automatic code final
# Walks every element of every dataset in `df`, rewrites text values in place, and records
# each element that contained a 6-15 digit run so the changes can be reviewed afterwards.
import re

file = 0
#vrs to look at
vrs = ['LO', 'SH', 'LT', 'ST', 'UT','IS']
str1 = ''
x = []
x2 = []

# Parallel lists: one entry per flagged element (same length at all times).
patient = []
study = []
SOP = []
series = []
element = []
tag = []
value = []      # text after the regex/blacklist pass, before digit-run handling
vr = []
newvalue = []   # text after digit-run handling

#regex rules
loc = re.compile('at [A-Z]{2,4}')
dr = re.compile('by [A-Z]{2}')
digit_run = re.compile('[0-9]{6,15}')

#blacklist
blacklist = [' MGH', ' ALH']

# Year window used to decide whether an 8-digit run looks like a YYYYMMDD date.
MIN_YEAR = 1895
MAX_YEAR = 2023

# keeplist holds strings spelled like '(0009, 1008)'. A pydicom Tag object does not compare
# equal to that spelling, so the previous `elem.tag not in keeplist` test never excluded
# anything. Tags are now formatted into the same spelling before the lookup.
keep_tags = set(keeplist)


def _tag_key(tag_obj):
    """Spell a pydicom tag as '(gggg, eeee)' with lowercase hex, matching keeplist entries."""
    return '({:04x}, {:04x})'.format(tag_obj.group, tag_obj.element)


def _replace_digit_run(n, vr_code):
    """Return the replacement for one digit run `n` found in an element whose VR is `vr_code`.

    An 8-digit run that passes the year/month/day range check is reformatted as YYYY-MM-DD,
    except in IS elements where it is removed. Every other run is removed.
    """
    if len(n) == 8:
        yr, m, d = n[:4], n[4:6], n[6:]
        if MIN_YEAR <= int(yr) <= MAX_YEAR and 0 < int(m) <= 12 and 0 < int(d) <= 31:
            if vr_code == 'IS':
                #remove dates in IS VRs
                return ''
            #reformat dates in all other VRs
            return yr + '-' + m + '-' + d
    #replace non-dates and runs that are not 8 digits long
    return ''


for i in range(len(df)):
    # Row i, column 0 is the 'dicom' column; a single positional lookup avoids the deprecated
    # integer-position fallback that the chained `df.iloc[i][0]` relied on.
    file = df.iloc[i, 0]
    for elem in file.iterall():
        if elem.VR not in vrs or elem.name in ('Code Value', 'Private Creator'):
            continue
        if _tag_key(elem.tag) in keep_tags:
            continue
        try:
            original_text = str(elem.value)
            # NOTE(review): a multi-valued element is stringified as a list repr here and, if
            # changed, written back as one string. This matches the earlier behaviour; confirm
            # whether multi-valued elements need per-item handling.

            #regex replacement
            str1 = dr.sub('*', loc.sub('*', original_text))
            for b in blacklist:
                str1 = str1.replace(b, '+')
            # Only write back when something changed, so untouched values keep their type.
            if str1 != original_text:
                elem.value = str1

            #date/pid finder
            x = digit_run.findall(str1)
            if len(x) == 0:
                continue

            # Snapshot the element before digit handling (this is what gets reviewed).
            element_repr = str(elem)
            scrubbed_text = str1
            vr_code = elem.VR

            # Replace each run where it occurs. Working on the text (not on elem.value, which is
            # not a str for IS elements) and substituting per match avoids both the failing
            # `.replace` call and accidental edits when one run is a substring of another.
            str1 = digit_run.sub(lambda hit: _replace_digit_run(hit.group(0), vr_code), str1)
            elem.value = str1
            final_text = str(elem.value)
        except Exception as exc:
            # Report and move on; KeyboardInterrupt is deliberately not caught.
            print(elem, '->', repr(exc))
            continue

        #creating df to look into results
        # Appended only after every step succeeded, so the lists can never get out of step.
        patient.append(df.iloc[i, 1])
        study.append(df.iloc[i, 2])
        series.append(df.iloc[i, 3])
        SOP.append(df.iloc[i, 4])
        element.append(element_repr)
        tag.append(elem.name)
        vr.append(vr_code)
        value.append(scrubbed_text)
        #add new val to df
        newvalue.append(final_text)
                    

Unnamed: 0,patient,study,series,SOP,tag,vr,element,value
0,1326414678,2.2.379.0.1.5971236.3.509.2117853776506399936,2.2.379.0.1.5971236.3.509.6412913489296797476,2.2.379.0.1.5971236.3.509.1118646326684973163,Accession Number,SH,"(0008, 0050) Accession Number ...",20170304E
1,1507439060,2.2.874.0.0.8400978.5.486.2510732133849908984,2.2.874.0.0.8400978.5.486.1747132007124743752,2.2.874.0.0.8400978.5.486.2019300706932341803,Accession Number,SH,"(0008, 0050) Accession Number ...",20170726E
2,1814567196,1.5.566.0.3.5921028.1.523.6121491292105488144,1.5.566.0.3.5921028.1.523.5821238071079767946,1.5.566.0.3.5921028.1.523.6321575997177780656,Accession Number,SH,"(0008, 0050) Accession Number ...",20180805E
3,4953814140,1.3.241.0.0.3328322.7.935.2685937606730192613,1.3.241.0.0.3328322.7.935.9222594282780660977,1.3.241.0.0.3328322.7.935.1641820698690379306,Accession Number,SH,"(0008, 0050) Accession Number ...",20171114E
4,4953814140,1.3.241.0.0.3328322.7.935.2685937606730192613,1.3.241.0.0.3328322.7.935.9222594282780660977,1.3.241.0.0.3328322.7.935.3386585305478125613,Accession Number,SH,"(0008, 0050) Accession Number ...",20171114E
...,...,...,...,...,...,...,...,...
348,9473050187,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.1220587225385277156,Accession Number,SH,"(0008, 0050) Accession Number ...",20140722E
349,9473050187,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.3053293679157621944,Accession Number,SH,"(0008, 0050) Accession Number ...",20140722E
350,9473050187,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.2941125927500826196,Accession Number,SH,"(0008, 0050) Accession Number ...",20140722E
351,9473050187,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.3114818742644719069,Accession Number,SH,"(0008, 0050) Accession Number ...",20140722E


In [None]:
# Upload the datasets selected by `dicomlist` to the 'midi-import' bucket under the 5-19 prefix.
# NOTE(review): `dicomlist` is not defined in the visible part of the notebook; it is used
# here as an iterable of row positions into `df` — confirm where it comes from.
bucket = client.get_bucket('midi-import')
for i in dicomlist:
    # `df` was built from filelist_in[1:], hence the +1 offset into filelist_in.
    filepath = 'preprocessed/ds1_1-23preprocessed5-19/' + filelist_in[1 + i]
    blob = bucket.blob(filepath)
    # One positional lookup (row i, column 0 = 'dicom') instead of the chained `df.iloc[i][0]`,
    # which relied on the deprecated integer-position fallback of Series.__getitem__.
    # The dataset is staged in a local scratch file that is overwritten on every iteration.
    df.iloc[i, 0].save_as('test2.dcm')
    blob.upload_from_filename('test2.dcm')
    

In [35]:
# Inspect the SeriesNumber element of the dataset in row 30.
# Single positional lookup (row 30, column 0 = 'dicom'); the chained `df.iloc[30][0]` relied
# on the deprecated integer-position fallback of Series.__getitem__.
df.iloc[30, 0]['SeriesNumber']

(0020, 0011) Series Number                       IS: ''

In [None]:
#manual code final
# Read-only scan: record every element of the listed VRs whose value contains at least one
# digit run accepted by `filterLength`. Nothing is written back to the datasets.
# NOTE(review): `filterLength` is defined outside the visible part of the notebook; it is
# used as a predicate over digit strings — confirm its rule.
file = 0
vrs = ['LO', 'SH', 'LT', 'ST', 'UT','IS']
str1 = ''
x = []
x2 = []

# Parallel lists: one entry per flagged element.
patient = []
study = []
SOP = []
series = []
element = []
tag = []
value = []
vr = []

for i in range(len(df)):
    # Row i, column 0 is the 'dicom' column; one positional lookup replaces the chained
    # `df.iloc[i][0]`, which relied on a deprecated pandas fallback.
    file = df.iloc[i, 0]
    for elem in file.iterall():
        if elem.VR in vrs:
            if str(elem.value).isalpha() == False:
                # re.findall only accepts text; the raw value can be a non-string object
                # (e.g. for IS elements), which previously failed on every such element.
                str1 = str(elem.value)
                try:
                    x = re.findall('[0-9]+', str1)
                    x2 = list(filter(filterLength, x))
                    if len(x2) != 0:
                        patient.append(df.iloc[i, 1])
                        study.append(df.iloc[i, 2])
                        series.append(df.iloc[i, 3])
                        SOP.append(df.iloc[i, 4])
                        element.append(str(elem))
                        tag.append(elem.name)
                        vr.append(elem.VR)
                        value.append(elem.value)
                except Exception as exc:
                    # Report and continue; KeyboardInterrupt is deliberately not caught.
                    print(elem, '->', repr(exc))
                                      
                        
    

In [None]:
# (cell intentionally empty: it previously held stray keystrokes that were not valid Python
# and would stop a Restart & Run All)

In [6]:
# Read-only scan, same as the manual scan but also covering the UL and US VRs: record every
# element whose stringified value contains a digit run accepted by `filterLength`.
# NOTE(review): `filterLength` is defined outside the visible part of the notebook — confirm
# its rule.
file = 0
vrs = ['LO', 'SH', 'LT', 'ST', 'UT','IS', 'UL', 'US']
str1 = ''
x = []
x2 = []

# Parallel lists: one entry per flagged element.
patient = []
study = []
SOP = []
series = []
element = []
tag = []
value = []
vr = []

for i in range(len(df)):
    # Row i, column 0 is the 'dicom' column; one positional lookup replaces the chained
    # `df.iloc[i][0]`, which relied on a deprecated pandas fallback.
    file = df.iloc[i, 0]
    for elem in file.iterall():
        if elem.VR in vrs:
            if str(elem.value).isalpha() == False:
                str1 = str(elem.value)
                try:
                    x = re.findall('[0-9]+', str1)
                    x2 = list(filter(filterLength, x))
                    if len(x2) != 0:
                        patient.append(df.iloc[i, 1])
                        study.append(df.iloc[i, 2])
                        series.append(df.iloc[i, 3])
                        SOP.append(df.iloc[i, 4])
                        element.append(str(elem))
                        tag.append(elem.name)
                        vr.append(elem.VR)
                        value.append(elem.value)
                except Exception as exc:
                    # Report and continue; a bare `except:` would also swallow
                    # KeyboardInterrupt and make the loop hard to stop.
                    print(elem, '->', repr(exc))
                                      
                        
    



In [40]:
# Peek at the last value the scan loop looked at. The value can be very large, so only the
# first 500 characters of its repr are shown instead of dumping it whole into the output.
print(repr(str1)[:500])

b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x

In [5]:
# Read-only scan over ALL elements (no VR filter): record every element whose stringified
# value contains a digit run accepted by `filterLength`.
# NOTE(review): `vrs` is assigned but never consulted in this cell, so every element is
# stringified and scanned regardless of VR — confirm whether the VR filter was meant to
# apply here.
# NOTE(review): `filterLength` is defined outside the visible part of the notebook.
file = 0
vrs = ['LO', 'SH', 'LT', 'ST', 'UT', 'IS', 'UL', 'US']
str1 = ''
x = []
x2 = []

# Parallel lists: one entry per flagged element.
patient = []
study = []
SOP = []
series = []
element = []
tag = []
value = []
vr = []

for i in range(len(df)):
    # Row i, column 0 is the 'dicom' column; one positional lookup replaces the chained
    # `df.iloc[i][0]`, which relied on a deprecated pandas fallback.
    file = df.iloc[i, 0]
    for elem in file.iterall():
        if str(elem.value).isalpha() == False:
            str1 = str(elem.value)
            try:
                x = re.findall('[0-9]+', str1)
                x2 = list(filter(filterLength, x))
                if len(x2) != 0:
                    patient.append(df.iloc[i, 1])
                    study.append(df.iloc[i, 2])
                    series.append(df.iloc[i, 3])
                    SOP.append(df.iloc[i, 4])
                    element.append(str(elem))
                    tag.append(elem.name)
                    vr.append(elem.VR)
                    value.append(elem.value)
            except Exception as exc:
                # Report and continue; a bare `except:` would also swallow KeyboardInterrupt,
                # which matters for a loop this long.
                print(elem, '->', repr(exc))
                                      
                        
    



KeyboardInterrupt: 

In [1]:
# Dry run of the scan over all elements: the digit runs are extracted and filtered, but the
# code that records hits is commented out, so the result lists stay empty.
# NOTE(review): `vrs` is assigned but never consulted in this cell.
# NOTE(review): `filterLength` is defined outside the visible part of the notebook.
file = 0
vrs = ['LO', 'SH', 'LT', 'ST', 'UT']
str1 = ''
x = []
x2 = []

patient = []
study = []
SOP = []
series = []
element = []
tag = []
value = []
vr = []

for i in range(len(df)):
    # Row i, column 0 is the 'dicom' column; one positional lookup replaces the chained
    # `df.iloc[i][0]`, which relied on a deprecated pandas fallback.
    file = df.iloc[i, 0]
    for elem in file.iterall():
        if str(elem.value).isalpha() == False:
            str1 = str(elem.value)
            try:
                x = re.findall('[0-9]+', str1)
                x2 = list(filter(filterLength, x))
                #if len(x2) != 0:
                    #patient.append(df.iloc[i][1])
                    #study.append(df.iloc[i][2])
                    #series.append(df.iloc[i][3])
                    #SOP.append(df.iloc[i][4])
                    #element.append(str(elem))
                    #tag.append(elem.name)
                    #vr.append(elem.VR)
                    #value.append(elem.value)
            except Exception as exc:
                # Report and continue; KeyboardInterrupt is deliberately not caught.
                print(elem, '->', repr(exc))
                                      
                        
    

NameError: name 'df' is not defined

In [28]:
# Assemble the parallel result lists into one table, one row per flagged element.
d = dict(
    patient=patient,
    study=study,
    series=series,
    SOP=SOP,
    tag=tag,
    vr=vr,
    element=element,
    value=value,
)
flagged = pd.DataFrame(data=d)

In [31]:
# How many flagged rows there are for each element name.
rows_by_tag = flagged.groupby(['tag'])
rows_by_tag['tag'].count()

tag
Accession Number    353
Name: tag, dtype: int64

In [8]:
# Keep only the flagged rows whose element name is not one of these five.
excluded_tags = [
    'Code Value',
    'Patient ID',
    'Study ID',
    'Clinical Trial Subject ID',
    'Accession Number',
]
is_excluded = flagged['tag'].isin(excluded_tags)
flagged2 = flagged[~is_excluded]

In [9]:
# Distinct element names that remain after filtering, in order of first appearance.
pd.unique(flagged2['tag'])

array(['Additional Patient History', 'Patient Comments',
       'Actual Frame Duration', 'Series Description', 'Study Comments',
       'Series Number', 'Scheduled Procedure Step Description',
       'Study Description', 'Private Creator', 'Protocol Name',
       'Requested Procedure Comments', 'Requested Procedure Description',
       'Imaging Service Request Comments',
       'Attenuation Correction Method', 'Acquisition Comments',
       'Private tag data', 'Image Comments', 'Text Value',
       'Medical Alerts'], dtype=object)

In [10]:
# Total number of flagged rows, before the element-name filter that produces `flagged2`.
len(flagged)

7161

In [11]:
# Keep the first row for each distinct (study, element) pair.
dedupe_keys = ['study', 'element']
unique_flagged = flagged2.drop_duplicates(subset=dedupe_keys)

In [12]:
# Number of rows left after de-duplicating on (study, element).
len(unique_flagged)

89

In [25]:
# Write the de-duplicated table to a CSV file in the working directory.
unique_flagged.to_csv(path_or_buf='allflaggedtest.csv')

In [102]:
# Show only the de-duplicated rows that belong to the Image Comments element.
is_image_comment = unique_flagged['tag'] == 'Image Comments'
unique_flagged[is_image_comment]

Unnamed: 0,study,series,SOP,tag,vr,element
1304,1.3.241.0.0.3328322.7.935.2685937606730192613,1.3.241.0.0.3328322.7.935.9222594282780660977,1.3.241.0.0.3328322.7.935.2137823494280056407,Image Comments,LT,"(0020, 4000) Image Comments ..."
2114,1.3.241.0.0.3328322.7.935.2685937606730192613,1.3.241.0.0.3328322.7.935.9222594282780660977,1.3.241.0.0.3328322.7.935.3037455751552642927,Image Comments,LT,"(0020, 4000) Image Comments ..."
3874,2.3.439.1.0.0712194.8.594.3353626527435595311,2.3.439.1.0.0712194.8.594.2584876857362372891,2.3.439.1.0.0712194.8.594.2118017278604801017,Image Comments,LT,"(0020, 4000) Image Comments ..."
4050,2.3.439.1.0.0712194.8.594.3353626527435595311,2.3.439.1.0.0712194.8.594.2584876857362372891,2.3.439.1.0.0712194.8.594.3217557278435333498,Image Comments,LT,"(0020, 4000) Image Comments ..."
4333,2.5.444.1.3.9432617.9.170.1605644608138802224,2.5.444.1.3.9432617.9.170.1185165576631833947,2.5.444.1.3.9432617.9.170.2630043865913544067,Image Comments,LT,"(0020, 4000) Image Comments ..."
4416,2.5.760.0.0.0071116.8.497.2211300014030572904,2.5.760.0.0.0071116.8.497.6426282347040498683,2.5.760.0.0.0071116.8.497.1302127158402175177,Image Comments,LT,"(0020, 4000) Image Comments ..."
4479,2.5.760.0.0.0071116.8.497.2211300014030572904,2.5.760.0.0.0071116.8.497.6426282347040498683,2.5.760.0.0.0071116.8.497.2112746267588715374,Image Comments,LT,"(0020, 4000) Image Comments ..."
4606,2.5.760.0.0.0071116.8.497.2211300014030572904,2.5.760.0.0.0071116.8.497.6426282347040498683,2.5.760.0.0.0071116.8.497.6996487001063066160,Image Comments,LT,"(0020, 4000) Image Comments ..."


In [122]:
# SOP value of the row whose index label is 1304.
unique_flagged.loc[1304, 'SOP']

'1.3.241.0.0.3328322.7.935.2137823494280056407'

In [123]:
# Locate the row of `df` whose SOPID equals the value looked up above.
target_sop = '1.3.241.0.0.3328322.7.935.2137823494280056407'
df.loc[df['SOPID'] == target_sop]

Unnamed: 0,dicom,patientID,studyID,seriesID,SOPID
176,"[(0008, 0005) Specific Character Set ...",PLACEHOLDER,1.3.241.0.0.3328322.7.935.2685937606730192613,1.3.241.0.0.3328322.7.935.9222594282780660977,1.3.241.0.0.3328322.7.935.2137823494280056407


In [125]:
# Read the ImageComments value of the dataset in row 176.
# Single positional lookup (row 176, column 0 = 'dicom'); the chained `df.iloc[176][0]`
# relied on the deprecated integer-position fallback of Series.__getitem__.
df.iloc[176, 0]['ImageComments'].value

'Ordered for 4953814140 by MC'