In [2]:
import glob

In [3]:
import os
import re
import inspect

def _get_parser_list(dirname):
    files = [f.replace('.py', '')
            for f in os.listdir(dirname) 
            if not f.startswith('__')]
            
    return files

def _import_parsers(parserfiles):
    m = re.compile('.+parser$', re.I)
    _modules = __import__('parsers',
            globals(),
            locals(),
            parserfiles,
            0) 

    _parsers = [(k, v) for k, v in inspect.getmembers(_modules)
            
            if inspect.ismodule(v) and m.match(k)]
    _classes = dict()
    for k, v in _parsers:
        _classes.update({k: v for k, v in inspect.getmembers(v)
                if inspect.isclass(v) and m.match(k)})
    
    return _classes

def load(dirname):
    """Discover the parser modules in ``dirname`` and return their classes.

    The directory is only used to list candidate module names; the modules
    themselves are imported from the package named ``parsers``, so that
    package must be importable. Returns the class-name -> class dict produced
    by ``_import_parsers``.
    """
    return _import_parsers(_get_parser_list(dirname))

In [4]:
# The parser sources live in a "parsers" folder next to the notebook's working
# directory; list the module names found there, one per line.
_p_uri = os.path.normpath(os.path.join(os.getcwd(), '..', 'parsers'))
_p_files = _get_parser_list(dirname=_p_uri)
for _p in _p_files:
    print(_p)

# Importing the parser classes themselves is currently disabled:
#_p_classes = load(_p_uri)

#for k,v in _p_classes.items():
#    print(k , v)

base_parser
dicom_parser
excel_parser
exif_parser
ics_parser
imagetext_parser
name_parser
pdf_parser
powerpoint_parser
word_parser
zip_parser


In [61]:
_files = []
_paths = [
    os.path.join("D:\\", *["purge", "dicom", "DICOM Datasets"], "*.docx"),
    os.path.join("D:\\", *["purge", "dicom", "DICOM Datasets"], "*.doc"),
    os.path.join("D:\\", *["purge", "dicom", "DICOM Datasets"], "*.pdf")
]

#_base_folder = os.path.join("D:\\", *["purge", "dicom", "DICOM Datasets"], "*.docx") #", "
for _p in _paths:
    _files.extend(glob.glob(_p))

print(len(_files))
print(_files)


57
['D:\\purge\\dicom\\DICOM Datasets\\22093011_KAMLESH W_O DULICHAND_BRAIN AND HEAD.docx', 'D:\\purge\\dicom\\DICOM Datasets\\22093014_SUNITA W_O ROTASH_BRAIN AND HEAD.docx', 'D:\\purge\\dicom\\DICOM Datasets\\22093015_SAURAV_BRAIN AND HEAD.docx', 'D:\\purge\\dicom\\DICOM Datasets\\22100114_NIRMAL W_O NARYAN_BRAIN AND HEAD.docx', 'D:\\purge\\dicom\\DICOM Datasets\\Arvind kumar  b.docx', 'D:\\purge\\dicom\\DICOM Datasets\\BABY PARI 04 YRS----CT BRAIN.docx', 'D:\\purge\\dicom\\DICOM Datasets\\barji.docx', 'D:\\purge\\dicom\\DICOM Datasets\\BHAGOTI DEVI.docx', 'D:\\purge\\dicom\\DICOM Datasets\\CHETRAM CT HEAD.docx', 'D:\\purge\\dicom\\DICOM Datasets\\CT HEAD -SITA RAM.docx', 'D:\\purge\\dicom\\DICOM Datasets\\DEVI SAHAY.docx', 'D:\\purge\\dicom\\DICOM Datasets\\GOPAL SINGH CT HEAD.docx', 'D:\\purge\\dicom\\DICOM Datasets\\Gulabi devi ct head.docx', 'D:\\purge\\dicom\\DICOM Datasets\\Haneek.docx', 'D:\\purge\\dicom\\DICOM Datasets\\jagdish.docx', 'D:\\purge\\dicom\\DICOM Datasets\\JAYA K

In [6]:
import fitz

def get_pdfcontents(uri: str) -> dict:
    """Read the table of contents (outline) of a PDF.

    Parameters
    ----------
    uri : str
        Path of the document, passed to ``fitz.open``.

    Returns
    -------
    dict
        ``{"type": "toc", "toc": <result of Document.get_toc(simple=True)>}``.

    Raises
    ------
    Whatever ``fitz.open`` raises when the file cannot be opened.
    """
    # The context manager closes the document (and its file handle) even if
    # reading the outline fails; previously the document was left open.
    with fitz.open(uri) as doc:
        data = doc.get_toc(simple=True)

    return {
        "type": "toc",
        "toc": data
    }

def get_pdflines(uri: str) -> dict:
    """Extract the text of a PDF, per page and as a flat list of lines.

    Parameters
    ----------
    uri : str
        Path of the document, passed to ``fitz.open``.

    Returns
    -------
    dict
        ``{"type": "lines", "pages": {...}, "lines": [...]}`` where ``pages``
        maps the 0-based page index to that page's raw text (pages without
        text are omitted) and ``lines`` holds every non-blank line of all
        pages, stripped of surrounding whitespace, in page order.

    Raises
    ------
    Whatever ``fitz.open`` raises when the file cannot be opened.
    """
    _pages = {}
    _lines = []
    # The context manager closes the document (and its file handle) once all
    # pages are read; previously the document was left open.
    with fitz.open(uri) as _doc:
        for _idx, _page in enumerate(_doc):
            # NOTE(review): flags=2 is passed to PyMuPDF as-is; presumably
            # TEXT_PRESERVE_WHITESPACE - confirm against the installed version.
            _text = _page.get_text('text', sort=True, flags=2)
            if _text:
                _pages[_idx] = _text
                # split the page text into lines, dropping blank ones
                _lines.extend([x.strip() for x in _text.split("\n") if x.strip()])

    return {
        "type": "lines",
        "pages": _pages,
        "lines": _lines
    }

In [7]:
import docx2txt

def get_doxcontents(uri: str) -> dict:
    """Extract the text of a .docx file with ``docx2txt``.

    Returns ``{"type": "strings", "strings": <text>}`` where ``<text>`` is
    whatever ``docx2txt.process(uri)`` returns; it is not split or cleaned here.
    """
    return {"type": "strings", "strings": docx2txt.process(uri)}

In [8]:
# Dispatch table: lower-cased file extension -> function that extracts the
# document's contents. Extensions without an entry have no extractor.
_funcs = dict((
    (".docx", get_doxcontents),
    (".pdf", get_pdflines),
))

In [9]:
# Run the matching extractor over every collected file.
# _data maps file path -> {"contents": <extractor result>, "lines": [...]};
# the "lines" entry is only present when at least one non-blank line was found.
_data = {}
for _f in _files:
    print("FILE:\n\t",_f)
    # Index the splitext result instead of unpacking into `_`, which IPython
    # uses for the last cell output.
    _ext = os.path.splitext(_f)[1]
    _func = _funcs.get(_ext.lower())
    if not _func:
        # No extractor registered for this extension: nothing to store.
        continue

    _result = _func(uri=_f)

    # Start from an empty list for every file so that an unrecognised result
    # type can never reuse the lines of the previously processed file.
    _lines = []
    if _result.get("type") == "strings":
        # One big string: split it into stripped, non-blank lines.
        _lines = [x.strip() for x in _result.get("strings","").split("\n") if x.strip()]
    elif _result.get("type") == "lines":
        # The extractor already produced cleaned lines.
        _lines = _result.get("lines",[])

    _data[_f] = {
        "contents": _result,
    }
    if _lines:
        _data[_f]["lines"] = _lines


FILE:
	 D:\purge\dicom\DICOM Datasets\22093011_KAMLESH W_O DULICHAND_BRAIN AND HEAD.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\22093014_SUNITA W_O ROTASH_BRAIN AND HEAD.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\22093015_SAURAV_BRAIN AND HEAD.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\22100114_NIRMAL W_O NARYAN_BRAIN AND HEAD.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\Arvind kumar  b.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\BABY PARI 04 YRS----CT BRAIN.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\barji.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\BHAGOTI DEVI.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\CHETRAM CT HEAD.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\CT HEAD -SITA RAM.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\DEVI SAHAY.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\GOPAL SINGH CT HEAD.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\Gulabi devi ct head.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\Haneek.docx
FILE:
	 D:\purge\dicom\DICOM Datasets\jagdish.docx
FILE:
	 D:\purge\dicom\DICOM D

In [10]:
# Count how often each line (compared case-insensitively) occurs across all
# documents, then list the lines alphabetically, truncated to 30 characters,
# followed by their count. Frequent lines are candidates for field labels.
_contents = {}
for _fp, _dv in _data.items():
    for _l in _dv.get("lines",[]):
        _key = _l.lower()
        _contents[_key] = _contents.get(_key, 0) + 1

for _c in sorted(_contents):
    print(_c[:30], _contents[_c])

011y 1
018y 1
022y 2
032y 1
034y 1
039y 1
040y 1
042y 1
046y 1
056y 1
062y 1
070y 1
1-oct-22 5
1/10/2022 12:35:05 pm 1
1/10/2022 1:02:27 pm 1
2-oct-22 4
2/10/2022 10:02:26 pm 1
2/10/2022 10:29:21 pm 1
2/10/2022 10:43:53 pm 1
2/10/2022 3:00:42 pm 1
2/10/2022 3:57:18 pm 1
22093011 1
22093014 1
22093015 1
22100114 1
2210025 1
2210027 1
2210028 1
22100313 1
22100314 1
22100316 1
22100317 1
22100319 1
22100320 1
24-sep-22 3
25-sep-22 1
26-sep-22 7
27-sep-22 3
28-sep-22 3
29-sep-22 4
3-oct-22 4
3/10/2022 10:07:44 am 1
3/10/2022 10:47:08 pm 1
3/10/2022 10:55:46 pm 1
3/10/2022 3:09:27 pm 1
3/10/2022 4:13:16 pm 1
3/10/2022 4:34:15 pm 1
3/10/2022 5:03:00 pm 1
3/10/2022 5:20:56 pm 1
3/10/2022 6:21:36 pm 1
3/10/2022 7:24:11 pm 1
3/10/2022 8:25:56 pm 1
3/10/2022 9:31:30 pm 1
3/10/2022 9:36:29 pm 1
30-sep-22 3
30/9/2022 2:17:04 pm 1
30/9/2022 2:27:40 pm 1
30/9/2022 2:41:48 pm 1
30/9/2022 3:06:26 pm 1
30/9/2022 3:36:11 pm 1
30/9/2022 4:44:26 pm 1
4-oct-22 5
above study was performed on 6 9
above stud

In [53]:
import re

_fields = [
    "age",
    "date",
    "gender",
    "patient id",
    "patient name",
    "ref doctor",
    "ref. by",
    "report date",
    "sex",
    "study date",
]

_regs = {
    "notice": re.compile("^(this report is not valid.*$)", re.IGNORECASE),
    "scan": re.compile("(^.*scan of brain.*$)", re.IGNORECASE),
    #"impression": re.compile("^impression:(.*)$", re.IGNORECASE),
    #"observations": re.compile("^observations:(.*)$", re.IGNORECASE),
    #"protocol": re.compile("^protocol:(.*)$", re.IGNORECASE),
    #"disclaimer": re.compile("^disclaimer:(.*)$", re.IGNORECASE),
    #"advice": re.compile("^advice:(.*)$", re.IGNORECASE),
    "advice": re.compile("^(please correlate.*)$", re.IGNORECASE),
    "__misc": re.compile("^([a-zA-Z]+):(.*)$", re.IGNORECASE),
}

_mappings = {
    "ref. by": "referral",
    "ref doctor": "referral",
    "adv": "advice"
}

In [43]:
# Smoke test: show which pattern recognises a typical "LABEL: text" line and
# what key / value would be derived from the match.
_test = "ADVICE:  Please Correlate Clinically"
for _rl, _re in _regs.items():
    _m = _re.match(_test)
    print(_m)
    if not _m:
        continue
    _groups = _m.groups()
    if len(_groups) > 1:
        # two groups: "<key>:<value>"
        print(_groups[0].strip().lower()," -> ", _groups[1].strip())
    else:
        # one group: the whole line is the value
        print(_groups[0].strip())
 


None
None
None
<re.Match object; span=(0, 36), match='ADVICE:  Please Correlate Clinically'>
advice  ->  Please Correlate Clinically


In [44]:
# Path of the one document to inspect in detail: the extraction loop compares
# each processed path against `_uri` and prints a trace only for that file.
# The commented-out assignments are alternative sample documents.
#_uri = r"D:\purge\dicom\DICOM Datasets\2210025_SURBHID_OSURESH_BRAIN AND HEAD.pdf"
#_uri = r"D:\purge\dicom\DICOM Datasets\22100320_NEERAJ_BRAIN AND HEAD.pdf"
#_uri = r"D:\purge\dicom\DICOM Datasets\22093011_KAMLESH W_O DULICHAND_BRAIN AND HEAD.docx"
_uri = r"D:\purge\dicom\DICOM Datasets\BABY PARI 04 YRS----CT BRAIN.docx"

In [45]:
# Turn each document's list of lines into a flat record (field name -> text)
# and store it under _data[path]["record"].
#
# The lines are walked once while tracking the field currently being filled:
#   _k  - name of the current field (None until the first field is recognised)
#   _ls - value lines collected for that field so far
# A new field starts when a line is exactly one of the labels in `_fields`, or
# when it matches one of the patterns in `_regs`. Any other line is appended to
# the current field; lines seen before the first field are discarded.
for _fp, _dv in _data.items():
    # Trace output is printed only for the document selected by `_uri`.
    _dout = (_uri and _fp == _uri)

    _record = {}
    _k = None
    _ls = []
    for _l in _dv.get("lines",[]):
        print(f"Line: {_l}") if _dout else None
        # Set when a pattern consumed this line, so it is not appended again below.
        _mfnd = False
        if _l.lower() in _fields:
            # Stand-alone label: flush the field collected so far. A field
            # that gathered no value lines is not written to the record.
            if _k and _ls:
                _record[_k] = '\n'.join(_ls)
                print(f"L Field: {_k} {_ls}") if _dout else None
            # The label line itself only names the field; it is not stored.
            _k = _l.lower()
            _ls = []
            print(f"L New: {_k}") if _dout else None
            continue
        else:
            for _rl, _re in _regs.items():
                _m = _re.match(_l)
                print(f"M Reg: {_k} {_m}") if _dout else None
                if _m:
                    if len(_m.groups()) > 1:
                        # "<key>:<value>" pattern: the field is named after the
                        # text before the colon, lower-cased.
                        _nk = _m.group(1).strip().lower()
                        _nl = _m.group(2).strip()
                    else:
                        # Single-group pattern: the field is named after the
                        # pattern's label in `_regs`, the matched text is its value.
                        _nk = None
                        _nl = _m.group(1).strip()

                    # Flush the previous field before starting the new one.
                    if _k and _ls:
                        _record[_k] = '\n'.join(_ls)
                        print(f"M Field: {_k} {_ls}") if _dout else None

                    _k = _nk if _nk else _rl
                    _ls = []
                    print(f"M New: {_k}") if _dout else None
                    # The matched text is the first value line of the new field
                    # (this can be an empty string for a bare "LABEL:" line).
                    _ls.append(_nl)
                    _mfnd = True
                    # NOTE(review): as the last statement of the inner loop this
                    # `continue` does not stop the scan - the remaining patterns
                    # are still tried, and a line matching several of them
                    # restarts the field once per match. Possibly `break` was
                    # intended; confirm before changing, results would differ.
                    continue

        if _mfnd is True:
            continue

        # Ordinary line: it belongs to the field currently being collected.
        if _k:
            _ls.append(_l)
            print(f"G Field: {_k} {_ls}") if _dout else None

    # Flush the last field of the document.
    if _k and _ls:
        _record[_k] = '\n'.join(_ls)    

    # Assigning by key means a field name that occurs twice in one document
    # keeps only the text collected for its last occurrence.
    #print(f"File: {_fp}\n\t\t{_record}")
    _data[_fp]["record"] = _record


Line: Patient ID
L New: patient id
Line: Patient Name
L New: patient name
Line: Age
L New: age
Line: yrs
M Reg: age None
M Reg: age None
M Reg: age None
M Reg: age None
G Field: age ['yrs']
Line: Date
L Field: age ['yrs']
L New: date
Line: 2-Oct-22
M Reg: date None
M Reg: date None
M Reg: date None
M Reg: date None
G Field: date ['2-Oct-22']
Line: Gender
L Field: date ['2-Oct-22']
L New: gender
Line: Ref Doctor
L New: ref doctor
Line: CT SCAN OF BRAIN (PLAIN)
M Reg: ref doctor None
M Reg: ref doctor <re.Match object; span=(0, 24), match='CT SCAN OF BRAIN (PLAIN)'>
M New: scan
M Reg: scan None
M Reg: scan None
Line: Suboptimal study due to motion blur. Scan reported with these limitations.
M Reg: scan None
M Reg: scan None
M Reg: scan None
M Reg: scan None
G Field: scan ['CT SCAN OF BRAIN (PLAIN)', 'Suboptimal study due to motion blur. Scan reported with these limitations.']
Line: OBSERVATIONS:
M Reg: scan None
M Reg: scan None
M Reg: scan None
M Reg: scan <re.Match object; span=(0, 13)

In [46]:
# Print everything stored for the document selected by `_uri`.
#
# Each entry of _data[_uri] is keyed by its kind ("contents", "lines",
# "record"), so the output format is chosen from the key `_dr`. The earlier
# version tested `"record" in _dv` / `"lines" in _dv` on the entry *value*,
# which never selected the record branch for a normal record, and that branch
# referenced an undefined name (`+fv`) and would have raised NameError.
for _dr, _dv in _data.get(_uri, {}).items():
    print(f"{_dr}: ==>")
    if _dr == "record":
        # extracted fields, one "name: value" per line
        for _fk, _fv in _dv.items():
            print(f"\t{_fk}:\t{_fv}")

    elif _dr == "lines":
        # cleaned text lines of the document
        for _lv in _dv:
            print(f"\t|{_lv}")

    else:
        # anything else (the raw extractor result) is printed as-is
        print(f"\t:{_dv}")


contents: ==>
	:{'type': 'strings', 'strings': 'Patient ID\n\n\t\n\n\tPatient Name\n\n\t\n\n\tAge\n\n\tyrs\n\n\tDate\n\n\t2-Oct-22\n\n\tGender\n\n\t\n\n\tRef Doctor\n\n\t\n\n\n\nCT SCAN OF BRAIN (PLAIN)\n\n\n\nSuboptimal study due to motion blur. Scan reported with these limitations. \n\n\n\nOBSERVATIONS:\n\nCerebral parenchyma shows normal density & gray white matter differentiation. Septum pellucidum and falx cerebri are seen in midline.\n\nBilateral basal ganglia and thalami are normal.\n\nCerebellar hemispheres and brain stem  \xa0are showing normal attenuation values.\n\nVentricular system with basal cisterns & cerebral cortical sulci including sylvian fissures are normally visualized. \n\nNo mass effect or midline shift.\n\nno obvious calcification, mass lesion or hemorrhage is seen.\n\nExtra-axial spaces are clear. No significant subdural / extradural collection, hematoma or mass lesion is observed.\n\nNo obvious skull fracture is seen. \n\nVisualized paranasal sinuses are norma

In [58]:
# Flatten the extracted records into CSV-ready rows.
#   rows   - one dict per document: its path under "file" plus its fields
#   fields - every column name seen in any row
fields = {"file"}
rows = []

for _dr, _dv in _data.items():
    if "record" not in _dv:
        # No record for this document: show what is available instead.
        if "lines" in _dv:
            for _lv in _dv.get("lines", []):
                print(f"\t|{_lv}")
        else:
            print(f"\t:{_dv}")
        continue

    row = {"file": _dr}
    for _fk, _fv in _dv["record"].items():
        # Merge alias field names (e.g. "ref. by") into their common column.
        _fk = _mappings.get(_fk) or _fk
        # The first value seen for a column wins; later duplicates are ignored.
        if _fk not in row:
            row[_fk] = _fv
        fields.add(_fk)
    rows.append(row)

print(fields)

{'patient id', 'gender', 'file', 'report date', 'scan', 'disclaimer', 'referral', 'date', 'impression', 'study date', 'notice', 'protocol', 'sex', 'patient name', 'advice', 'observations', 'age'}


In [59]:
import csv

# Write one CSV row per document.
# Column order: "file" first, then the remaining fields alphabetically.
# Iterating the `fields` set directly gave a different column order from run
# to run.
_columns = ["file"] + sorted(fields - {"file"})

# newline='' is required by the csv module. The encoding is set explicitly so
# the file does not depend on the platform's default code page.
with open('DICOMReports.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=_columns)
    dict_writer.writeheader()
    # Columns missing from a row are written as empty cells (DictWriter default).
    dict_writer.writerows(rows)