In [1]:
# Override hash function

import hashlib

def hash(s : str) -> str :
    """Return the SHA-256 hex digest of ``s`` encoded as UTF-8.

    This deliberately shadows the built-in ``hash`` for the rest of the
    notebook, so any later ``hash(...)`` call yields a hex string.
    """
    return hashlib.sha256(s.encode('utf-8')).hexdigest()

### Import Library

In [None]:
import docx
from docx.table import Table
from docx.document import Document
from docx2python import docx2python
import numpy as np
import pandas as pd
from IPython.display import display
from glob import glob
import re
from typing import Dict , List

In [None]:
# Convert a table object from the docx library into a DataFrame
def doc_table_to_df(table : Table) -> pd.DataFrame:
    """Build a DataFrame holding the text of every cell of ``table``.

    Each entry of ``table.rows`` becomes one DataFrame row, made of the
    ``.text`` of each entry in that row's ``.cells``. No header row is
    inferred, so columns are labelled 0, 1, 2, ...
    """
    text_rows = []
    for row in table.rows:
        text_rows.append([cell.text for cell in row.cells])
    return pd.DataFrame(text_rows)

In [None]:
# Column order of the per-report metadata table (one row per .docx report).
feature = [
    # report / patient identification
    "hash_id", "hn",
    # dates
    "sample_collection_date", "date_of_submission",
    # patient and sample description
    "sex", "species", "submitted_sample", "collect_method",
    # laboratory identification
    "vitek_id", "bact_species",
    # report sign-off
    "vet_read", "vet_recommend", "report_issued_date",
    # source file
    "filename",
]

In [None]:
# Report files for 2020-2021.
# NOTE: glob() is called without recursive=True, so "**" behaves like "*":
# only .docx files exactly one directory below ./data2020_2021 are matched.
# glob() returns matches in arbitrary order; sorting keeps the row order of
# the exported CSV files reproducible between runs and machines.
filenames = sorted(glob("./data2020_2021/**/*.docx"))


![doc](../img/docx_example.png)

### Transform Third Format [2020 - Present]
![doc](../img/third_format.png)

In [None]:
class TransformThirdFormat:
    """Text parsers for the "third format" report layout.

    Every method is static and works on the plain text extracted from a
    .docx report (or, for ``read_paragraph``, on the path of that file).
    """

    # Field labels searched for in the report text. The line that follows a
    # label is taken as that field's value (see transform_first_paragraph).
    detail_para_keys = ["Date of submission", "Date of sample collection", "Owner", "Patient", "Species", "Breed",
                        "Sex", "Birth date", "Submitted sample", "Methods of collection", "Methods (ID/AST)",
                        "Isolate No.", "identification"
                        ]

    @staticmethod
    def hn_check(paragraphs: str):
        """Extract the HN value from the line after the AST results heading.

        Looks at the line immediately following the first occurrence of
        "Results of Antimicrobial Susceptibility Testing":

        * "HN" and "Clinician" both present -> text between them
        * only "HN" present                 -> text after "HN"
        * only "Clinician" present          -> text before "Clinician"
        * neither present                   -> the whole (stripped) line

        Returns ``np.nan`` when the heading is missing or is the last line.
        """
        pos = paragraphs.find("Results of Antimicrobial Susceptibility Testing")
        if pos == -1:
            return np.nan
        lines = paragraphs[pos:].split('\n')
        if len(lines) < 2:
            # Heading is the final line of the text: there is no line to read.
            return np.nan
        line = lines[1].strip()
        hn = line.find("HN")
        clinic = line.find("Clinician")
        if hn >= 0 and clinic >= 0:
            return line[hn + 2:clinic].strip()
        elif hn >= 0:
            # Fixed: the original returned the bound method `.strip` here
            # instead of calling it.
            return line[hn + 2:].strip()
        elif clinic >= 0:
            return line[:clinic].strip()
        else:
            return line

    @staticmethod
    def transform_first_paragraph(paragraphs: str) -> Dict:
        """Collect ``label -> value`` pairs for the labels in ``detail_para_keys``.

        The text is scanned line by line. A line containing a not-yet-seen
        label arms the parser; the next line is stored (stripped) as that
        label's value, unless that next line is itself exactly another unseen
        label, in which case the pending label gets no value. Each label is
        consumed at most once. The result also carries "HN" from ``hn_check``.
        """
        d = {"HN": TransformThirdFormat.hn_check(paragraphs)}
        check_keys = TransformThirdFormat.detail_para_keys[:]
        findkey = False
        key = None
        for para in paragraphs.split("\n"):
            # A pending label followed directly by another label has no value.
            if findkey and para in check_keys:
                findkey = False
            if findkey:
                d[key] = para.strip()
                findkey = False
            else:
                matched = [k for k in check_keys if k in para]
                if matched:
                    # When several labels occur on one line, the last one in
                    # list order wins and all of them are consumed.
                    key = matched[-1]
                    for k in matched:
                        check_keys.remove(k)
                    findkey = True

        return d

    @staticmethod
    def vet_transform(paragraphs: str) -> Dict:
        """Extract the sign-off details around the last reviewer marker line.

        Returns a dict with:

        * "date"          -> remainder of the marker's line (after one
                             separator character), stripped
        * "vet_read"      -> first comma-separated name on the preceding line
        * "vet_recommend" -> second comma-separated name, only if present

        Returns an empty dict when the marker does not occur.
        """
        marker = "ผู้ทำการอ่านผลและแนะนำ"
        vet_detail = {}
        position = paragraphs.rfind(marker)
        if position == -1:
            return vet_detail
        # Drop the character just before the marker (normally the newline) and
        # take the last remaining line. max() guards position == 0, where the
        # original slice [: -1] would have grabbed almost the whole text.
        vet = paragraphs[:max(position - 1, 0)].split("\n")[-1]
        vet_detail["date"] = paragraphs[position + len(marker) + 1:].split("\n")[0].strip()
        names = vet.split(",")
        vet_detail["vet_read"] = names[0].strip()
        if len(names) == 1:
            # No second name found: dump the text so the report can be checked by hand.
            print(paragraphs)
        else:
            vet_detail["vet_recommend"] = names[1].strip()
        return vet_detail

    @staticmethod
    def _vitek_id_from_filename(filename: str):
        """Return the second-to-last "_"-separated token of the file's base name.

        Both "/" and "\\" are treated as path separators so the result does
        not depend on the operating system that produced the path. Returns
        ``np.nan`` when the base name contains no underscore.
        """
        base_name = re.split(r'[\\/]', filename)[-1]
        parts = base_name.split('_')
        return parts[-2] if len(parts) >= 2 else np.nan

    @staticmethod
    def read_paragraph(filename: str) -> Dict:
        """Read one .docx report and return its metadata as a flat dict.

        Fields missing from the report are ``np.nan``.

        NOTE: "hash_id" uses whatever ``hash`` is in scope; the notebook is
        expected to have replaced it with its SHA-256 based version.
        """
        # Collapse runs of blank / whitespace-only lines into single newlines.
        doc = re.sub(r'(\n\s*)+\n+', '\n', docx2python(filename).text)
        first_para = TransformThirdFormat.transform_first_paragraph(doc)
        vet_para = TransformThirdFormat.vet_transform(doc)
        return {
            "hash_id": hash(filename),
            "hn": first_para.get("HN", np.nan),
            "sample_collection_date": first_para.get("Date of sample collection", np.nan),
            "date_of_submission": first_para.get("Date of submission", np.nan),
            "sex": first_para.get("Sex", np.nan),
            "species": first_para.get("Species", np.nan),
            "submitted_sample": first_para.get("Submitted sample", np.nan),
            "collect_method": first_para.get("Methods of collection", np.nan),
            "vitek_id": TransformThirdFormat._vitek_id_from_filename(filename),
            "bact_species": first_para.get("identification", np.nan),
            "vet_read": vet_para.get("vet_read", np.nan),
            "vet_recommend": vet_para.get("vet_recommend", np.nan),
            "report_issued_date": vet_para.get("date", np.nan),
            "filename": filename
        }

### Text Scraper

In [None]:
# Parse every report into one metadata record. Files whose path contains
# "รายงานรับตัวอย่าง" or "ระหว่างดำเนินการ" are skipped.
# The rows are collected first and the DataFrame is built once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
report_records = [
    TransformThirdFormat.read_paragraph(filename)
    for filename in filenames
    if "รายงานรับตัวอย่าง" not in filename and "ระหว่างดำเนินการ" not in filename
]
df = pd.DataFrame(report_records, columns=feature)


In [None]:
# Persist the per-report metadata, then reload it from disk so that the
# following cells work on exactly what the CSV contains.
# NOTE(review): read_csv re-infers column dtypes, so values that look numeric
# may no longer be strings after the reload — confirm this is intended.
df.to_csv("./Dataset2020-2021.csv",index=False)
df = pd.read_csv("./Dataset2020-2021.csv")

### Part 2

In [None]:
feature_part_2 = ["hash_id","Antimicrobials","MIC","S/I/R"]

# filename -> hash_id lookup, built once instead of re-indexing df per table.
hash_id_by_filename = dict(zip(df["filename"], df["hash_id"]))

# Collect one record per antimicrobial row; the DataFrame is built once at the
# end (DataFrame.append was removed in pandas 2.0).
ast_records = []
for filename in filenames :
    hash_id = hash_id_by_filename.get(filename)
    if hash_id is None:
        # File has no metadata row in df (it was skipped when df was built),
        # so there is no hash_id to attach its results to.
        continue
    doc = docx.Document(filename)
    for table in doc.tables:
        data_table = doc_table_to_df(table)
        # Only tables whose top-left cell is "Antimicrobials" hold AST results.
        if data_table.empty or data_table.iat[0, 0] != "Antimicrobials":
            continue
        if len(data_table.columns) != 3:
            # Wider layout: discard column 2 and use column 3 as the S/I/R column.
            data_table = data_table.drop(columns=2).rename(columns={3 : 2})
        for name, mic, sir in zip(data_table[0], data_table[1], data_table[2]):
            # Skip the header row and rows with an empty drug name.
            if name != "Antimicrobials" and name != "":
                ast_records.append({
                    "hash_id" : hash_id,
                    "Antimicrobials" : name,
                    "MIC" : mic,
                    "S/I/R" : sir
                })

df_2 = pd.DataFrame(ast_records, columns=feature_part_2)

In [None]:
# Write the AST result rows (one per antimicrobial per report) to CSV, without the index column.
df_2.to_csv("./ResultAST2020-2021.csv",index=False)

### Part 3

In [None]:
# Reload the per-report metadata written earlier; Part 3 uses it to look up hash_id by filename.
df = pd.read_csv("./Dataset2020-2021.csv")

In [None]:
# Column order of the Part 3 answer tables.
feature_part_3 = [
    "hash_id",
    "Answer_Antimicrobials",
    "Paragraph",
    "filename",
]

In [None]:
def read_answer(filename : str, unit : str = "mg") :
    """Extract candidate drug names from the dosing lines of a report.

    A line is a dosing line when it contains ``"--\\t"`` and, ignoring spaces
    and case, ``"<unit>/kg"``. Its whitespace-separated tokens are scanned
    left to right until the first token containing ``unit`` (the dose); every
    token before that which starts with an ASCII letter is kept, reduced to
    its alphabetic characters.

    Args:
        filename: path of the .docx report.
        unit: dose unit to look for; the default "mg" matches "mg/kg" lines.

    Returns:
        List of ``(name, line)`` tuples, ``line`` being the full dosing line.
    """
    paragraphs =  re.sub(r'(\n\s*)+\n+', '\n',docx2python(filename).text)
    unit = unit.lower()
    dose_marker = unit + "/kg"
    li = []
    for para in paragraphs.split('\n'):
        if "--\t" not in para or dose_marker not in para.replace(' ','').lower():
            continue
        for token in para.split():
            # Case-insensitive, like the line test above; otherwise "Mg/kg"
            # would not stop the scan and route/frequency tokens would be
            # collected as drug names.
            if unit in token.lower():
                break
            first = token[0]
            if ('A' <= first <= 'Z') or ('a' <= first <= 'z'):
                li.append(("".join(ch for ch in token if ch.isalpha()), para))
    return li

In [None]:
def read_answer_2(filename : str) :
    """Extract candidate drug names from the "units/kg" dosing lines of a report.

    A line is a dosing line when it contains ``"--\\t"`` and, ignoring spaces
    and case, ``"units/kg"``. Its whitespace-separated tokens are scanned left
    to right until the first token containing "units" (the dose); every token
    before that which starts with an ASCII letter is kept, reduced to its
    alphabetic characters.

    Returns:
        List of ``(name, line)`` tuples, ``line`` being the full dosing line.
    """
    paragraphs =  re.sub(r'(\n\s*)+\n+', '\n',docx2python(filename).text)
    li = []
    for para in paragraphs.split('\n'):
        if "--\t" not in para or 'units/kg' not in para.replace(' ','').lower():
            continue
        for token in para.split():
            # Case-insensitive, like the line test above; otherwise "Units/kg"
            # would not stop the scan and later tokens would be collected as
            # drug names.
            if "units" in token.lower():
                break
            first = token[0]
            if ('A' <= first <= 'Z') or ('a' <= first <= 'z'):
                li.append(("".join(ch for ch in token if ch.isalpha()), para))
    return li

In [None]:
# filename -> hash_id lookup, built once instead of re-indexing df per answer.
hash_id_by_filename = dict(zip(df["filename"], df["hash_id"]))

# Collect the "units/kg" answers of every report that has more than one table.
# Rows are gathered in a list and the DataFrame is built once at the end
# (DataFrame.append was removed in pandas 2.0).
units_answer_records = []
for filename in filenames:
    if filename not in hash_id_by_filename:
        # No metadata row for this file in df, hence no hash_id to attach.
        continue
    doc = docx.Document(filename)
    if len(doc.tables) <= 1:
        continue
    for ans in read_answer_2(filename):
        units_answer_records.append({
            "hash_id" : hash_id_by_filename[filename],
            "Answer_Antimicrobials" : ans[0],
            "Paragraph" : ans[1],
            "filename" : filename
        })

df_4 = pd.DataFrame(units_answer_records, columns=feature_part_3)

In [None]:
# Show the "units/kg" answers for manual inspection.
df_4

`df_4` ไม่มียาที่ต้องการ จึงไม่นำไปใช้ต่อ (`df_4` contains none of the drugs of interest, so it is not used further.)

In [None]:
# Build the "mg/kg" answer table, then remove unwanted tokens.
# Both steps live in one cell, in this order, because the filter needs df_3 to
# exist: with the filter in an earlier cell a fresh Restart & Run All fails
# with NameError.

# filename -> hash_id lookup, built once instead of re-indexing df per answer.
hash_id_by_filename = dict(zip(df["filename"], df["hash_id"]))

# Rows are gathered in a list and the DataFrame is built once at the end
# (DataFrame.append was removed in pandas 2.0).
answer_records = []
for filename in filenames:
    if filename not in hash_id_by_filename:
        # No metadata row for this file in df, hence no hash_id to attach.
        continue
    doc = docx.Document(filename)
    if len(doc.tables) <= 1:
        continue
    for ans in read_answer(filename):
        answer_records.append({
            "hash_id" : hash_id_by_filename[filename],
            "Answer_Antimicrobials" : ans[0],
            "Paragraph" : ans[1],
            "filename" : filename
        })

df_3 = pd.DataFrame(answer_records, columns=feature_part_3)

# Extracted tokens that are discarded. The comparison is exact and
# case-sensitive, on the raw tokens as extracted.
unwanted_answers = [
    "acid",
    "amoxicillinclavulanic",
    "mic",
    "enrofloxacin",
    "sulfamethoxazoletrimethoprim",
]
df_3 = df_3.drop(index=df_3.index[df_3["Answer_Antimicrobials"].isin(unwanted_answers)])

In [None]:
# Show the "mg/kg" answers for manual inspection.
df_3

In [None]:
# Prefix -> canonical drug name. Keys are matched case-sensitively as prefixes
# of an extracted token, in the order written here, so a shorter prefix listed
# earlier takes precedence over a longer one listed later.
# Misspelt keys such as 'Cephalecxin' are presumably spellings found in the
# reports — verify before "correcting" them.
# The duplicated 'Rifampi' entry was removed; the resulting dict is unchanged.
anti_2 = {
    'Amikacin' : 'amikacin',
    'Amox' : 'amoxicillin/clavulanic acid',
    'Azithromycin' : 'azithromycin',
    'Cefalexin' : 'cefalexin',
    'Cefazolin' : 'cefazolin',
    'Cefixime' : 'cefixime',
    'Cefovecin' : 'cefovecin',
    'Ceftriaxone' : 'ceftriaxone',
    'Cephalecxin' : 'cefalexin',
    'Cephalexin' : 'cefalexin',
    'Cephazolin' : 'cefazolin',
    'Clindamycin' : 'clindamycin',
    'Doxycycline' : 'doxycycline',
    'Enrofloxacin' : 'enrofloxacin',
    'Fluconazole' : 'fluconazole',
    'Fosfomycin' : 'fosfomycin',
    'Gentamicin' : 'gentamicin',
    'Imipenem' : 'imipenem',
    'Marbo' : 'marbofloxacin',
    'Metronidazole' : 'metronidazole',
    'Nitrofurantoin' : 'nitrofurantoin',
    'Piperacillintazobactam' : 'piperacillin/tazobactam',
    'Pradofloxacin' : 'pradofloxacin',
    'Rifampi' : 'rifampicin',
    'Sulfamethoxazole' : 'trimethoprim/sulfamethoxazole',
    'Sulfamethoxazoletrimethoprim' : 'trimethoprim/sulfamethoxazole',
    'Tetracycline' : 'tetracycline',
    'Vancomycin' : 'vancomycin',
    'cefovecin' : 'cefovecin',
    'cephalexin' : 'cefalexin',
    'imipenem' : 'imipenem',
    'marbofloxacin' : 'marbofloxacin',
    'nitrofurantoin' : 'nitrofurantoin',
    'sulfamethoxazoletrimethoprim' : 'trimethoprim/sulfamethoxazole',
}

In [None]:
def anti_change(value : str, anti : Dict) -> str:
    """Map ``value`` to a canonical name using a prefix table.

    ``anti`` maps prefixes to replacement names. The entries are tried in the
    dict's iteration order and the replacement of the first prefix that
    ``value`` starts with is returned; ``value`` itself is returned when no
    prefix matches.
    """
    replacements = (
        canonical
        for prefix, canonical in anti.items()
        if value.startswith(prefix)
    )
    return next(replacements, value)

# Replace every extracted token by its canonical drug name from anti_2;
# tokens that match no prefix are kept as they are.
df_3["Answer_Antimicrobials"] = df_3["Answer_Antimicrobials"].map(
    lambda token: anti_change(token, anti_2)
)


In [None]:
# Write the answers together with their source paragraph and filename, without the index column.
df_3.to_csv("Answer_with_paragraph2020-2021.csv",index=False)

In [None]:
# Write the distinct (hash_id, Answer_Antimicrobials) pairs only, without the index column.
df_3[['hash_id','Answer_Antimicrobials']].drop_duplicates().to_csv("Answer2020-2021.csv",index=False)