# Configuration

In [1]:
USE_COLAB=False

In [2]:
if USE_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')

In [25]:
BASE_PATH = '..\\data\\'
ORIGINAL_BASE_PATH = '..\\data\\original\\'
DIR_SEPARATOR = '\\' # /

ICD10_BG_4SIGN = 'ICD10_bg_4sign.csv'
ICD10_BG_3SIGN = 'ICD10_bg_3sign.csv'
ICD10_ALL_4SIGN = 'ICD10_all_4sign.csv'
ICD10_ALL_3SIGN = 'ICD10_all_3sign.csv'

LABEL_COLUMN = 'ICD10'
TEXT_COLUMN = 'Text'

# Create Classes Files

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas import DataFrame
import numpy as np

In [5]:
df_icd10 = pd.read_csv('{0}{1}'.format(ORIGINAL_BASE_PATH, ICD10_BG_4SIGN), header=None)
df_icd10.columns = [LABEL_COLUMN, TEXT_COLUMN]
df_icd10.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [6]:
# todo: reuse Boris' notebook
def standardize_code(x, digits = 3, vocab = None):
  code = fix_code(x);
  code = code[0:digits]
    
  return code

In [7]:
def fix_code(x):
  code = str(x).upper().replace(r'\n',' ');
  code = code.replace('СЪЕДИНЕНИЕ ', '') \
  .replace('А','A') \
  .replace('Б','B') \
  .replace('В','B') \
  .replace('Е','E') \
  .replace('К','K') \
  .replace('М','M') \
  .replace('О','O') \
  .replace('Р','P') \
  .replace('Т','T') \
  .replace(' ','') \
  .replace('*','') \
  .replace('.-','') \
  .replace('+', '')
  
  return code

In [8]:
standardize_code('Съединение А71.1', digits=5)

'A71.1'

In [9]:
classes_4sign = pd.DataFrame({
            LABEL_COLUMN:df_icd10[LABEL_COLUMN].apply(lambda x: standardize_code(x, 5))
            })


In [10]:
classes_4sign.shape

(10971, 1)

In [11]:
classes_4sign.to_csv('{0}classes-4.csv'.format(BASE_PATH), sep=',', index=False, header=False)

In [12]:
classes_3sign = pd.DataFrame({
            LABEL_COLUMN:df_icd10[LABEL_COLUMN].apply(lambda x: standardize_code(x, 3))
            })

In [13]:
classes_3sign.shape

(10971, 1)

In [14]:
classes_3sign.to_csv('{0}classes-3.csv'.format(BASE_PATH), sep=',', index=False, header=False)

# Load Dataset, apply stemming and stop words

In [15]:
import os
from os import path

In [16]:
file_name = '{0}{1}'.format(ORIGINAL_BASE_PATH, ICD10_ALL_4SIGN)
df_4sign = pd.read_csv(file_name)

In [17]:
file_name = '{0}{1}'.format(ORIGINAL_BASE_PATH, ICD10_ALL_3SIGN)
df_3sign = pd.read_csv(file_name)

In [23]:
original_workding_dir = os.getcwd()

In [26]:
bulstem_py_dir = '{0}{1}bulstem-py'.format(original_workding_dir, DIR_SEPARATOR)
if not(path.exists(bulstem_py_dir)):
  os.chdir(original_workding_dir)
  !git clone https://github.com/mhardalov/bulstem-py.git
  !git pull

os.chdir(bulstem_py_dir)

!pip install .

os.chdir(original_workding_dir)

Cloning into 'bulstem-py'...
fatal: not a git repository (or any of the parent directories): .git


Processing g:\projects\python\icd10_bert\notebooks\bulstem-py
Building wheels for collected packages: bulstem-py
  Building wheel for bulstem-py (setup.py): started
  Building wheel for bulstem-py (setup.py): finished with status 'done'
  Created wheel for bulstem-py: filename=bulstem_py-0.2.0-py3-none-any.whl size=828509 sha256=e86402fb88871a71490d1ade91478c673988fd6e823727206b91556ef5ca9aa3
  Stored in directory: c:\users\sylvia.vassileva\appdata\local\pip\cache\wheels\25\ae\77\03e7bb6939849d34c5b825db0940700cd23e5aaa711e48d092
Successfully built bulstem-py
Installing collected packages: bulstem-py
  Attempting uninstall: bulstem-py
    Found existing installation: bulstem-py 0.2.0
    Uninstalling bulstem-py-0.2.0:
      Successfully uninstalled bulstem-py-0.2.0
Successfully installed bulstem-py-0.2.0


In [27]:
from bulstem.stem import BulStemmer
stemmer = BulStemmer.from_file('stem-context-2', min_freq=2, left_context=1)

In [28]:
stemmer.stem('зъбния')

'зъбни'

In [36]:
stopwords = pd.read_csv('{0}Custom-BTB-StopWordList.csv'.format(ORIGINAL_BASE_PATH), header=None)
stopwordsList = list(stopwords[0])

In [37]:
import re

def apply_stemmer(x):
  disease = str(x)
  # remove ICD-10 codes
  cleaned = re.sub(r"[A-ZА-Я][0-9]{2}.[0-9]", " ",  disease);
  cleaned = re.sub(r"[A-ZА-Я][0-9]{2}", " ",  cleaned);
  cleaned = re.sub('[^0-9a-zA-ZА-Яа-я% ]+', ' ', cleaned)
  cleaned = cleaned.lower()
  tokens = cleaned.split();
  stems = [];
  for token in tokens:
    if(not(token in stopwordsList)):
      stem = stemmer.stem(token);
      stems.append(stem);
  result = " ".join(stems);
  return result;

In [38]:
apply_stemmer('монозомíа 2п15 п16.1')

'монозом 2п15 п16 1'

In [39]:
invalidValues = ['#value!', '#VALUE!', '#наме?']
df_4sign_filtered = df_4sign[~df_4sign[TEXT_COLUMN].isin(invalidValues)];
df_3sign_filtered = df_3sign[~df_3sign[TEXT_COLUMN].isin(invalidValues)];

In [40]:
df_4sign_stemmed = pd.DataFrame({
            LABEL_COLUMN:df_4sign_filtered[LABEL_COLUMN],
            TEXT_COLUMN:df_4sign_filtered[TEXT_COLUMN].apply(lambda x: apply_stemmer(x))
            })
print(df_4sign_stemmed.shape)

df_4sign_stemmed.head()

(377937, 2)


Unnamed: 0,ICD10,Text
0,A00,холер
1,A00.0,холер предизвика холер вибрион 01 биовар cholerae
2,A00.1,холер предизвика холер вибрион 01 биовар eltor
3,A00.9,холер неуточн
4,A01,тиф паратиф


In [41]:
df_3sign_stemmed = pd.DataFrame({
            LABEL_COLUMN:df_3sign_filtered[LABEL_COLUMN],
            TEXT_COLUMN:df_3sign_filtered[TEXT_COLUMN].apply(lambda x: apply_stemmer(x))
            })
print(df_3sign_stemmed.shape)

df_3sign_stemmed.head()

(189253, 2)


Unnamed: 0,ICD10,Text
0,A00,холер
1,A00,холер предизвика холер вибрион 01 биовар cholerae
2,A00,холер предизвика холер вибрион 01 биовар eltor
3,A00,холер неуточн
4,A01,тиф паратиф


In [42]:
# validate all codes are valid classes, result should be empty
differences_4sign = np.setdiff1d(df_4sign_stemmed[LABEL_COLUMN], classes_4sign[LABEL_COLUMN])

print(differences_4sign)

[]


In [43]:
# validate all codes are valid classes, result should be empty
differences_3sign = np.setdiff1d(df_3sign_stemmed[LABEL_COLUMN], classes_3sign[LABEL_COLUMN])

print(differences_3sign)

[]


# Augment Classes with Low Representation

In [44]:
groups_4sign = df_4sign_stemmed.groupby(LABEL_COLUMN).count().reset_index()
small_classes_4sign = groups_4sign[groups_4sign[TEXT_COLUMN] < 3][LABEL_COLUMN]
len(small_classes_4sign)

2495

In [45]:
small_classes_4sign

38       A05.3
39       A05.4
70       A15.1
71       A15.2
72       A15.3
80       A16.0
81       A16.1
90       A17.0
91       A17.1
93       A17.9
105      A19.0
107      A19.2
114      A20.3
149      A25.9
172      A30.8
213      A40.2
215      A40.8
218      A41.0
221      A41.3
241      A44.8
252      A49.0
254      A49.2
255      A49.3
301      A56.2
314      A60.9
317      A63.8
399      A83.6
408      A85.0
409      A85.1
423        A89
         ...  
10835    Z85.4
10836    Z85.5
10837    Z85.6
10840    Z85.9
10849    Z86.7
10851    Z87.0
10852    Z87.1
10857    Z87.6
10861    Z88.0
10863    Z88.2
10864    Z88.3
10866    Z88.5
10867    Z88.6
10893    Z91.0
10902    Z92.0
10906    Z92.4
10908    Z92.8
10909    Z92.9
10915    Z93.4
10921    Z94.0
10922    Z94.1
10923    Z94.2
10925    Z94.4
10926    Z94.5
10927    Z94.6
10928    Z94.7
10965    Z99.0
10966    Z99.1
10968    Z99.3
10970    Z99.9
Name: ICD10, Length: 2495, dtype: object

In [46]:
df_small_classes_4sign = df_4sign_stemmed[df_4sign_stemmed[LABEL_COLUMN].isin(small_classes_4sign)]

In [47]:
df_small_classes_4sign.to_csv('{0}small_classes_4sign.csv'.format(BASE_PATH), sep=',', index=False, header=True)

In [48]:
groups_3sign = df_3sign_stemmed.groupby(LABEL_COLUMN).count().reset_index()
small_classes_3sign = groups_3sign[groups_3sign[TEXT_COLUMN] < 3][LABEL_COLUMN]
len(small_classes_3sign)

343

In [49]:
small_classes_3sign

76      A89
85      A99
163     B91
164     B92
257     C97
332     D77
393     E68
412     E90
492     G01
498     G07
507     G22
511     G26
621     H82
761     J91
851     L14
868     L45
871     L52
899     L86
1144    O96
1193    P75
1367    R81
1369    R83
1377    R92
1400    S16
1491    T07
1492    T08
1494    T10
1496    T12
1539    T55
1548    T64
       ... 
1885    Y25
1886    Y26
1887    Y27
1888    Y28
1889    Y29
1890    Y30
1891    Y31
1892    Y32
1893    Y33
1894    Y34
1923    Y66
1924    Y69
1925    Y70
1926    Y71
1927    Y72
1928    Y73
1929    Y74
1930    Y75
1931    Y76
1932    Y77
1933    Y78
1934    Y79
1935    Y80
1936    Y81
1937    Y82
1941    Y86
1947    Y95
1948    Y96
1949    Y97
1950    Y98
Name: ICD10, Length: 343, dtype: object

In [50]:
df_small_classes_3sign = df_3sign_stemmed[df_3sign_stemmed[LABEL_COLUMN].isin(small_classes_3sign)]

In [51]:
df_small_classes_3sign.to_csv('{0}small_classes_3sign.csv'.format(BASE_PATH), sep=',', index=False, header=True)

In [52]:
!pip install nlpaug numpy matplotlib python-dotenv setuptools requests



In [53]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as nacw

In [54]:
aug_word = nacw.RandomWordAug(action="swap")
aug_word.augment('това е проба на аугментация')

'е това проба на аугментация'

In [55]:
aug_char = nac.RandomCharAug(action="swap")
aug_word = nacw.RandomWordAug(action="swap")

def augment_text(text):
  tokens = text.split(' ')
  tokens_len = len(tokens)
  if tokens_len == 1:
    augmented_text = aug_char.augment(text)
  else:    
    augmented_text = aug_word.augment(text)

  return augmented_text

In [56]:
dfs_augmented_4sign = []
augment_iterations = 2

for i in range(augment_iterations):
  df_augmented = pd.DataFrame({
            TEXT_COLUMN:df_small_classes_4sign[TEXT_COLUMN].apply(lambda x: augment_text(x)),
            LABEL_COLUMN:df_small_classes_4sign[LABEL_COLUMN]
            })
  dfs_augmented_4sign.append(df_augmented)

In [57]:
dfs_augmented_4sign.append(df_4sign_stemmed)
df_augmented_4sign = pd.concat(dfs_augmented_4sign, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [58]:
df_augmented_4sign.to_csv('{0}dataset-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)

In [59]:
dfs_augmented_3sign = []
augment_iterations = 2

for i in range(augment_iterations):
  df_augmented = pd.DataFrame({
            TEXT_COLUMN:df_small_classes_3sign[TEXT_COLUMN].apply(lambda x: augment_text(x)),
            LABEL_COLUMN:df_small_classes_3sign[LABEL_COLUMN]
            })
  dfs_augmented_3sign.append(df_augmented)

In [60]:
dfs_augmented_3sign.append(df_3sign_stemmed)
df_augmented_3sign = pd.concat(dfs_augmented_3sign, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [61]:
df_augmented_3sign.to_csv('{0}dataset-3.csv'.format(BASE_PATH), sep=',', index=False, header=True)