# Configuration

In [1]:
USE_COLAB=False

In [2]:
if USE_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')

In [3]:
import os

# Data locations. They are built with os.path.join so the same cell works on
# Windows, Linux and Colab. Both paths keep a trailing separator because the
# cells below build file names by plain string formatting ('{0}{1}').
BASE_PATH = os.path.join('..', 'data', '')
ORIGINAL_BASE_PATH = os.path.join('..', 'data', 'original', '')
DIR_SEPARATOR = os.sep

# Input files, read from ORIGINAL_BASE_PATH.
ICD10_BG_4SIGN = 'ICD10_bg_4sign.csv'
#ICD10_BG_3SIGN = 'ICD10_bg_3sign.csv'
ICD10_ALL_4SIGN = 'ICD10_4sign.csv'
#ICD10_ALL_3SIGN = 'ICD10_all_3sign.csv'

# Column names used throughout the notebook.
LABEL_COLUMN = 'ICD10'
TEXT_COLUMN = 'Text'
PRED_CLASS = 'pred_class'

# Create Classes Files

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas import DataFrame
import numpy as np

In [5]:
df_icd10 = pd.read_csv('{0}{1}'.format(ORIGINAL_BASE_PATH, ICD10_BG_4SIGN), header=None)
df_icd10.columns = [LABEL_COLUMN, TEXT_COLUMN]
df_icd10.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [6]:
# todo: reuse Boris' notebook
def standardize_code(x, digits = 3, vocab = None):
  """Normalise a raw code with fix_code and keep its first `digits` characters.

  `vocab` is accepted for signature compatibility but is not used.
  """
  return fix_code(x)[:digits]

In [7]:
# Single-character rewrites applied to a code: Cyrillic letters are replaced by
# the Latin letters used in ICD-10 codes, and spaces and '*' are dropped.
_CODE_CHAR_TABLE = str.maketrans({
    'А': 'A', 'Б': 'B', 'В': 'B', 'Е': 'E', 'К': 'K',
    'М': 'M', 'О': 'O', 'Р': 'P', 'Т': 'T',
    ' ': None, '*': None,
})


def fix_code(x):
  """Return `x` as an upper-case, Latin-letter code string without decoration.

  Steps: convert to str and upper-case; collapse every whitespace run
  (including newlines and tabs) to one space; remove the 'СЪЕДИНЕНИЕ ' prefix
  text; map Cyrillic letters to Latin and drop spaces and '*'; finally remove
  '.-' and '+'.
  """
  # The previous `.upper().replace(r'\n', ' ')` could never match: the raw
  # string is a backslash followed by a lower-case 'n', which cannot exist
  # after upper(), and it is not a newline character either. Normalising
  # whitespace here removes real newlines instead.
  code = ' '.join(str(x).upper().split())
  code = code.replace('СЪЕДИНЕНИЕ ', '')
  code = code.translate(_CODE_CHAR_TABLE)
  # Order matters: '.-' is stripped before '+', as in the original chain.
  return code.replace('.-', '').replace('+', '')

In [8]:
standardize_code('Съединение А71.1', digits=5)

'A71.1'

In [9]:
# 4-sign class list: each code is normalised and cut to at most 5 characters.
classes_4sign = (
    df_icd10[LABEL_COLUMN]
    .apply(standardize_code, digits=5)
    .to_frame(name=LABEL_COLUMN)
)

In [10]:
classes_4sign.shape

(10971, 1)

In [11]:
classes_4sign.to_csv('{0}classes-4.csv'.format(BASE_PATH), sep=',', index=False, header=False)

In [12]:
# 3-sign class list: each code is normalised and cut to its first 3 characters.
classes_3sign = (
    df_icd10[LABEL_COLUMN]
    .apply(standardize_code, digits=3)
    .to_frame(name=LABEL_COLUMN)
)

In [13]:
# Keep each 3-sign code once (first occurrence wins, original index retained).
classes_3sign = classes_3sign.drop_duplicates()
classes_3sign.shape

(2035, 1)

In [14]:
classes_3sign.to_csv('{0}classes-3.csv'.format(BASE_PATH), sep=',', index=False, header=False)

# Load Dataset, apply stemming and stop words

In [15]:
import os
from os import path

In [16]:
file_name = '{0}{1}'.format(ORIGINAL_BASE_PATH, ICD10_ALL_4SIGN)
df_4sign = pd.read_csv(file_name)

In [17]:
original_workding_dir = os.getcwd()

In [18]:
bulstem_py_dir = '{0}{1}bulstem-py'.format(original_workding_dir, DIR_SEPARATOR)
if not(path.exists(bulstem_py_dir)):
  os.chdir(original_workding_dir)
  !git clone https://github.com/mhardalov/bulstem-py.git
  !git pull

os.chdir(bulstem_py_dir)

!pip install .

os.chdir(original_workding_dir)

Processing c:\users\sylvia\documents\work\projects\slavic-multilingual-bert\notebooks\bulstem-py
Building wheels for collected packages: bulstem-py
  Building wheel for bulstem-py (setup.py): started
  Building wheel for bulstem-py (setup.py): finished with status 'done'
  Created wheel for bulstem-py: filename=bulstem_py-0.2.0-py3-none-any.whl size=828548 sha256=7384988dbd1ef3d2d7103cc36fdc49ef50c4a4db56acb696981622ff59447ca6
  Stored in directory: c:\users\sylvia\appdata\local\pip\cache\wheels\6d\42\bf\db10044fc194e652f47c33832ecf53edf71c3c74fc1c320314
Successfully built bulstem-py
Installing collected packages: bulstem-py
  Attempting uninstall: bulstem-py
    Found existing installation: bulstem-py 0.2.0
    Uninstalling bulstem-py-0.2.0:
      Successfully uninstalled bulstem-py-0.2.0
Successfully installed bulstem-py-0.2.0


In [19]:
from bulstem.stem import BulStemmer
stemmer = BulStemmer.from_file('stem-context-2', min_freq=2, left_context=1)

In [20]:
stemmer.stem('зъбния')

'зъбни'

In [21]:
# Stop words come from a header-less CSV; the words are in its first column.
stopwords = pd.read_csv(ORIGINAL_BASE_PATH + 'Custom-BTB-StopWordList.csv', header=None)
stopwordsList = stopwords[0].tolist()

In [22]:
import re

def apply_stemmer(x):
  """Clean a disease description and return its stemmed, stop-word-free form.

  The value is converted to str, ICD-10-looking codes are blanked out, every
  character other than digits, Latin/Cyrillic letters, '%' and space is
  replaced by a space, and the text is lower-cased and split on whitespace.
  Tokens found in the module-level `stopwordsList` are dropped; the rest are
  stemmed with the module-level `stemmer` and joined with single spaces.
  """
  disease = str(x)
  # Remove ICD-10 codes. The dot is escaped so only a literal '.' joins the
  # 3-sign part to the 4th sign; unescaped it matched any character, so text
  # such as 'A00 1' was removed as if it were one code.
  cleaned = re.sub(r"[A-ZА-Я][0-9]{2}\.[0-9]", " ", disease)
  cleaned = re.sub(r"[A-ZА-Я][0-9]{2}", " ", cleaned)
  cleaned = re.sub('[^0-9a-zA-ZА-Яа-я% ]+', ' ', cleaned)
  tokens = cleaned.lower().split()
  stems = [stemmer.stem(token) for token in tokens if token not in stopwordsList]
  return " ".join(stems)

In [23]:
apply_stemmer('монозомíа 2п15 п16.1')

'монозом 2п15 п16 1'

In [24]:
# Text values treated as invalid; they look like spreadsheet error markers
# (presumably export artefacts — confirm against the source file).
invalidValues = ['#value!', '#VALUE!', '#наме?']
# Keep only rows whose text is not exactly one of the values above
# (isin is an exact, case-sensitive comparison).
df_4sign_filtered = df_4sign[~df_4sign[TEXT_COLUMN].isin(invalidValues)];
#df_3sign_filtered = df_3sign[~df_3sign[TEXT_COLUMN].isin(invalidValues)];

In [25]:
# apply stemming and stop words
# Labels are carried over unchanged; every text goes through apply_stemmer.
stemmed_columns = {
    LABEL_COLUMN: df_4sign_filtered[LABEL_COLUMN],
    TEXT_COLUMN: df_4sign_filtered[TEXT_COLUMN].apply(apply_stemmer),
}
df_4sign_stemmed = pd.DataFrame(stemmed_columns)
print(df_4sign_stemmed.shape)

df_4sign_stemmed.head()

(377937, 2)


Unnamed: 0,ICD10,Text
0,A00,холер
1,A00.0,холер предизвика холер вибрион 01 биовар cholerae
2,A00.1,холер предизвика холер вибрион 01 биовар eltor
3,A00.9,холер неуточн
4,A01,тиф паратиф


In [26]:
# Drop rows that are exact duplicates (same label and same stemmed text).
df_4sign_stemmed = df_4sign_stemmed.drop_duplicates()
# Keep only rows whose stemmed text has at least 2 characters.
df_4sign_stemmed = df_4sign_stemmed[df_4sign_stemmed[TEXT_COLUMN].str.len() >= 2]
df_4sign_stemmed.shape

(368554, 2)

In [27]:
# validate all codes are valid classes, result should be empty
differences_4sign = np.setdiff1d(df_4sign_stemmed[LABEL_COLUMN], classes_4sign[LABEL_COLUMN])

print(differences_4sign)

[]


In [28]:
df_4sign_stemmed.to_csv('{0}dataset-stemmed-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)

# Collapse Low Resource Classes

In [29]:
groups_4sign = df_4sign_stemmed.groupby(LABEL_COLUMN).count().reset_index()
groups_4sign.columns= [LABEL_COLUMN, 'count']

In [30]:
groups_4sign['count'].sum()

368554

In [31]:
small_classes_4sign = groups_4sign[groups_4sign['count'] <= 5][LABEL_COLUMN]

In [32]:
# classes with number of representatives <= 5
small_classes_4sign.head(20)

24    A04.0
25    A04.1
26    A04.2
27    A04.3
29    A04.5
33    A04.9
38    A05.3
39    A05.4
40    A05.8
61    A08.0
63    A08.2
66    A08.5
70    A15.1
71    A15.2
72    A15.3
78    A15.9
80    A16.0
81    A16.1
90    A17.0
91    A17.1
Name: ICD10, dtype: object

In [33]:
df_4sign_stemmed[df_4sign_stemmed[LABEL_COLUMN] == 'A17.1']

Unnamed: 0,ICD10,Text
91,A17.1,менингеал туберкулом
10972,A17.1,туберкулом


In [34]:
print('number of small classes with 4 signs: ', len(small_classes_4sign[(small_classes_4sign.str.len() == 5)]))
print('number of small classes with 3 signs: ', len(small_classes_4sign[(small_classes_4sign.str.len() == 3)]))

number of small classes with 4 signs:  4534
number of small classes with 3 signs:  405


In [35]:
groups_4sign = df_4sign_stemmed.groupby(LABEL_COLUMN).count().reset_index()
groups_4sign.columns= [LABEL_COLUMN, 'count']
groups_4sign.to_csv('{0}class-groups-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)

In [36]:
small_classes_4sign_unique = small_classes_4sign.unique()

In [37]:
len(small_classes_4sign_unique)

4939

In [38]:
# trim the class label to 3 signs if it's in the small class list
def collapse_label(label):
    """Return the 3-character prefix of `label` if it is a listed small class.

    A label is trimmed only when it appears in `small_classes_4sign_unique`
    and is longer than 3 characters; any other label is returned unchanged.
    """
    should_trim = label in small_classes_4sign_unique and len(label) > 3
    return label[:3] if should_trim else label

In [39]:
collapse_label('A04.0')

'A04'

In [40]:
# collapse the small classes to their parent classes
# Texts are kept as they are; only the labels are passed through collapse_label.
collapsed_columns = {
    LABEL_COLUMN: df_4sign_stemmed[LABEL_COLUMN].apply(collapse_label),
    TEXT_COLUMN: df_4sign_stemmed[TEXT_COLUMN],
}
df_4sign_collapsed = pd.DataFrame(collapsed_columns)
print(df_4sign_collapsed.shape)

df_4sign_collapsed.head()

(368554, 2)


Unnamed: 0,ICD10,Text
0,A00,холер
1,A00.0,холер предизвика холер вибрион 01 биовар cholerae
2,A00.1,холер предизвика холер вибрион 01 биовар eltor
3,A00.9,холер неуточн
4,A01,тиф паратиф


In [41]:
# remove any duplicates
df_4sign_collapsed = df_4sign_collapsed.drop_duplicates()
df_4sign_collapsed.shape

(356342, 2)

# Augment Classes with Low Representation

In [42]:
# calculate number of small classes (they are 3-sign classes only and can't be collapsed)
groups_4sign_collapsed = df_4sign_collapsed.groupby(LABEL_COLUMN).count().reset_index()
groups_4sign_collapsed.columns= [LABEL_COLUMN, 'count']
small_classes_4sign_collapsed = groups_4sign_collapsed[groups_4sign_collapsed['count'] <= 5][LABEL_COLUMN]
len(small_classes_4sign_collapsed)

405

In [43]:
groups_4sign_collapsed.head()

Unnamed: 0,ICD10,count
0,A00,74
1,A00.0,24
2,A00.1,25
3,A00.9,24
4,A01,142


In [44]:
groups_4sign_collapsed.to_csv('{0}class-counts-collapsed-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)

In [45]:
small_classes_4sign_collapsed.head()

232    A64
319    A89
349    A99
381    B09
405    B24
Name: ICD10, dtype: object

In [46]:
df_small_classes_4sign_collapsed = df_4sign_collapsed[df_4sign_collapsed[LABEL_COLUMN].isin(small_classes_4sign_collapsed)]

In [47]:
df_small_classes_4sign_collapsed.shape

(608, 2)

In [48]:
df_small_classes_4sign_collapsed.to_csv('{0}small-classes-collapsed-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)

In [49]:
!pip install nlpaug numpy matplotlib python-dotenv setuptools requests



In [50]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as nacw

In [51]:
aug_word = nacw.RandomWordAug(action="swap")
aug_word.augment('това е проба на аугментация')

'това е на проба аугментация'

In [52]:
# map bg to en keyboard in order to use the nlp aug keyboard augmenter
# Bulgarian letter -> Latin key used to type it. Built once instead of on
# every call.
_BG_EN_KEYMAP = {
    'а':'a', 'б':'b', 'в':'w', 'г':'g', 'д':'d', 'е':'e', 'ж':'v',
    'з':'z', 'и':'i', 'й':'j', 'к':'k', 'л':'l', 'м':'m', 'н':'n', 'о':'o', 'п':'p', 'р':'r',
    'с':'s', 'т':'t', 'у':'u', 'ф':'f', 'х':'h', 'ц':'c', 'ч':'`', 'ш':'[', 'щ':']', 'ъ':'y',
    'ь':'x', 'ю':'\\', 'я':'q',
    'А':'A', 'Б':'B', 'В':'W', 'Г':'G', 'Д':'D', 'Е':'E', 'Ж':'V',
    'З':'Z', 'И':'I', 'Й':'J', 'К':'K', 'Л':'L', 'М':'M', 'Н':'N', 'О':'O', 'П':'P', 'Р':'R',
    # 'У' was previously written as a second 'Ъ' key, which the later 'Ъ':'Y'
    # entry silently overwrote, leaving upper-case 'У' and 'U' unmapped.
    'С':'S', 'Т':'T', 'У':'U', 'Ф':'F', 'Х':'H', 'Ц':'C', 'Ч':'~', 'Ш':'{', 'Щ':'}', 'Ъ':'Y',
    'ѝ':'X', 'Ю':'|', 'Я':'Q',
}
# Reverse lookup: Latin key -> Bulgarian letter.
_EN_BG_KEYMAP = {latin: cyrillic for cyrillic, latin in _BG_EN_KEYMAP.items()}


# map bg to en keyboard in order to use the nlp aug keyboard augmenter
def keyboard_mapping(word, direction):
    """Translate `word` character by character between Bulgarian letters and Latin keys.

    direction == 1 maps Bulgarian -> Latin; any other value maps
    Latin -> Bulgarian. Characters without a mapping are copied unchanged.
    """
    current_map = _BG_EN_KEYMAP if direction == 1 else _EN_BG_KEYMAP
    return ''.join(current_map.get(character, character) for character in word)

In [53]:
import random

def augment_text(text):
  """Return a randomly perturbed variant of `text` (not deterministic).

  The strategy is picked at random on every call:
    * single-token text: character swap, character delete, or a keyboard-typo
      augmentation (the text is mapped to Latin keys with keyboard_mapping,
      augmented, then mapped back);
    * multi-token text: either a word swap over the whole text, or a
      character swap inside one randomly picked token of at least 2
      characters (up to 10 picks are tried; if none qualifies, the word swap
      is used instead).
  One-character texts skip these steps. If the result still equals the input,
  a character-delete augmentation is applied as a last resort.

  NOTE(review): the code assumes `augment` returns a str, as the recorded
  outputs of this notebook show; confirm for the installed nlpaug version.
  """
  aug_char = [
      nac.RandomCharAug(action="swap", aug_char_min=1, min_char=1),
      nac.RandomCharAug(action="delete", aug_char_min=1, min_char=1),
      nac.KeyboardAug(aug_char_max=1, min_char=1, include_special_char=False, include_numeric=False)
  ]

  # Index 0 works on the whole text (word swap); index 1 is a character swap
  # that is applied to a single token.
  aug_word = [
      nacw.RandomWordAug(action="swap"),
      nac.RandomCharAug(action="swap", aug_char_min=1, min_char=1)
  ]
  tokens = text.split(' ')
  tokens_len = len(tokens)

  augmented_text = text

  if len(text) == 1:
    # Nothing to swap in a single character; the fallback below handles it.
    pass
  elif tokens_len == 1:
    aug_number = random.randint(0, 2)
    if aug_number == 2: #keyboard
      # KeyboardAug works on Latin keys, so translate there and back.
      transliterated = keyboard_mapping(text, 1)
      augmented_transliterated = aug_char[aug_number].augment(transliterated)
      augmented_text = keyboard_mapping(augmented_transliterated, 2)
    else:
      augmented_text = aug_char[aug_number].augment(text)
  else:
    aug_number = random.randrange(2)
    if aug_number == 1:
      random_word_number = random.randint(0, tokens_len-1)
      random_word = tokens[random_word_number]

      # Re-pick while the chosen token is too short to swap characters in.
      i = 1
      while len(random_word) < 2 and i < 10:
        random_word_number = random.randint(0, tokens_len-1)
        random_word = tokens[random_word_number]
        i += 1

      if len(random_word) >= 2:
        try:
          augmented_word = aug_word[aug_number].augment(random_word)
          tokens[random_word_number] = augmented_word
          augmented_text = ' '.join(tokens)
        except Exception:
          # Best effort: report the token and fall through to the fallback.
          # A bare `except:` here also swallowed KeyboardInterrupt, making the
          # long augmentation loop impossible to stop.
          print(random_word)
      else:
        augmented_text = aug_word[0].augment(text)
    else:
      augmented_text = aug_word[aug_number].augment(text)

  if augmented_text == text:
    augmented_text = aug_char[1].augment(augmented_text)

  return augmented_text

In [54]:
augment_text('Прехранван последи')

'Прехранван опслеид'

In [55]:
# augment the dataset by adding variations to the under-represented classes
def augment_df(full_df, small_class_df, augment_iterations):
    """Return `full_df` extended with augmented copies of `small_class_df`.

    For each of `augment_iterations` rounds, every text in `small_class_df`
    is passed through augment_text and paired with its original label. The
    augmented rounds are concatenated first, `full_df` last, with a fresh
    index, and exact duplicate rows are dropped.
    """
    augmented_frames = []

    for _ in range(augment_iterations):
        augmented = pd.DataFrame({
            TEXT_COLUMN: small_class_df[TEXT_COLUMN].apply(augment_text),
            LABEL_COLUMN: small_class_df[LABEL_COLUMN],
        })
        # Put the columns in full_df's order (extra ones last). Mismatched
        # order made pd.concat emit its sort warning and made the resulting
        # column order depend on the pandas version.
        column_order = list(full_df.columns) + [
            column for column in augmented.columns if column not in full_df.columns
        ]
        augmented_frames.append(augmented.reindex(columns=column_order))

    augmented_frames.append(full_df)
    combined = pd.concat(augmented_frames, ignore_index=True)
    return combined.drop_duplicates()

In [56]:
# return list of under-represented classes based on threshold
def check_small_classes(full_df, small_class_threshold):
    """Return the labels whose row count in `full_df` is <= `small_class_threshold`.

    Rows are grouped by LABEL_COLUMN and counted; the result is a Series of
    labels taken from the per-label count table.
    """
    label_counts = full_df.groupby(LABEL_COLUMN).count().reset_index()
    label_counts.columns = [LABEL_COLUMN, 'count']
    is_small = label_counts['count'] <= small_class_threshold
    return label_counts[is_small][LABEL_COLUMN]

In [57]:
# augment the dataset and ensure all classes have more than 5 unique representatives
# Start from the collapsed dataset and the small classes computed above.
new_augmented_df = df_4sign_collapsed
small_df = df_small_classes_4sign_collapsed
small_classes = small_classes_4sign_collapsed
i = 0;
# Each round selects the rows of the classes that are still small, adds 5
# augmented variants of each, and recomputes which classes have <= 5 rows.
# The loop stops when none remain or after 10 rounds.
# NOTE(review): augmentation is random and no seed is set in this notebook,
# so the resulting dataset differs from run to run.
while len(small_classes) > 0 and i < 10:
    small_df = new_augmented_df[new_augmented_df[LABEL_COLUMN].isin(small_classes)]
    new_augmented_df = augment_df(new_augmented_df, small_df, 5)
    small_classes = check_small_classes(new_augmented_df, 5)
    i += 1
    # Number of classes still at or below the threshold after this round.
    print(len(small_classes))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  del sys.path[0]


106
0


In [58]:
new_augmented_df.shape

(360916, 2)

In [59]:
new_augmented_df.to_csv('{0}dataset-augmented-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)

In [60]:
print('Number of unique classes in collapsed dataset:', len(new_augmented_df[LABEL_COLUMN].unique()))

Number of unique classes in collapsed dataset: 6436


In [61]:
df_classes_collapsed = pd.DataFrame({'ICD10': np.sort(new_augmented_df[LABEL_COLUMN].unique())})

In [62]:
df_classes_collapsed.to_csv('{0}classes-collapsed-4.csv'.format(BASE_PATH), sep=',', index=False, header=False)

In [63]:
class_list = df_classes_collapsed['ICD10'].tolist()

In [64]:
# Encode every ICD10 label as its position in class_list. A dict lookup
# replaces class_list.index(), which rescanned the whole list for each row.
class_to_index = {label: position for position, label in enumerate(class_list)}
new_augmented_df[PRED_CLASS] = new_augmented_df[LABEL_COLUMN].map(class_to_index)
new_augmented_df[TEXT_COLUMN] = new_augmented_df[TEXT_COLUMN].apply(lambda x:  str(x))

# From here on only the text and the integer class id are kept.
new_augmented_df = new_augmented_df.drop([LABEL_COLUMN], axis=1)

# Hold out 10% as the test set, then 11% of the remainder as the dev set.
# Both splits are stratified on the class id and use a fixed random_state.
train_dev_df, test_df = train_test_split(new_augmented_df,
                            stratify=new_augmented_df[PRED_CLASS],
                            random_state=42,
                            shuffle=True,
                            test_size=0.1)
train_df, dev_df = train_test_split(train_dev_df,
                            stratify=train_dev_df[PRED_CLASS],
                            random_state=42,
                            shuffle=True,
                            test_size=0.11)

train_df.to_csv('{0}train-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)
dev_df.to_csv('{0}dev-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)
test_df.to_csv('{0}test-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)

In [65]:
# Count the rows per class id in the final dataset and save the table.
groups_4sign_collapsed = new_augmented_df.groupby(PRED_CLASS).count().reset_index()
# NOTE(review): the first column holds the integer PRED_CLASS ids (the frame is
# grouped by PRED_CLASS), yet it is named LABEL_COLUMN ('ICD10') here — confirm
# that readers of this CSV expect ids rather than ICD10 codes.
groups_4sign_collapsed.columns= [LABEL_COLUMN, 'count']
groups_4sign_collapsed.to_csv('{0}class-groups-collapsed-4.csv'.format(BASE_PATH), sep=',', index=False, header=True)