In [1]:
BASE_PATH = '..\\data\\'
ORIGINAL_BASE_PATH = '..\\data\\original\\'
DIR_SEPARATOR = '\\' # /

LABEL_COLUMN = 'ICD10'
TEXT_COLUMN = 'Text'
PRED_CLASS = 'pred_class'
ORIGINAL_TEXT = 'original_text'
ALT_LABELS = 'alt_labels'

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas import DataFrame
import numpy as np

In [3]:
import os
from os import path

In [4]:
file_name = '{0}train-4.csv'.format(BASE_PATH)
train_df = pd.read_csv(file_name)

In [5]:
file_name = '{0}test-4.csv'.format(BASE_PATH)
test_df = pd.read_csv(file_name)

In [6]:
file_name = '{0}dev-4.csv'.format(BASE_PATH)
dev_df = pd.read_csv(file_name)

In [7]:
train_df[TEXT_COLUMN] = train_df[TEXT_COLUMN].apply(lambda x:  str(x))
dev_df[TEXT_COLUMN] = dev_df[TEXT_COLUMN].apply(lambda x:  str(x))
test_df[TEXT_COLUMN] = test_df[TEXT_COLUMN].apply(lambda x:  str(x))

In [8]:
dev_df.head()

Unnamed: 0,Text,pred_class
0,ентеровирус инфекци неуточн,425
1,плацен плацентар абнормн анома,4323
2,болест забол плацен уврежда плода новород,4474
3,артери коронари синист аномали н инординацио,4766
4,disci intervertebralis inflammatione,3726


In [35]:
train_array = train_df[TEXT_COLUMN].to_numpy()
test_array = test_df[TEXT_COLUMN].to_numpy()
dev_array = dev_df[TEXT_COLUMN].to_numpy()

In [36]:
intersection_test = np.intersect1d(test_array, train_array)

In [37]:
intersection_dev = np.intersect1d(dev_array, train_array)

In [38]:
len(intersection_test)/len(test_array)

0.7223299285280246

In [39]:
len(intersection_dev)/len(dev_array)

0.72451991932891

In [14]:
test_df.head()

Unnamed: 0,Text,pred_class
0,синдром бехчет,3702
1,infundibulo neurohypophysitis,1465
2,карцином 3 неинфилтрира 2 лобулар 2 млечн жлеза,1069
3,криптоспоридиозис,44
4,tumor ex maligno feminam generatiua ratio,882


In [15]:
df_train_dev = pd.concat([train_df, dev_df])
label_groups = df_train_dev.groupby(TEXT_COLUMN)
alt_labels_df = label_groups[PRED_CLASS].apply(lambda x: x.values).reset_index()
alt_labels_df.set_index(TEXT_COLUMN, inplace=True)

In [16]:
alt_labels_df.head()

Unnamed: 0_level_0,pred_class
Text,Unnamed: 1_level_1
0 genus cpam,"[4819, 4820]"
0 genus laryngotracheoesophageal vehementissimo imbri,"[4816, 4817]"
0 genus morbo glycogen repono,"[1593, 1594]"
0 reipublic s a carcinomate firmo nexu fulcit,"[1041, 1039]"
0 scaena pulmonis carcinoma,[1051]


In [17]:
label_groups_train = train_df.groupby(TEXT_COLUMN)
alt_labels_train_df = label_groups_train[PRED_CLASS].apply(lambda x: x.values).reset_index()
alt_labels_train_df.set_index(TEXT_COLUMN, inplace=True)

In [18]:
alt_labels_train_df.head()

Unnamed: 0_level_0,pred_class
Text,Unnamed: 1_level_1
0 genus cpam,"[4819, 4820]"
0 genus laryngotracheoesophageal vehementissimo imbri,"[4816, 4817]"
0 genus morbo glycogen repono,[1593]
0 reipublic s a carcinomate firmo nexu fulcit,"[1041, 1039]"
0 scaena pulmonis carcinoma,[1051]


In [19]:
def get_alt_labels(text, current_df, alt_df):
  if text in alt_df.index:
    labels = alt_df.at[text, PRED_CLASS]
    return labels
  return []

In [22]:
get_alt_labels('0 genus cpam', test_df, alt_labels_df)

array([4819, 4820], dtype=int64)

In [24]:
test_df[ALT_LABELS] = test_df[TEXT_COLUMN].apply(lambda x: get_alt_labels(x, test_df, alt_labels_df))

In [25]:
test_df[TEXT_COLUMN] = test_df[TEXT_COLUMN].str.lower()

In [26]:
test_df.head()

Unnamed: 0,Text,pred_class,alt_labels
0,синдром бехчет,3702,[3705]
1,infundibulo neurohypophysitis,1465,[1460]
2,карцином 3 неинфилтрира 2 лобулар 2 млечн жлеза,1069,[1068]
3,криптоспоридиозис,44,[47]
4,tumor ex maligno feminam generatiua ratio,882,[862]


In [27]:
test_df.to_csv('{0}test-4.csv'.format(BASE_PATH), header=True, index=False)

In [28]:
dev_df[ALT_LABELS] = dev_df[TEXT_COLUMN].apply(lambda x: get_alt_labels(x, dev_df, alt_labels_train_df))

In [29]:
dev_df[TEXT_COLUMN] = dev_df[TEXT_COLUMN].str.lower()

In [30]:
dev_df.to_csv('{0}dev-4.csv'.format(BASE_PATH), header=True, index=False)

In [31]:
dev_df.head()

Unnamed: 0,Text,pred_class,alt_labels
0,ентеровирус инфекци неуточн,425,[424]
1,плацен плацентар абнормн анома,4323,[4325]
2,болест забол плацен уврежда плода новород,4474,[4477]
3,артери коронари синист аномали н инординацио,4766,[4760]
4,disci intervertebralis inflammatione,3726,[3728]


In [32]:
train_df[TEXT_COLUMN] = train_df[TEXT_COLUMN].str.lower()

In [33]:
train_df.to_csv('{0}train-4.csv'.format(BASE_PATH), header=True, index=False)

In [44]:
train_df = train_df.reindex(train_df.columns.tolist() + [ALT_LABELS], axis=1)

In [45]:
train_df.head()

Unnamed: 0,Text,pred_class,alt_labels
0,гоноко гонококов болест инфекци анус,211,
1,alius originem arteriae pulmonalis tendit,4778,
2,карцином 3 жлъчен кана 3 черен дроб,757,
3,злокачеств тумор разстройств предни етаж уст,674,
4,леукемоид реакци,1324,


In [46]:
full_df = pd.concat([train_df, dev_df, test_df])

In [47]:
full_df.head()

Unnamed: 0,Text,pred_class,alt_labels
0,гоноко гонококов болест инфекци анус,211,
1,alius originem arteriae pulmonalis tendit,4778,
2,карцином 3 жлъчен кана 3 черен дроб,757,
3,злокачеств тумор разстройств предни етаж уст,674,
4,леукемоид реакци,1324,


In [48]:
full_df.to_csv('{0}dataset-full-4.csv'.format(BASE_PATH), header=True, index=False)