In [1]:
%run utils.ipynb

import pandas as pd
import numpy as np
import re
from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.graph_objs as go
from fuzzywuzzy import fuzz

import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

[nltk_data] Downloading package stopwords to /home/dev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
text = 'When encountering the word ‘went,’ the model, having learned patterns, predicts the base form as ‘go.’ Similarly, for ‘happier,’ the model deduces ‘happy’ as the lemma. The advantage lies in the model’s ability to adapt to varied linguistic nuances and handle irregularities, making it robust for lemmatizing diverse vocabularies.'
st = stemming_lemma_reprocess(text)
st

'when encount the word ‘ went , ’ the model , have learn pattern , predict the base form as ‘ go. ’ similarli , for ‘ happier , ’ the model deduc ‘ happi ’ as the lemma . the advantag lie in the model ’ s abil to adapt to vari linguist nuanc and handl irregular , make it robust for lemmat divers vocabulari .'

In [3]:
st = stemming_lemma_reprocess(text, type_select='lemma')
st

'When encountering the word ‘ went , ’ the model , having learned pattern , predicts the base form a ‘ go. ’ Similarly , for ‘ happier , ’ the model deduces ‘ happy ’ a the lemma . The advantage lie in the model ’ s ability to adapt to varied linguistic nuance and handle irregularity , making it robust for lemmatizing diverse vocabulary .'

In [4]:
import pandas as pd

df = pd.read_csv('../Medical-Abstracts-TC-Corpus/medical_tc_train.csv')
df.head()

Unnamed: 0,condition_label,medical_abstract
0,5,Tissue changes around loose prostheses. A cani...
1,1,Neuropeptide Y and neuron-specific enolase lev...
2,2,"Sexually transmitted diseases of the colon, re..."
3,1,Lipolytic factors associated with murine and h...
4,3,Does carotid restenosis predict an increased r...


In [5]:
df_text = df['medical_abstract'].tolist()

In [6]:
df_text = ' '.join(df_text)

In [7]:
df_text = list(set(df_text.split(' ')))
df_text[:5]

['', 'PH);', 'near-diploid', 'suprabasilar', 'Interlocking']

In [8]:
len(df_text)

96222

In [14]:
dd = [i for i in df_text if len(i)<4]

In [15]:
len(dd)

5469

In [48]:
'CI' in dd

True

In [52]:
with open('Abbreviation_List_for_Medical_Record_Documentation.txt', 'r', encoding='utf-8') as rt:
    ct = rt.read()
    ct = ct.split('\n')[1:]
rt.close()
len(ct)

930

In [58]:
words_to_keep = [f.lower() for f in ct]
    
stop_words = set(stopwords.words('english'))

f = [j for j in words_to_keep if j in stop_words]

In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss  # For multi-label classification


In [32]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

# Chuẩn bị dữ liệu
texts = ["Lòai chó rất trung thành với con người.", "Mèo có thể ăn động vật và thực vật.", "Cây xanh mang lại nguồn ô xi quí giá.", "Máy móc là một phần của cuộc sống."]
labels = torch.tensor([[1, 0, 0, 0], [1, 0, 1, 0], [0, 0, 1, 0], [0, 0, 0, 1]])

# Token hóa văn bản
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Tạo dataset
class CustomDataset(Dataset):
    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {'input_ids': self.tokenized_texts['input_ids'][idx], 'attention_mask': self.tokenized_texts['attention_mask'][idx], 'labels': self.labels[idx]}

dataset = CustomDataset(tokenized_texts, labels)

# Tạo DataLoader
batch_size = 2
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Tạo mô hình BERT
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Hàm mất mát BCEWithLogitsLoss
# criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()

# Tối ưu hóa
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Quá trình huấn luyện
num_epochs = 10
for epoch in range(num_epochs):
    for batch in dataloader:
        inputs = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Tính logits từ mô hình
        outputs = model(inputs, attention_mask=attention_mask)
        logits = outputs.logits

        # Tính toán loss
        loss = criterion(logits, labels.float())
        print(loss)
        # Backpropagation và tối ưu hóa
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Đánh giá mô hình (tương tự trong quá trình huấn luyện)
# ...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

tensor(2.0286, grad_fn=<DivBackward1>)
tensor(1.1859, grad_fn=<DivBackward1>)
tensor(1.1076, grad_fn=<DivBackward1>)
tensor(1.8620, grad_fn=<DivBackward1>)
tensor(1.4847, grad_fn=<DivBackward1>)
tensor(1.2742, grad_fn=<DivBackward1>)
tensor(0.8883, grad_fn=<DivBackward1>)
tensor(1.7102, grad_fn=<DivBackward1>)
tensor(1.3609, grad_fn=<DivBackward1>)
tensor(1.0872, grad_fn=<DivBackward1>)
tensor(1.0562, grad_fn=<DivBackward1>)
tensor(1.2400, grad_fn=<DivBackward1>)
tensor(0.9202, grad_fn=<DivBackward1>)
tensor(1.2893, grad_fn=<DivBackward1>)
tensor(1.1464, grad_fn=<DivBackward1>)
tensor(0.9277, grad_fn=<DivBackward1>)
tensor(1.1269, grad_fn=<DivBackward1>)
tensor(0.8731, grad_fn=<DivBackward1>)
tensor(1.0754, grad_fn=<DivBackward1>)
tensor(0.8068, grad_fn=<DivBackward1>)


In [28]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss  # For multi-label classification
class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
        label = torch.tensor(self.labels[idx])  # Create a 1D tensor for each label
        return {'input_ids': encoding['input_ids'].squeeze(), 'attention_mask': encoding['attention_mask'].squeeze(), 'labels': label.unsqueeze(0)}  # Add a dimension for batch compatibility

# Example data
texts = ["Lòai chó rất trung thành với con người.", "Mèo có thể ăn động vật và thực vật.", "Cây xanh mang lại nguồn ô xi quí giá.", "Máy móc là một phần của cuộc sống."]
labels = torch.tensor([[1, 0, 0, 0], [1, 0, 1, 0], [0, 0, 1, 0], [0, 0, 0, 1]])

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset
max_length = 20  # Adjust as needed
dataset = MultiLabelDataset(texts, labels, tokenizer, max_length)
# Initialize BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)  # 4 labels in your case

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
# Set up DataLoader
batch_size = 2  # Adjust as needed
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
criterion = nn.BCEWithLogitsLoss()
# Training loop
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    for batch in dataloader:
        inputs = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(inputs, attention_mask=attention_mask)
        logits = outputs.logits

        # Tính toán loss
        loss = criterion(logits, labels.float())

        # Backpropagation và tối ưu hóa
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

ValueError: Target size (torch.Size([2, 1, 4])) must be the same as input size (torch.Size([2, 4]))

In [33]:
import pandas as pd
import numpy as np

In [38]:
df = pd.read_csv('../Medical-Abstracts-TC-Corpus/medical_tc_train.csv')
df.head()

Unnamed: 0,condition_label,medical_abstract
0,5,Tissue changes around loose prostheses. A cani...
1,1,Neuropeptide Y and neuron-specific enolase lev...
2,2,"Sexually transmitted diseases of the colon, re..."
3,1,Lipolytic factors associated with murine and h...
4,3,Does carotid restenosis predict an increased r...


In [39]:
df.shape

(11550, 2)

In [40]:
df.drop_duplicates('medical_abstract', inplace=True)

In [41]:
df.shape

(9445, 2)

In [42]:
11550-9445

2105