In [1]:
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/mtsamples.csv')
data = data.dropna()  # remove missing values
data = data.drop_duplicates()  # remove duplicate values

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocess the text
def preprocess_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespaces
    return text

data['transcription'] = data['transcription'].apply(preprocess_text)

# Extracting sex and age information
def extract_sex_age(text):
    # Look for age in the text using regular expressions
    age = re.search(r'\b\d{1,3}\b', text)
    if age:
        age = age.group()
    else:
        age = 'unknown'
    
    # Look for sex in the text using regular expressions
    sex = re.search(r'\b(male|female|woman|man|girl|boy)\b', text, re.IGNORECASE)
    if sex:
        sex = sex.group().lower()
        if sex == 'woman' or sex == 'girl':
            sex = 'female'
        elif sex == 'man' or sex == 'boy':
            sex = 'male'
    else:
        sex = 'unknown'
    return sex, age

data['sex'], data['age'] = zip(*data['transcription'].apply(extract_sex_age))

# Extracting treatment information
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1").to(device)



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0


Downloading (…)lve/main/config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [2]:
def extract_treatment(text):
    max_length = 512
    tokens = []
    for i in range(0, len(text), max_length):
        chunk = text[i:i+max_length]
        input_ids = tokenizer.encode(chunk, add_special_tokens=True, return_tensors='pt').to(device)
        output = model(input_ids)
        label_indices = torch.argmax(output[0], axis=2)
        chunk_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        treatment_tokens = []
        for token, label_idx in zip(chunk_tokens, label_indices[0]):
            if label_idx == 1:
                treatment_tokens.append(token)
        tokens.extend(treatment_tokens)
    if len(tokens) > 0:
        return ' '.join(tokens).replace(' ##', '')
    else:
        return ''


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Define NLTK objects for text preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def extract_treatment(text):
    max_length = 512
    tokens = []
    for i in range(0, len(text), max_length):
        chunk = text[i:i+max_length]
        input_ids = tokenizer.encode(chunk, add_special_tokens=True, return_tensors='pt').to(device)
        output = model(input_ids)
        label_indices = torch.argmax(output[0], axis=2)
        chunk_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        treatment_tokens = []
        for token, label_idx in zip(chunk_tokens, label_indices[0]):
            if label_idx == 1:
                treatment_tokens.append(token)
        tokens.extend(treatment_tokens)
    if len(tokens) > 0:
        return ' '.join(tokens).replace(' ##', '')
    else:
        return ''


    # Use the GPT-2 model to extract treatment information
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt').to(device)
    output = model(input_ids)
    label_indices = torch.argmax(output[0], axis=2)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    treatment_tokens = []
    for token, label_idx in zip(tokens, label_indices[0]):
        if label_idx == 1:
            treatment_tokens.append(token)
    if len(treatment_tokens) > 0:
        return ' '.join(treatment_tokens).replace(' ##', '')
    else:
        return ''


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns



In [6]:
if 'treatment' not in data.columns:
    # code to create the 'treatment' column
    data['treatment'] = data['transcription'].apply(extract_treatment)
else:
    # code to use the 'treatment' column
    pass


In [7]:
treatment_distribution = data['treatment'].value_counts(normalize=True)

plt.figure(figsize=(6, 3)) # reduce the figure size
sns.barplot(x=treatment_distribution.index, y=treatment_distribution.values)
plt.title('Distribution of Treatment', fontsize=12) # adjust the font size of the title
plt.xlabel('Treatment', fontsize=10) # adjust the font size of the x-axis label
plt.ylabel('Percentage', fontsize=10) # adjust the font size of the y-axis label
plt.xticks(rotation=45, fontsize=8) # adjust the rotation and font size of the x-axis tick labels

plt.savefig('treatment_distribution.png') # save the plot to file

plt.show()


In [None]:
treatment_distribution = data['treatment'].value_counts(normalize=True)

# create a list of colors for the pie chart
colors = ['pink', 'lightblue', 'lightgreen', 'purple', 'orange']

# create the pie chart
plt.pie(treatment_distribution.values, labels=treatment_distribution.index, colors=colors,
        autopct='%1.1f%%', startangle=90)

# add a circle in the middle to create a donut chart
circle = plt.Circle((0,0), 0.7, color='white')
fig = plt.gcf()
fig.gca().add_artist(circle)

# add title and legend
plt.title('Distribution of Treatment')
plt.legend(treatment_distribution.index, loc='upper right')

# show the plot
plt.axis('equal')
plt.show()


In [9]:
data['sex']


0        female
1       unknown
2       unknown
3       unknown
4       unknown
         ...   
4984     female
4985     female
4989     female
4993     female
4995       male
Name: sex, Length: 3898, dtype: object

In [10]:
data['age']


0       unknown
1       unknown
2       unknown
3       unknown
4       unknown
         ...   
4984    unknown
4985    unknown
4989    unknown
4993    unknown
4995    unknown
Name: age, Length: 3898, dtype: object

In [11]:
data['treatment']


0       [CLS]gies used to havegies when seattle but th...
1       history airline seats shoes used to seating ob...
2       today who he since of his highest he pounds hi...
3       mm enlargement diameter cm size right ventle l...
4       [CLS] vent size wall thickness the wall motion...
                              ...                        
4984    [CLS] admission diagnosis obesity b diagnosis ...
4985    m considerationscopic rouxeny gas the approxim...
4989    history test to five ago short walking about s...
4993    ##oldre possibility evaluationgies xmiaopf dis...
4995    diagnosis kadischa diagnosis ka resolvinghos c...
Name: treatment, Length: 3898, dtype: object

comment: The extract_treatment function uses the BioBERT model to extract the tokens that correspond to treatments mentioned in the text. The function tokenizes the input text, applies the BioBERT model to perform token classification and identify the tokens that correspond to treatments, and then returns the extracted tokens as a string.

Regarding the extraction of sex and age information, the BioBERT model is not directly involved. Instead, the extract_sex_age function uses regular expressions to search for patterns in the text that correspond to sex and age information, and then extracts the relevant information.