In [None]:
import torch
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# Create an empty RegexTokenizer instance; its trained model is loaded from
# disk in a later cell via tokenizer.load(...).
from src.tokenizer.regex_tokenizer import RegexTokenizer
tokenizer = RegexTokenizer()

In [None]:
# Load the trained tokenizer model from disk.
# Forward slashes are used on purpose: with backslashes in a normal string
# literal, each backslash followed by "t" is parsed as a TAB character, so the
# path pointed at a file that does not exist. "/" works on Windows and POSIX.
tokenizer.load(model_file='src/tokenizer/tokenizer_model.model')

In [None]:
def get_vocab_size(tokenizer):
    """Return the number of entries in ``tokenizer.vocab``."""
    return len(tokenizer.vocab)

In [None]:
# Display how many entries the loaded tokenizer's vocabulary contains.
get_vocab_size(tokenizer)

In [None]:
from datasets import load_dataset

# Load the "chatdoctor_healthcaremagic" configuration of the medical
# question-answering dataset, then display the returned object.
ds = load_dataset("Malikeh1375/medical-question-answering-datasets", "chatdoctor_healthcaremagic")
ds

In [None]:
# Work with the 'train' split only and show how many records it holds.
dataset = ds['train']
len(dataset)

In [None]:
import pandas as pd

# Materialise the split as a DataFrame and discard the 'instruction' column,
# leaving the remaining columns for the analysis below.
df = pd.DataFrame(dataset).drop(columns='instruction')
df

In [None]:
# Display the current contents of `df`.
df

In [None]:
from tqdm import tqdm 

# Collect every question followed by every answer into one flat list of
# strings; tqdm shows one progress bar per column.
data = []
for column in ("input", "output"):
    rows = dataset[column]
    data.extend(tqdm(rows))

# Character count of the corpus once the entries are joined with single spaces.
print(len(" ".join(data)))

In [None]:
# Lift the column-width limit so DataFrame displays from here on show full
# cell contents instead of truncated text.
pd.set_option('display.max_colwidth', None)
# Summary statistics for the columns of `df`.
df.describe()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Count duplicated versus unique rows; keep=False flags every member of a
# duplicate group, not only the repeats after the first occurrence.
df.duplicated(keep=False).value_counts()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Build a word cloud from all user questions.
txt = df['input']
# Join with a space: joining on the empty string glued the last word of each
# question to the first word of the next one, producing spurious fused tokens.
txt_ = ' '.join(txt)
word_cloud = WordCloud(width=800, height=400, background_color='white',
                       max_words=200).generate(txt_)
plt.imshow(word_cloud)
plt.axis('off')  # pixel-coordinate ticks carry no meaning for a word cloud
plt.show()

In [None]:
# Persist the collected question and answer strings as a single
# space-separated text file in the current working directory.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm that the cells reading this file back use the same encoding.
with open("medical_dataset.txt", "w") as f:
    f.write(" ".join(data))

In [None]:
# Read only the first 3,000,000 characters of the dumped corpus.
# The path uses "/" instead of a backslash: backslash followed by "m" is an
# invalid escape sequence in a normal string literal (newer Python versions
# warn about it), and "/" is accepted on both Windows and POSIX.
with open("datasets/medical_dataset.txt", "r") as f:
    num_char_to_read = 3_000_000
    text_sequence = f.read(num_char_to_read)

len(text_sequence)

In [None]:
import re
def clean_text(text):
    """Delete every backslash from ``text`` and trim surrounding whitespace."""
    without_backslashes = re.sub(r'[\\]+', "", text)
    return without_backslashes.strip()

In [None]:
# Render every question/answer pair as one special-token-delimited sample and
# concatenate them. A single join over the two columns replaces the previous
# row-by-row `+=` on an ever-growing string (repeated copying of a very large
# string) and the slow DataFrame.iterrows(); the resulting text is identical.
text = "".join(
    f"<|startoftext|><|User|>{clean_text(question)}\n"
    f"<|Assistant|>{clean_text(answer)}<|endoftext|>\n"
    for question, answer in zip(df["input"], df["output"])
)
len(text)

In [None]:
# Sanity check: encode a sample string that starts with the <|startoftext|>
# marker, passing allowed_special='all', and display the resulting token ids.
tokenizer.encode('<|startoftext|>Thanks for your question on Chat Doctor', allowed_special='all')

In [None]:
# Restore the default column-width limit for DataFrame display.
# reset_option is documented as taking only the option pattern; the stray
# second positional argument (None) has been dropped.
pd.reset_option('display.max_colwidth')
df

In [None]:
# Tokenize the raw corpus in chunks of `batch_size` characters so the whole
# file never has to be held as one string, accumulating all token ids.
encoded_text_sequence = []
batch_size = 3_000_000
with open("/kaggle/working/medical_dataset.txt", "r") as f:
    # read() returns "" at end of file, which is the sentinel that ends iteration.
    for chunk in iter(lambda: f.read(batch_size), ""):
        encoded_text_sequence.extend(tokenizer.encode(chunk))
        print(f"Processed {len(encoded_text_sequence)} tokens so far")

print(f"Total Tokens: {len(encoded_text_sequence)}")

#### Removing personal identifiers and sensitive info

In [None]:
# Load the spaCy pipeline "en_core_web_sm"; `nlp` is what
# remove_person_identifiers calls to find PERSON entities.
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def remove_person_identifiers(text):
    """Remove every PERSON entity that the spaCy pipeline finds in ``text``.

    Each detected name is deleted wherever it occurs as a standalone token
    sequence. Compared with a plain ``str.replace``:

    * a name is no longer cut out of the middle of a longer word that happens
      to contain it;
    * longer names are removed before shorter ones, so a short name that is a
      part of a longer detected name cannot leave a fragment behind.

    Args:
        text: the string to scrub.

    Returns:
        ``text`` with the detected person names removed. Whitespace around a
        removed name is left untouched.
    """
    import re  # local import keeps this function self-contained

    doc = nlp(text)
    names = {ent.text for ent in doc.ents if ent.label_ == "PERSON"}

    scrubbed = text
    # Longest first; ties are broken alphabetically so the result is deterministic.
    for name in sorted(names, key=lambda n: (-len(n), n)):
        # Lookarounds instead of \b so names that start or end with
        # punctuation are still matched as a whole.
        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
        scrubbed = re.sub(pattern, "", scrubbed)

    return scrubbed

In [None]:
def remove_sensitive_info(text):
    """Delete e-mail-like tokens, 10-digit numbers and slash dates from ``text``.

    The patterns are applied one after another, each match being replaced by
    the empty string; surrounding whitespace is left as is.
    """
    patterns = (
        r'\S+@\S+',                      # anything shaped like an e-mail address
        r'\b\d{10}\b',                   # standalone 10-digit numbers
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',  # d/m/y-style dates written with slashes
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

#### Creating conversation pair

In [None]:
from typing import List, Dict
import re
def conversation(raw_conv: pd.DataFrame) -> List[Dict[str, str]]:
    """Build scrubbed user/assistant pairs from a question-answer DataFrame.

    Args:
        raw_conv: frame with an "input" column (user text) and an "output"
            column (assistant text).

    Returns:
        One ``{"user": ..., "assistant": ...}`` dict per row for which both
        sides are non-empty after scrubbing.
    """

    def _scrub(value) -> str:
        """Apply all scrubbing steps to one cell and return the trimmed result."""
        # Pattern-based removal runs first: the date pattern needs the "/"
        # separators, so stripping "/" beforehand made it impossible to match.
        cleaned = remove_sensitive_info(str(value))
        cleaned = cleaned.replace('/', '')
        cleaned = remove_person_identifiers(cleaned)
        # Trim last so text that becomes whitespace-only after the removals
        # is seen as empty by the caller.
        return cleaned.strip()

    conversations = []
    for raw_user, raw_assist in zip(raw_conv["input"], raw_conv["output"]):
        user = _scrub(raw_user)
        assist = _scrub(raw_assist)

        if user and assist:
            conversations.append({
                "user": user,
                "assistant": assist
            })

    return conversations

In [None]:
# Build the scrubbed user/assistant pairs from `df` and report how many rows
# survived (rows with an empty side are dropped by conversation()).
conversations = conversation(df)
print(len(conversations))

In [None]:
# Spot-check one scrubbed pair.
print(conversations[90])

In [None]:
def conv_format(conversation: list[dict]):
    """Render user/assistant pairs as special-token-delimited training strings.

    For each pair the "user" and "assistant" values are whitespace-trimmed and
    wrapped in the <|startoftext|>, <|User|>, <|Assistant|> and <|endoftext|>
    markers. A pair that raises while being processed is reported with
    ``print`` and left out of the result.

    Returns:
        The list of formatted strings, in input order.
    """

    def render(entry):
        """Format a single pair; raises if a side is missing or not a string."""
        question = entry.get("user").strip()
        answer = entry.get("assistant").strip()
        return (f"<|startoftext|><|User|>{question}\n"
                f"<|Assistant|>{answer}<|endoftext|>")

    results = []
    for entry in conversation:
        try:
            results.append(render(entry))
        except Exception as e:
            print(f'Skipping due to error: {e}')

    return results

In [None]:
# Turn every scrubbed pair into its special-token-delimited training string
# and report how many strings were produced.
conversation_format = conv_format(conversations)
print(len(conversation_format))

In [None]:
# Spot-check the last formatted sample.
print(conversation_format[-1])

## Encoding the dataset

In [None]:
import numpy as np

# Save the accumulated token ids as an int64 NumPy array.
# NOTE(review): `encoded_text_sequence` is assigned by more than one cell in
# this notebook, so the array written here is whatever the most recently
# executed encoding cell produced — confirm it is the intended one.
np.save("encoded_dataset.npy",
       np.array(encoded_text_sequence, dtype=np.int64))


#### Saving dataset in formatted manner for pretraining

In [None]:
import json
# Write one JSON object per line ({"text": <formatted sample>}); non-ASCII
# characters are kept as-is rather than escaped.
with open("formmated_dataset.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps({"text": item}, ensure_ascii=False) + '\n'
        for item in conversation_format
    )

In [None]:
# Tokenize the formatted JSONL file in chunks of `batch_size` characters,
# accumulating all token ids in `encoded_text_sequence`.
encoded_text_sequence = []
batch_size = 3_000_000
# Path: "/" replaces the backslash because backslash followed by "f" in a
# normal string literal is parsed as a form-feed character, so the original
# path could never match the file. "/" works on Windows and POSIX.
# Encoding: the file is written with encoding="utf-8" and unescaped non-ASCII
# characters, so it must be read back as UTF-8 rather than the platform default.
with open("datasets/formmated_dataset.jsonl", "r", encoding="utf-8") as f:

    while True:
        chunk = f.read(batch_size)
        if not chunk:
            break

        # NOTE(review): the chunk is raw JSONL, so the {"text": ...} wrapper and
        # JSON escapes are tokenized too, and encode() is called without
        # allowed_special although the samples contain <|startoftext|>-style
        # markers — confirm both are intended.
        batch_tokens = tokenizer.encode(chunk)
        encoded_text_sequence.extend(batch_tokens)
        print(f"Processed {len(encoded_text_sequence)} tokens so far")

print(f"Total Tokens: {len(encoded_text_sequence)}")