In [None]:
import pandas as pd

# Read the ICD-10 code table. header=None tells pandas the file has no header
# row, so the columns are labelled with the integers 0, 1, 2, ...
icd10 = pd.read_csv('ICD10codes.csv', header=None)
# Bare last expression: render the raw frame as the cell output.
icd10

In [None]:
# data preprocessing
def add_dot(s):
    """Insert the ICD-10 decimal point after the three-character category.

    ``s[:3]`` is the first three characters of the code and ``s[3:]`` is
    everything from the fourth character onwards.

    Parameters
    ----------
    s : str
        Undotted code, e.g. ``'A000'``.

    Returns
    -------
    str
        ``s[:3] + '.' + s[3:]`` when something follows the category
        (``'A000'`` -> ``'A00.0'``). A code of three characters or fewer is
        returned unchanged, so a bare category such as ``'A09'`` does not
        turn into ``'A09.'`` and an empty string does not turn into ``'.'``.
    """
    # Nothing follows the category, so there is nothing to separate with a dot.
    if len(s) <= 3:
        return s
    return s[:3] + '.' + s[3:]

# Rows whose column-0 value is longer than three characters get the column-5
# text appended to column 4 (separated by ', ') and a dot inserted into the
# column-2 code. Boolean masks replace the per-row Python loop, so the step no
# longer depends on the index being 0..n-1 and runs as a handful of vectorised
# operations. Rows with a missing column-0 value compare as False and are skipped.
long_prefix = icd10[0].str.len() > 3
icd10.loc[long_prefix, 4] = icd10.loc[long_prefix, 4] + ', ' + icd10.loc[long_prefix, 5]
icd10.loc[long_prefix, 2] = icd10.loc[long_prefix, 2].map(add_dot)

# Keep only the code (column 2) and the text (column 4), in that order, and
# give them the names used by the tokenization step.
icd10 = icd10.drop([0,1,3,5], axis=1)
icd10 = icd10.set_axis(['target_text', 'input_text'], axis=1)

# Dot every code that does not contain one yet. regex=False makes '.' a literal
# character; na=True treats a missing code as "nothing to do" so it is left as is.
undotted = ~icd10['target_text'].str.contains('.', regex=False, na=True).astype(bool)
icd10.loc[undotted, 'target_text'] = icd10.loc[undotted, 'target_text'].map(add_dot)

In [None]:
# Display the preprocessed frame (columns 'target_text' and 'input_text').
icd10

In [None]:
from datasets import Dataset
# Build the Hugging Face dataset from the DataFrame with the constructor meant
# for pandas input (from_dict is documented for a mapping of column name to
# list/array). preserve_index=False keeps the pandas index out of the dataset,
# so the only columns are 'target_text' and 'input_text'.
ds = Dataset.from_pandas(icd10, preserve_index=False)

In [None]:
# Debug aid: print, for every column label, the Python type of the object that
# iterating the frame with .items() yields for it.
print(dict(
    (column_label, type(column_values))
    for column_label, column_values in icd10.items()
))

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# loading the t5-small model
# Both the tokenizer and the seq2seq model are loaded from the same checkpoint
# name, so the vocabulary matches the model's embeddings.
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
from transformers import DataCollatorWithPadding

def tokenize_function(df):
    """Tokenize one batch of examples for seq2seq fine-tuning.

    Parameters
    ----------
    df : mapping
        Batch of examples providing ``"input_text"`` and ``"target_text"``
        (lists of strings when called through ``Dataset.map(batched=True)``).

    Returns
    -------
    The tokenizer output for ``input_text`` (padded/truncated to 128 tokens)
    with an added ``"labels"`` entry holding the token ids of ``target_text``.
    Padding positions in the labels are set to -100 so that the loss ignores
    them instead of training the model to emit pad tokens.
    """
    model_inputs = tokenizer(df["input_text"], padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(df["target_text"], padding="max_length", truncation=True, max_length=128)

    # -100 is the index the cross-entropy loss skips; without this replacement
    # most of the 128 label positions would be pad tokens counted in the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run tokenize_function over the dataset; batched=True passes it batches of
# examples (lists of strings) rather than one example at a time.
tokenized_datasets = ds.map(tokenize_function, batched=True)
# Return torch tensors when the dataset is indexed.
tokenized_datasets.set_format("torch")

# Collator that batches examples using the tokenizer's padding.
# NOTE(review): DataCollatorWithPadding is the generic collator; for seq2seq
# training DataCollatorForSeq2Seq is the usual choice — confirm before training.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# from transformers import TrainingArguments, Trainer

# # Split dataset into train and validation
# train_test_split = tokenized_datasets.train_test_split(test_size=0.1)
# train_dataset = train_test_split["train"]
# val_dataset = train_test_split["test"]


# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     save_total_limit=2,
# )

# # Set up Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     data_collator=data_collator
# )

In [None]:
# # Fine-tune the model
# trainer.train()

In [None]:
# from transformers import T5ForConditionalGeneration, T5Tokenizer

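# # For example, after fine-tuning
# # NOTE(review): the two lines below reload the pretrained 't5-small' checkpoint,
# # so what gets saved to 'working' would not be the fine-tuned weights — confirm intent.
# model = T5ForConditionalGeneration.from_pretrained('t5-small')
# tokenizer = T5Tokenizer.from_pretrained('t5-small')

# # Save the model and tokenizer
# model.save_pretrained('working')  # Saves the model
# tokenizer.save_pretrained('working')  # Saves the tokenizer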

In [None]:
# from transformers import pipeline
# from transformers import T5Tokenizer, T5ForConditionalGeneration


In [None]:
# custom_in = pd.read_csv('Diagnoses_list.csv', sep=';')
# print(custom_in.head())

In [None]:
# # Read T_Sample data
# def load_sample_data(t_sample):
#     df = pd.read_csv(t_sample)
#     return df['T_SampleName'].tolist()  # Assuming this is the column with diagnoses

In [None]:
# Load the custom input (the CSV supplied with the assignment) and preprocess
# it by dropping the first and last character of every 'Diagnoses_list' entry.
# NOTE(review): presumably those characters are wrapping brackets or quotes —
# confirm against the CSV.
custom_in = pd.read_csv('Diagnoses_list_Sheet1.csv')
# Vectorised string slice instead of a per-row loop; missing entries stay missing.
custom_in['Diagnoses_list'] = custom_in['Diagnoses_list'].str[1:-1]

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the tokenizer and the model from the saved checkpoint directory.
# This rebinds the `tokenizer` and `model` names created from 't5-small' earlier.
model_path = "./working"  # Adjust to the model you are using
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Make sure the model is on the right device: GPU when available, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

device # should be cuda

In [None]:
# Tokenize the whole diagnosis list as one batch (padded/truncated to 128
# tokens) and move every tensor to the device the model lives on.
model_inputs = tokenizer(list(custom_in["Diagnoses_list"]), padding="max_length", truncation=True, max_length=128, return_tensors='pt')
inputs = {key: val.to(device) for key, val in model_inputs.items()}

# Run the prediction with beam search. generate() builds the decoder start
# tokens itself, so no decoder_input_ids tensor is prepared here.
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

# Decode the generated ids back to text, one string per input diagnosis.
predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)


In [None]:
# Display the decoded predictions (a list with one string per input diagnosis).
predictions

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Reload the saved checkpoint for single-diagnosis prediction. The model goes
# to the GPU when one is available and stays on the CPU otherwise, instead of
# failing on machines without CUDA.
model_path = "./working"  # Update with actual path
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

def predict_icd(diagnosis):
    """Generate an ICD-10 code string for a single diagnosis text.

    Parameters
    ----------
    diagnosis : str
        Free-text diagnosis; it is sent to the model as ``"ICD10: <diagnosis>"``.

    Returns
    -------
    str
        The decoded top beam with special tokens removed.

    Notes
    -----
    Uses the module-level ``tokenizer`` and ``model``. The encoded input is
    moved to ``model.device`` so the function works whether the model sits on
    a GPU or on the CPU.
    """
    # NOTE(review): the "ICD10: " prefix is not added by tokenize_function or by
    # the batch-prediction cell — confirm it matches how the checkpoint was trained.
    input_text = f"ICD10: {diagnosis}"

    # Follow the model's device rather than assuming CUDA is present.
    target_device = model.device
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding="max_length"
    ).to(target_device)

    # Beam search with a short output budget.
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=15,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage: run a single text through predict_icd and print the input
# next to the generated code.
sample_diagnosis = "What is the shape of the earth?"
predicted_icd = predict_icd(sample_diagnosis)
print(f"Diagnosis: {sample_diagnosis}\nPredicted ICD-10: {predicted_icd}")