# Instalasi dan Setup

In [1]:
!pip install openai



In [2]:
# !pip install ray transformers sentence-transformers torch pytesseract Pillow
#!pip install --upgrade "jax[cpu]"

import ray
import time
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sentence_transformers import SentenceTransformer, util
from Bio import Entrez
import pytesseract
from PIL import Image
import os
import torch
import warnings
import re
import pandas as pd
from typing import List

# Set environment variables
email_address = "dwiyulianto31072k4@gmail.com"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Menonaktifkan FutureWarning terkait resume_download
warnings.simplefilter(action='ignore', category=FutureWarning)

# Inisialisasi Ray
ray.shutdown()
ray.init()

2024-10-28 01:32:08,903	INFO worker.py:1816 -- Started a local Ray instance.


0,1
Python version:,3.11.5
Ray version:,2.38.0


# Agent Pencarian Data PubMed

In [3]:
@ray.remote
class DataRetrievalAgent:
    def __init__(self):
        Entrez.email = email_address

    def search_pubmed(self, keywords, max_results=5):
        search_results = []
        for term in keywords:
            handle = Entrez.esearch(db="pubmed", term=term, retmax=max_results)
            record = Entrez.read(handle)
            pmids = record["IdList"]
            search_results.extend(pmids)
            handle.close()
        return search_results


# Agent Ekstraksi Abstrak dari PubMed

In [4]:
@ray.remote
class ExtractionAgent:
    def __init__(self):
        Entrez.email = email_address

    def fetch_abstract(self, pmid):
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
        abstract = handle.read()
        handle.close()
        return abstract

# Agent Peringkasan Teks Abstrak Menggunakan Model T5

In [5]:
@ray.remote
class SummarizationAgent:
    def __init__(self):
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base")
        self.tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=512)
    
    def summarize_text(self, text):
        input_text = "summarize: " + text
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = self.model.generate(inputs.input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)



# Agent OCR untuk Ekstraksi Teks dari Gambar

In [6]:
@ray.remote
class OCRAgent:
    def extract_text_from_images(self, folder_path):
        ocr_texts = []
        for filename in os.listdir(folder_path):
            if filename.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
                image_path = os.path.join(folder_path, filename)
                text = pytesseract.image_to_string(Image.open(image_path))
                ocr_texts.append(text)
        return " ".join(ocr_texts)


# Agent Pencocokan Relevansi Abstrak

In [7]:
@ray.remote
class RelevanceAgent:
    def __init__(self):
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
    
    def find_relevant_abstracts(self, abstracts, query, top_k=5):
        abstract_embeddings = [self.embedder.encode(abstract, convert_to_tensor=True) for abstract in abstracts]
        abstract_embeddings = torch.stack(abstract_embeddings)
        query_embedding = self.embedder.encode(query, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(query_embedding, abstract_embeddings)
        top_results = torch.topk(cosine_scores, k=top_k)
        return [abstracts[i] for i in top_results[1][0].tolist()]



# Fungsi Normalisasi Teks OCR

In [8]:
def normalisasi_abstrak(teks: str) -> str:
    teks = re.sub(r'\\bduosnueyy\\b', '', teks)  # contoh hapus kata 'duosnueyy' jika itu adalah noise OCR
    teks = re.sub(r'\\s{2,}', ' ', teks)  # menghilangkan spasi berlebih
    teks = re.sub(r'\\n+', ' ', teks)  # mengubah newline menjadi spasi
    teks = re.sub(r'\\[.*?\\]', '', teks)  # hapus teks dalam tanda kurung kotak
    return teks.strip()

# Fungsi Menyimpan Abstrak ke CSV

In [9]:
def simpan_abstrak_ke_csv(abstrak_list: List[str], filename: str):
    data = {'Abstrak': abstrak_list}
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Abstrak yang relevan telah disimpan ke {filename}")

# Pipeline untuk Menjalankan Semua Agen

In [10]:
def multi_agent_pipeline(search_terms, user_query, folder_path=None):
    # Inisialisasi agen
    retrieval_agent = DataRetrievalAgent.remote()
    extraction_agent = ExtractionAgent.remote()
    summarization_agent = SummarizationAgent.remote()
    relevance_agent = RelevanceAgent.remote()
    
    # Langkah 1: Cari PubMed berdasarkan kata kunci
    pmids = ray.get(retrieval_agent.search_pubmed.remote(search_terms))
    print("Daftar PMIDs yang ditemukan:", pmids)
    
    # Langkah 2: Ekstraksi abstrak dari PubMed
    abstracts = ray.get([extraction_agent.fetch_abstract.remote(pmid) for pmid in pmids])
    print("Contoh abstrak:", abstracts[0] if abstracts else "Tidak ada abstrak ditemukan.")
    
    # Langkah 3: Jalankan OCR untuk batch gambar jika ada folder gambar yang disediakan
    if folder_path:
        ocr_agent = OCRAgent.remote()
        ocr_text = ray.get(ocr_agent.extract_text_from_images.remote(folder_path))
        ocr_text_normalized = normalisasi_abstrak(ocr_text)
        print("Teks OCR yang telah dinormalisasi:", ocr_text_normalized)
    
        # Filter dan simpan hasil abstrak relevan
        kata_kunci = ['diabetes', 'peripheral neuropathy', 'neuropathy', 'diabetic']
        abstrak_relevan = [ocr_text_normalized] if any(kata in ocr_text_normalized.lower() for kata in kata_kunci) else []
        
        if abstrak_relevan:
            simpan_abstrak_ke_csv(abstrak_relevan, 'abstrak_relevan.csv')
        
        print("Abstrak yang relevan dan terfilter:")
        for idx, abstrak in enumerate(abstrak_relevan, 1):
            print(f"{idx}. {abstrak}\\n")

# Menjalankan Pipeline

In [11]:
search_terms = ["diabetes complications management", "innovations in diabetes treatment", "advancements in diabetic wound healing"]
user_query = "What are the latest innovations in diabetes complications management?"
folder_path = "/Users/dwiyulianto/Downloads/AI PROJECT/Journal_HHS Public Acces _Health"  # Ganti dengan path folder gambar

# Menjalankan pipeline
multi_agent_pipeline(search_terms, user_query, folder_path)



Daftar PMIDs yang ditemukan: ['39460909', '39460375', '39459574', '39459480', '39459455', '39460977', '39460909', '39460896', '39460887', '39460283', '39458946', '39458672', '39458583', '39454900', '39452591']
Contoh abstrak: 1. Diabetes Ther. 2024 Oct 26. doi: 10.1007/s13300-024-01658-8. Online ahead of 
print.

Current Perspectives in Pre- and Diabetic Peripheral Neuropathy Diagnosis and 
Management: An Expert Statement for the Gulf Region.

Beshyah SA(1)(2)(3), Jayyousi A(4), Al-Mamari AS(5), Shaaban A(6), Ozairi EA(7), 
Nafach J(8), Jallo MKI(9), Khader S(10), Evans M(11).

Author information:
(1)Department of Medicine, NMC Royal, MBZ, Abu Dhabi, United Arab Emirates. 
beshyah@yahoo.com.
(2)Department of Medicine, College of Medicine and Health Sciences, Khalifa 
University, Abu Dhabi, UAE. beshyah@yahoo.com.
(3)Department of Medicine, Dubai Medical College, Dubai, United Arab Emirates. 
beshyah@yahoo.com.
(4)Hamad Medical Corporation, Doha, Qatar.
(5)Department of Medicine, Sultan

# Agent untuk Question Answering

In [12]:
@ray.remote
class QuestionAnsweringAgent:
    def __init__(self):
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base")
        self.tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=512)

    def answer_question(self, question, context):
        input_text = f"question: {question} context: {context}"
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        answer_ids = self.model.generate(inputs.input_ids, max_length=150, num_beams=4, early_stopping=True)
        return self.tokenizer.decode(answer_ids[0], skip_special_tokens=True)

# Fungsi untuk Menanyakan Pertanyaan ke Abstrak yang Sudah Dirangkum

In [13]:
def ask_questions_on_abstracts(summaries, questions):
    # Inisialisasi agen Question Answering
    qa_agent = QuestionAnsweringAgent.remote()
    
    print("\nJawaban Pertanyaan yang Tepat:")
    for i, summary in enumerate(summaries):
        relevant_answers_found = False
        # Mengganti \n dalam ringkasan dengan line break asli
        formatted_summary = summary.replace("\\n", "\n")
        
        # Tampilkan ringkasan yang sudah dirapikan
        print(f"\nRingkasan Abstrak {i+1}:\n{formatted_summary}\n")
        
        for question in questions:
            answer = ray.get(qa_agent.answer_question.remote(question, formatted_summary))
            # Hanya tampilkan jawaban yang relevan
            if answer and answer.lower() not in ["not", "jawaban tidak ditemukan", "not_entailment"]:
                if not relevant_answers_found:
                    relevant_answers_found = True
                print(f"Pertanyaan: {question}")
                print(f"Jawaban: {answer}\n")

# Pipeline Utama dengan Pertanyaan

In [14]:
def multi_agent_pipeline_with_questions(search_terms, user_query, folder_path=None, questions=[]):
    # Menjalankan pipeline multi-agent sebelumnya untuk mencari, mengekstrak, dan merangkum abstrak
    retrieval_agent = DataRetrievalAgent.remote()
    extraction_agent = ExtractionAgent.remote()
    summarization_agent = SummarizationAgent.remote()
    
    # Langkah 1: Cari PubMed berdasarkan kata kunci
    pmids = ray.get(retrieval_agent.search_pubmed.remote(search_terms))
    print("Daftar PMIDs yang ditemukan:", pmids)
    
    # Langkah 2: Ekstraksi abstrak dari PubMed
    abstracts = ray.get([extraction_agent.fetch_abstract.remote(pmid) for pmid in pmids])
    print("Contoh abstrak:", abstracts[0] if abstracts else "Tidak ada abstrak ditemukan.")
    
    # Langkah 3: Meringkas abstrak
    summaries = ray.get([summarization_agent.summarize_text.remote(abstract) for abstract in abstracts])
    
    # Langkah 4: Menanyakan pertanyaan pada abstrak yang sudah diringkas
    if questions:
        ask_questions_on_abstracts(summaries, questions)

# Menjalankan Pipeline dengan Pertanyaan

In [None]:
search_terms = ["diabetes complications management", "innovations in diabetes treatment"]
user_query = "What are the latest innovations in diabetes complications management?"
folder_path = "/Users/dwiyulianto/Downloads/AI PROJECT/Journal_HHS Public Acces _Health"
questions = [
    "What is the impact of peripheral neuropathy on quality of life?",
    "What are the common symptoms of diabetic peripheral neuropathy?",
]

# Menjalankan pipeline dengan pertanyaan
multi_agent_pipeline_with_questions(search_terms, user_query, folder_path, questions)

# Shutdown Ray setelah selesai
ray.shutdown()



Daftar PMIDs yang ditemukan: ['39460909', '39460375', '39459574', '39459480', '39459455', '39460977', '39460909', '39460896', '39460887', '39460283']
Contoh abstrak: 1. Diabetes Ther. 2024 Oct 26. doi: 10.1007/s13300-024-01658-8. Online ahead of 
print.

Current Perspectives in Pre- and Diabetic Peripheral Neuropathy Diagnosis and 
Management: An Expert Statement for the Gulf Region.

Beshyah SA(1)(2)(3), Jayyousi A(4), Al-Mamari AS(5), Shaaban A(6), Ozairi EA(7), 
Nafach J(8), Jallo MKI(9), Khader S(10), Evans M(11).

Author information:
(1)Department of Medicine, NMC Royal, MBZ, Abu Dhabi, United Arab Emirates. 
beshyah@yahoo.com.
(2)Department of Medicine, College of Medicine and Health Sciences, Khalifa 
University, Abu Dhabi, UAE. beshyah@yahoo.com.
(3)Department of Medicine, Dubai Medical College, Dubai, United Arab Emirates. 
beshyah@yahoo.com.
(4)Hamad Medical Corporation, Doha, Qatar.
(5)Department of Medicine, Sultan Qaboos University Hospital, Muscat, Oman.
(6)Diabetes Contr