In [1]:
!pip install transformers



In [2]:
!pip install PyPDF2



# SCRAPING FOR SUMMARIZATION

In [3]:
import requests
import PyPDF2  # Or pdfplumber, if you prefer
from bs4 import BeautifulSoup
from transformers import T5Tokenizer, T5ForConditionalGeneration
import os
from urllib.parse import urljoin  # To join base URL and relative URLs

# Base URL of the website
base_url = "https://www.bi.go.id/"

# URL of the page
url = "https://www.bi.go.id/id/publikasi/peraturan/Pages/PADG_162024.aspx"  # Replace with the actual URL

# Make a request to the website
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Variables to store content
from_Web = ""
from_pdf1 = ""
from_pdf2 = ""

# Extract text content from the webpage
web_content_div = soup.find('div', id='ctl00_PlaceHolderMain_ctl04__ControlWrapper_RichHtmlField')
if web_content_div:
    from_Web = web_content_div.get_text(strip=True)

# Extract the PDF link from div with ID 'layout-lampiran'
lampiran_div = soup.find('div', id='layout-lampiran')
if lampiran_div:
    pdf_links = lampiran_div.find_all('a', href=True)
    print(pdf_links)

    # Loop through each found link and extract the PDF URLs
    pdf_count = 1  # Counter for naming PDFs
    for pdf_link in pdf_links:
        pdf_url = pdf_link['href']

        # Ensure the URL is absolute
        pdf_url = urljoin(base_url, pdf_url)  # This ensures the link is absolute

        if pdf_url.endswith(".pdf"):
            print("PDF URL found:", pdf_url)

            # Extract file name from URL
            pdf_filename = os.path.basename(pdf_url)

            # Check if the PDF file already exists locally
            if os.path.exists(pdf_filename):
                print(f"PDF already exists: {pdf_filename}")
            else:
                # Download the PDF if it does not exist locally
                pdf_response = requests.get(pdf_url)
                if pdf_response.status_code == 200:
                    with open(pdf_filename, 'wb') as pdf_file:
                        pdf_file.write(pdf_response.content)
                    print(f"Downloaded PDF: {pdf_filename}")
                else:
                    print(f"Failed to download PDF: {pdf_filename}")

            # Extract text from the downloaded PDF
            with open(pdf_filename, 'rb') as file:
                # Create a PDF reader object
                pdf_reader = PyPDF2.PdfReader(file)  # Or use pdfplumber here
                text = ""
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    text += page.extract_text()

                # Store text in the appropriate variable
                if pdf_count == 1:
                    from_pdf1 = text
                elif pdf_count == 2:
                    from_pdf2 = text
                pdf_count += 1



[<a href="/id/publikasi/peraturan/Documents/PADG_162024.pdf"><img alt="" class="ms-asset-icon ms-rtePosition-4" src="/_layouts/15/images/icpdf.png"/>Peraturan Anggota Dewan Gubernur Nomor 16 Tahun 2024.pdf</a>, <a href="/id/publikasi/peraturan/Documents/FAQ_PADG_162024.pdf"><img alt="" class="ms-asset-icon ms-rtePosition-4" src="/_layouts/15/images/icpdf.png"/>Tanya Jawab Peraturan Anggota Dewan Gubernur Nomor 16 Tahun 2024.pdf</a>, <a href="/id/publikasi/peraturan/Documents/Lampiran_PADG_162024.zip"><img alt="" class="ms-asset-icon ms-rtePosition-4" src="/_layouts/15/images/iczip.gif"/>Lampiran Peraturan Anggota Dewan Gubernur Nomor 16​ Tahun 2024.zip</a>]
PDF URL found: https://www.bi.go.id/id/publikasi/peraturan/Documents/PADG_162024.pdf
PDF already exists: PADG_162024.pdf
PDF URL found: https://www.bi.go.id/id/publikasi/peraturan/Documents/FAQ_PADG_162024.pdf
PDF already exists: FAQ_PADG_162024.pdf


# SUMMARIZATION LATAR PERATURRAN ANGGOTA DEWAN GUBERNUR NOMOR 16 TAHUN 2024 TENTANG TRANSAKSI PASAR VALUTA ASING BERDASARKAN PRINSIP SYARIAH

In [4]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

def summarize_text(text, max_input_length=1024):
    # Split text into chunks if it's too long
    chunks = [text[i:i+max_input_length] for i in range(0, len(text), max_input_length)]
    summaries = []

    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding="longest")
        summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    return " ".join(summaries)

long_text = from_Web
summarized_text = summarize_text(long_text)
print(summarized_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Bank Indonesia melakukan penyempurnaan pengaturan di pasar valuta asing melalui penerbitan Peraturan Anggota Dewan Gubernur (PADG) No. 16 Tahun 2024 tentang Transaksi Pasar Valuta Asing Berdasarkan Prinsip Syariah. PADG tentang Transaksi Pasar Valuta Asing Berdasarkan Prinsip Syariah mengatur mengenai hal-hal sebagai berikut. Istilah yang digunakan dalam kegiatan transaksi. Transaksi Pasar Valuta Asing Berdasarkan Prinsip Syariah yang mencakup:transaksi yang bersifat tunai; transaksi lindung nilai berd asarkan prinsip syariah; dantransaksi valuta asing lainnya yang ditetapkan oleh Bank Indonesia. Kewajiban bank untuk memastikan penyampaian dokumen underlying transaksi oleh pelaku Transaksi Valuta Asing Berdasarkan Prinsip Syariah. Aturan terkait jumlah tertentu (threshold) dan underlyingtransaksi mencakup antara lain. Pembiayaan dari bank kepada penduduk untuk tujuan perdagangan dan investasi. Underlying transaksi lain yang ditetapkan oleh Bank IndonesiaPembulatan nilai nominal underly

In [6]:
summarized_text

'Bank Indonesia melakukan penyempurnaan pengaturan di pasar valuta asing melalui penerbitan Peraturan Anggota Dewan Gubernur (PADG) No. 16 Tahun 2024 tentang Transaksi Pasar Valuta Asing Berdasarkan Prinsip Syariah. PADG tentang Transaksi Pasar Valuta Asing Berdasarkan Prinsip Syariah mengatur mengenai hal-hal sebagai berikut. Istilah yang digunakan dalam kegiatan transaksi. Transaksi Pasar Valuta Asing Berdasarkan Prinsip Syariah yang mencakup:transaksi yang bersifat tunai; transaksi lindung nilai berd asarkan prinsip syariah; dantransaksi valuta asing lainnya yang ditetapkan oleh Bank Indonesia. Kewajiban bank untuk memastikan penyampaian dokumen underlying transaksi oleh pelaku Transaksi Valuta Asing Berdasarkan Prinsip Syariah. Aturan terkait jumlah tertentu (threshold) dan underlyingtransaksi mencakup antara lain. Pembiayaan dari bank kepada penduduk untuk tujuan perdagangan dan investasi. Underlying transaksi lain yang ditetapkan oleh Bank IndonesiaPembulatan nilai nominal underl

# GAP ANALYSIS

In [11]:
!pip install PyPDF2



In [13]:
pdf_link_1 = "https://www.bi.go.id/id/publikasi/peraturan/PublishingImages/Pages/sp_241322/PADG_241322.pdf"  # Ganti dengan link PADG Nomor 24/13/PADG/2022
pdf_link_2 = "https://www.bi.go.id/id/publikasi/peraturan/Documents/PADG_162024.pdf"

In [17]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.1


In [19]:
import requests
import fitz  # PyMuPDF
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Langkah 1: Mengambil PDF dari URL dan mengekstrak teks
def download_and_extract_pdf(url):
    # Download PDF dari URL
    response = requests.get(url)

    # Simpan file PDF
    with open("temp.pdf", "wb") as f:
        f.write(response.content)

    # Ekstrak teks dari PDF
    pdf_document = fitz.open("temp.pdf")
    text = ""
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        text += page.get_text()

    return text

# Langkah 2: Preprocessing Teks
def preprocess_text(text):
    # Tokenisasi teks menjadi kalimat
    nltk.download('punkt')
    sentences = sent_tokenize(text)
    return sentences

# Langkah 3: Menggunakan BERT untuk mendapatkan embeddings
def get_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Langkah 4: Menghitung kemiripan antar dua dokumen
def calculate_similarity(doc1, doc2):
    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Preprocess text: split into sentences
    sentences_doc1 = preprocess_text(doc1)
    sentences_doc2 = preprocess_text(doc2)

    # Mendapatkan embeddings dari setiap kalimat
    embeddings_doc1 = [get_embedding(sentence, tokenizer, model) for sentence in sentences_doc1]
    embeddings_doc2 = [get_embedding(sentence, tokenizer, model) for sentence in sentences_doc2]

    # Menghitung cosine similarity antar kalimat yang sesuai
    similarities = []
    for emb1 in embeddings_doc1:
        for emb2 in embeddings_doc2:
            similarity = cosine_similarity([emb1.numpy()], [emb2.numpy()])
            similarities.append(similarity[0][0])

    # Rata-rata similarity
    avg_similarity = sum(similarities) / len(similarities) if similarities else 0
    return avg_similarity

# URL PDF 1 dan PDF 2
url_pdf1 = "https://www.bi.go.id/id/publikasi/peraturan/PublishingImages/Pages/sp_241322/PADG_241322.pdf"  # Ganti dengan link PADG Nomor 24/13/PADG/2022
url_pdf2 = "https://www.bi.go.id/id/publikasi/peraturan/Documents/PADG_162024.pdf"

# Download dan ekstrak teks dari kedua PDF
text_pdf1 = download_and_extract_pdf(url_pdf1)
text_pdf2 = download_and_extract_pdf(url_pdf2)

# Hitung similarity antar dokumen
similarity_score = calculate_similarity(text_pdf1, text_pdf2)
print(f"Cosine Similarity antara kedua PDF: {similarity_score}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cosine Similarity antara kedua PDF: 0.7766687716857341
