<h1><b>EXTRACK FILE PDF</b></h1>

In [1]:
!pip install PyPDF2 openpyxl pdfplumber

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Using cached pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Using cached pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Using cached pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)
  Using cached cryptography-44.0.0-cp39-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)
  Downloading cffi-1.17.1-cp312-cp312-win_amd64.whl.metadata (1.6 kB)
Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)
  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Using cached pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Using ca

In [3]:
import os
import re
import time
import pandas as pd
from PyPDF2 import PdfReader
from tabulate import tabulate
import pdfplumber
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_rows', None)  # Tampilkan semua baris
pd.set_option('display.max_columns', None)  # Tampilkan semua kolom

In [4]:
# Daftar bulan untuk pencocokan
bulan_list_id = [
    "Januari", "Februari", "Maret", "April", "Mei", "Juni",
    "Juli", "Agustus", "September", "Oktober", "November", "Desember"
]

bulan_list_en = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

# Fungsi mengambil teks dari halaman pertama file PDF
def extract_first_pages_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[0]
        text = first_page.extract_text()
    return text

# Fungsi mengambil teks dari beberapa halaman file PDF
def extract_text_from_pages(pdf_path, start_page=1, end_page=None):
    try:
        reader = PdfReader(pdf_path)
        text_content = ""
        total_pages = len(reader.pages)

        if end_page is None or end_page > total_pages:
            end_page = total_pages

        for page_num in range(start_page - 1, end_page):
            page = reader.pages[page_num]
            text_content += page.extract_text() + "\n"
        return text_content
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {str(e)}")
        return None

# Fungsi untuk mengekstrak tabel dari teks
def extract_table_data(text_content, filename):
    lines = text_content.split('\n')

    items, values, notes, file_sources = [], [], [], []
    number_pattern = r'\(?[\d,.]+\)?'
    notes_pattern_1 = r'\d+[a-zA-Z](?:,\s?[a-zA-Z\d]+)*'
    notes_pattern_2 = r'\b\d{1,2}(?:,\s?\d{1,2})*\b'

    for line in lines:
        line = line.strip()
        number_match = re.search(number_pattern, line)
        notes_match_1 = re.search(notes_pattern_1, line)
        notes_match_2 = re.search(notes_pattern_2, line)

        if number_match:
            item = re.sub(number_pattern, '', line)
            item = re.sub(notes_pattern_1, '', item)
            item = re.sub(notes_pattern_2, '', item)
            item = re.sub(r'\s+', ' ', item).strip()
            item = re.sub(r'\(\s*\).*', '', item).strip()

            numbers = re.findall(number_pattern, line)
            value = numbers[0] if numbers else ""

            if notes_match_1:
                note = notes_match_1.group()
            elif notes_match_2:
                note = notes_match_2.group()
            else:
                note = ""

            if value.startswith('(') and value.endswith(')'):
                value = '-' + value[1:-1].replace('.', '').replace(',', '')
            else:
                value = value.replace('.', '').replace(',', '')

            items.append(item)
            values.append(value)
            notes.append(note)
            file_sources.append(filename)

    df = pd.DataFrame({
        'Item': items,
        'Notes': notes,
        'Value': values,
        'Source_File': file_sources
    })
    return df

# Fungsi untuk mengekstrak nama perusahaan, bulan, dan tahun
def find_company_and_month_year(text):
    company_pattern = re.compile(r"(PT\s[\w\s]+,\s?Tbk|PT\s[\w\s]+Tbk)", re.IGNORECASE)
    company_match = company_pattern.search(text)

    months_years = []
    for bulan in bulan_list_id + bulan_list_en:
        pattern = re.compile(rf"(\b{bulan}\b)\s(\d{{4}})", re.IGNORECASE)
        matches = pattern.findall(text)
        for match in matches:
            month, year = match
            months_years.append((month.strip(), int(year)))

    company_name = company_match.group(1) if company_match else "Tidak ditemukan"
    return company_name, months_years

# Fungsi untuk memproses semua file PDF dalam folder
def process_pdf_folder(folder_path, table_start_page=1, table_end_page=8):
    table_dfs = []
    info_dfs = []
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return None

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        print(f"Processing {pdf_file}...")

        first_page_text = extract_first_pages_pdf(pdf_path)
        if first_page_text:
            company_name, months_years = find_company_and_month_year(first_page_text)

            if months_years:
                latest_month, latest_year = months_years[-1]
                info_df = pd.DataFrame({
                    'Nama Perusahaan': [company_name],
                    'Bulan': [latest_month],
                    'Tahun': [latest_year],
                    'Source_File': [pdf_file]
                })
                info_dfs.append(info_df)

        table_text = extract_text_from_pages(pdf_path, table_start_page, table_end_page)
        if table_text:
            table_df = extract_table_data(table_text, pdf_file)
            if not table_df.empty:
                table_dfs.append(table_df)

    if table_dfs and info_dfs:
        combined_table_df = pd.concat(table_dfs, ignore_index=True)
        combined_info_df = pd.concat(info_dfs, ignore_index=True)

        merged_df = pd.merge(combined_info_df, combined_table_df, on='Source_File', how='left')
        return merged_df
    else:
        print("No data was extracted from any PDF files")
        return None

# Main function
if __name__ == "__main__":
    folder_path = 'C:/TI-Dwika/Semester 7/Pangkalan Data/file_pdf'
    merged_pdf = process_pdf_folder(folder_path)

    if merged_pdf is not None:
        print("\nExtracted Data:")
        print(tabulate(merged_pdf, headers='keys', tablefmt='grid', showindex=False))
    else:
        print("No data extracted.")


Processing AGAR TW 2 KONSOL 2024.pdf...
Processing FKS Food Sejahtera TBK billingual 31 Maret 2024.pdf...
Processing Lapkeu ABBA 31 Mar 2024.pdf...
Processing LKA PT Samcro Hyosung Adilestari Tbk Maret 2024.pdf...

Extracted Data:
+----------------------------------+----------+---------+-----------------------------------------------------+------------------------------------------------------------------------------------------------+----------------+---------------+
| Nama Perusahaan                  | Bulan    |   Tahun | Source_File                                         | Item                                                                                           | Notes          | Value         |
| PT ASIA SEJAHTERA MINA, Tbk      | JUNI     |    2024 | AGAR TW 2 KONSOL 2024.pdf                           | PT ASIA SEJAHTERA MINA Tbk                                                                     |                |               |
+----------------------------------+-------

<h2><b>EKSTRAK NOTES</b></h2>

In [66]:
def extract_text_from_pdf(pdf_path, start_page=1, end_page=None):
    """
    Extract text content from a PDF file, limited to specific page range.
    """
    try:
        reader = PdfReader(pdf_path)
        text_content = ""
        total_pages = len(reader.pages)

        # Adjust end_page if it's not provided or exceeds total pages
        if end_page is None or end_page > total_pages:
            end_page = total_pages

        # Extract text from the specified page range
        for page_num in range(start_page - 1, end_page):
            page = reader.pages[page_num]
            text_content += page.extract_text() + "\n"
        return text_content
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {str(e)}")
        return None


def extract_table_data(text_content, filename):
    """
    Extract table data from text content with notes and values.
    Handles different table formats where notes and values might be in different positions.
    """
    # Split content into lines
    lines = text_content.split('\n')

    # Initialize lists to store extracted data
    items = []
    values = []
    notes = []
    file_sources = []  # To track which file each row came from

    # Regular expressions for matching
    number_pattern = r'\(?[\d,.]+\)?'  # Matches numbers with 6 or more digits

    # First notes pattern: Matches patterns like 2f, 2v, or 2v,2g
    notes_pattern_1 = r'\d+[a-zA-Z](?:,\s?[a-zA-Z\d]+)*'  # Matches patterns like 2v, 2g, 3f, or 2v,2g

    # Second notes pattern: Matches just numbers (like 22)
    notes_pattern_2 = r'\b\d{1,2}(?:,\s?\d{1,2})*\b'

    for line in lines:
        # Clean the line
        line = line.strip()

        # Try to identify components
        number_match = re.search(number_pattern, line)
        notes_match_1 = re.search(notes_pattern_1, line)
        notes_match_2 = re.search(notes_pattern_2, line)

        if number_match:
            # Extract the item name by removing numbers and notes
            item = re.sub(number_pattern, '', line)  # Remove numbers from item
            item = re.sub(notes_pattern_1, '', item)  # Remove notes from item
            item = re.sub(notes_pattern_2, '', item)  # Remove additional notes
            item = re.sub(r'\s+', ' ', item).strip()  # Replace multiple spaces with a single space

            # Remove empty parentheses and remaining text after them
            item = re.sub(r'\(\s*\).*', '', item).strip()

            # Remove single or double letters and all remaining text after them
            item = re.sub(r'\s[a-zA-Z]{1,2}\s.*', '', item).strip()

            # # Remove remaining words if they start with an uppercase letter
            # item = re.sub(r'\s[A-Z][a-zA-Z]*.*', '', item).strip()

            # Remove remaining words if they start with an uppercase letter, but not all caps
            item = re.sub(r'\s([A-Z][a-z][a-zA-Z]*)\b.*', '', item).strip()

            numbers = re.findall(number_pattern, line)
            if len(numbers) > 1:
                # General case: pick the second number, or use any logic to decide which number to take
                value = numbers[1]  # Take the second number (can be adjusted)
            elif len(numbers) == 1:
                value = numbers[0]
            else:
                # If no numbers found, return an empty string
                value = " "

            # Extract value
            # value = number_match.group()
            
            # Use the first notes pattern if it matches (allows comma-separated notes)
            if notes_match_1:
                note = notes_match_1.group()
            # Only use the second notes pattern if the first one didn't match
            elif notes_match_2 and not notes_match_1:
                note = notes_match_2.group()
            else:
                note = ""  # Default case if no notes are found

            # Directly append to lists without any additional condition
            values.append(value)
            notes.append(note)

            # Only include if we have all components and it's not a total
            if item and not item.lower().startswith('total'):
                items.append(item)
                file_sources.append(filename)  # Add filename to track source

            # Ensure lists remain the same length by filling any missing values
            while len(items) < len(values):
                items.append("")
            while len(notes) < len(values):
                notes.append("")
            while len(file_sources) < len(values):
                file_sources.append("")

    # Create DataFrame
    df = pd.DataFrame({
        'Item': items,
        'Notes': notes,
        'Value': values,
        'Source_File': file_sources
    })
    
    return df


# import re
# import pandas as pd

# def extract_table_data(text_content, filename):
#     """
#     Extract table data from text content with notes and values.
#     Handles different table formats where notes and values might be in different positions.
#     """
#     # Split content into lines
#     lines = text_content.split('\n')

#     # Initialize lists to store extracted data
#     items = []
#     values = []
#     notes = []
#     file_sources = []  # To track which file each row came from

#     # Regular expressions for matching
#     number_pattern = r'[\d,.]+'  # Matches numbers with commas and periods

#     # First notes pattern: Matches patterns like 2f, 2v, or 2v,2g
#     notes_pattern_1 = r'\d+[a-zA-Z](?:,\s?[a-zA-Z\d]+)*'  # Matches patterns like 2v, 2g, 3f, or 2v,2g

#     # Second notes pattern: Matches just numbers (like 22)
#     notes_pattern_2 = r'\b\d{1,2}(?:,\s?\d{1,2})*\b'

#     for line in lines:
#         # Clean the line
#         line = line.strip()

#         # Try to identify components
#         notes_match_1 = re.search(notes_pattern_1, line)
#         notes_match_2 = re.search(notes_pattern_2, line)

#         # Extract numbers from the line
#         numbers = re.findall(number_pattern, line)
        
#         # Default value for note and value
#         note = ""
#         value = ""

#         # Case 1: If notes pattern 1 matches (like 29d, 2f)
#         if notes_match_1:
#             note = notes_match_1.group()  # Capture the notes (e.g., '29d')
#             # Find numbers after the note
#             notes_end_pos = line.find(note) + len(note)
#             line_after_notes = line[notes_end_pos:].strip()
#             numbers_after_notes = re.findall(number_pattern, line_after_notes)
#             value = numbers_after_notes[0] if numbers_after_notes else numbers[0]  # Value after notes, or first number

#         # Case 2: If notes pattern 2 matches (like 22, 12)
#         elif notes_match_2:
#             note = notes_match_2.group()  # Capture the notes (e.g., '22')
#             # Take the first number as the value
#             value = numbers[0] if numbers else ""

#         # Case 3: If no notes pattern matches, take the first number as the value
#         elif numbers:
#             value = numbers[0]

#         # Clean the item name by removing numbers and notes
#         item = re.sub(number_pattern, '', line)  # Remove all numbers from item
#         item = re.sub(f"{note}$", '', item).strip()  # Remove notes from item
#         item = re.sub(r'\s+', ' ', item).strip()  # Replace multiple spaces with a single space

#         # Clean up further
#         item = re.sub(r'\(\s*\).*', '', item).strip()  # Remove empty parentheses and text after them
#         item = re.sub(r'\s[a-zA-Z]{1,2}\s.*', '', item).strip()  # Remove single/double letters and text after them
#         item = re.sub(r'\s([A-Z][a-z][a-zA-Z]*)\b.*', '', item).strip()  # Remove words starting with uppercase letters

#         # Append the extracted values
#         values.append(value)
#         notes.append(note)

#         # Only include if we have all components and it's not a total
#         if item and not item.lower().startswith('total'):
#             items.append(item)
#             file_sources.append(filename)  # Add filename to track source

#         # Ensure lists remain the same length by filling any missing values
#         while len(items) < len(values):
#             items.append("")
#         while len(notes) < len(values):
#             notes.append("")
#         while len(file_sources) < len(values):
#             file_sources.append("")

#     # Create DataFrame
#     df = pd.DataFrame({
#         'Item': items,
#         'Notes': notes,
#         'Value': values,
#         'Source_File': file_sources
#     })
    
#     return df




def process_folder(folder_path, start_page=1, end_page=8):
    """
    Process all PDF files in a folder and combine results, limited to specific page range.
    """
    # List to store all DataFrames
    all_dfs = []

    # Get all PDF files in the folder
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return None

    # Process each PDF file
    for pdf_file in pdf_files:
        print(f"Processing {pdf_file}...")
        pdf_path = os.path.join(folder_path, pdf_file)

        # Extract text from PDF
        text_content = extract_text_from_pdf(pdf_path, start_page, end_page)

        if text_content:
            # Process the content
            df = extract_table_data(text_content, pdf_file)
            if not df.empty:
                all_dfs.append(df)
            else:
                print(f"No relevant data found in {pdf_file}")

    if all_dfs:
        # Combine all DataFrames
        combined_df = pd.concat(all_dfs, ignore_index=True)

        # Clean up the DataFrame
        combined_df['Item'] = combined_df['Item'].str.strip()
        combined_df['Notes'] = combined_df['Notes'].str.strip()
        # combined_df['Value'] = combined_df['Value'].str.strip()

        return combined_df
    else:
        print("No data was extracted from any PDF files")
        return None


def main(folder_path, start_page=1, end_page=8):
    """
    Main function to process folder and save results.
    """
    # Process the folder with specific page range
    df = process_folder(folder_path, start_page, end_page)

    if df is not None:
        # Display summary
        print(f"\nTotal rows extracted: {len(df)}")
        print(f"Files processed: {df['Source_File'].nunique()}")

        # Display sample of results
        print("\nSample of extracted data:")
        print(df.to_string())

    return df

# Example usage
if __name__ == "__main__":
    # Replace with your folder path
    folder_path = "C:/TI-Dwika/Semester 7/Pangkalan Data/file_pdf"
    start_page = 1
    end_page = 8
    df_data = main(folder_path, start_page, end_page)

Processing AGAR TW 2 KONSOL 2024.pdf...
Processing FKS Food Sejahtera TBK_billingual_31 Maret 2024.pdf...
Processing Lapkeu ABBA 31 Mar 2024 (1).pdf...
Processing LKA PT Samcro Hyosung Adilestari Tbk Maret 2024_.pdf...

Total rows extracted: 539
Files processed: 5

Sample of extracted data:
                                                                    Item           Notes                Value                                           Source_File
0                                                 PT ASIA SEJAHTERA MINA                                    ,                             AGAR TW 2 KONSOL 2024.pdf
1                                                                 JUNI /              30                 2024                             AGAR TW 2 KONSOL 2024.pdf
2                                                                   JUNE              30                  202                             AGAR TW 2 KONSOL 2024.pdf
3                                             PT ASI

In [67]:
def extract_text_from_pdf(pdf_path, start_page=1, end_page=None):
    """
    Extract text content from a PDF file, limited to specific page range.
    """
    try:
        reader = PdfReader(pdf_path)
        text_content = ""
        total_pages = len(reader.pages)

        # Adjust end_page if it's not provided or exceeds total pages
        if end_page is None or end_page > total_pages:
            end_page = total_pages

        # Extract text from the specified page range
        for page_num in range(start_page - 1, end_page):
            page = reader.pages[page_num]
            text_content += page.extract_text() + "\n"
        return text_content
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {str(e)}")
        return None


def extract_table_data(text_content, filename):
    """
    Extract table data from text content with notes and values.
    Handles different table formats where notes and values might be in different positions.
    """
    # Split content into lines
    lines = text_content.split('\n')

    # Initialize lists to store extracted data
    items = []
    values = []
    notes = []
    file_sources = []  # To track which file each row came from

    # Regular expressions for matching
    number_pattern = r'\(?[\d,.]+\)?'  # Matches numbers with 6 or more digits

    # First notes pattern: Matches patterns like 2f, 2v, or 2v,2g
    notes_pattern_1 = r'\d+[a-zA-Z](?:,\s?[a-zA-Z\d]+)*'  # Matches patterns like 2v, 2g, 3f, or 2v,2g

    # Second notes pattern: Matches just numbers (like 22)
    notes_pattern_2 = r'\b\d{1,2}(?:,\s?\d{1,2})*\b'

    for line in lines:
        # Clean the line
        line = line.strip()

        # Try to identify components
        number_match = re.search(number_pattern, line)
        notes_match_1 = re.search(notes_pattern_1, line)
        notes_match_2 = re.search(notes_pattern_2, line)

        if number_match:
            # Extract the item name by removing numbers and notes
            item = re.sub(number_pattern, '', line)  # Remove numbers from item
            item = re.sub(notes_pattern_1, '', item)  # Remove notes from item
            item = re.sub(notes_pattern_2, '', item)  # Remove additional notes
            item = re.sub(r'\s+', ' ', item).strip()  # Replace multiple spaces with a single space

            # Remove empty parentheses and remaining text after them
            item = re.sub(r'\(\s*\).*', '', item).strip()

            # Remove single or double letters and all remaining text after them
            item = re.sub(r'\s[a-zA-Z]{1,2}\s.*', '', item).strip()

            # # Remove remaining words if they start with an uppercase letter
            # item = re.sub(r'\s[A-Z][a-zA-Z]*.*', '', item).strip()

            # Remove remaining words if they start with an uppercase letter, but not all caps
            item = re.sub(r'\s([A-Z][a-z][a-zA-Z]*)\b.*', '', item).strip()

            numbers = re.findall(number_pattern, line)
            if len(numbers) > 1:
                # General case: pick the second number, or use any logic to decide which number to take
                value = numbers[1]  # Take the second number (can be adjusted)
            elif len(numbers) == 1:
                value = numbers[0]
            else:
                # If no numbers found, return an empty string
                value = " "

            # Extract value
            # value = number_match.group()
            
            # Use the first notes pattern if it matches (allows comma-separated notes)
            if notes_match_1:
                note = notes_match_1.group()
            # Only use the second notes pattern if the first one didn't match
            elif notes_match_2 and not notes_match_1:
                note = notes_match_2.group()
            else:
                note = ""  # Default case if no notes are found

            # Modify values: remove commas, periods, and handle parentheses for negative values
            if value.startswith('(') and value.endswith(')'):
                # Remove parentheses, then make it negative
                value = '-' + value[1:-1].replace('.', '').replace(',', '')
            else:
                # Just remove commas and periods
                value = value.replace('.', '').replace(',', '')

            
            # Directly append to lists without any additional condition
            values.append(value)
            notes.append(note)

            # Only include if we have all components and it's not a total
            if item and not item.lower().startswith('total'):
                items.append(item)
                file_sources.append(filename)  # Add filename to track source

            # Ensure lists remain the same length by filling any missing values
            while len(items) < len(values):
                items.append("")
            while len(notes) < len(values):
                notes.append("")
            while len(file_sources) < len(values):
                file_sources.append("")

    # Create DataFrame
    df = pd.DataFrame({
        'Item': items,
        'Notes': notes,
        'Value': values,
        'Source_File': file_sources
    })
    
    return df


# import re
# import pandas as pd

# def extract_table_data(text_content, filename):
#     """
#     Extract table data from text content with notes and values.
#     Handles different table formats where notes and values might be in different positions.
#     """
#     # Split content into lines
#     lines = text_content.split('\n')

#     # Initialize lists to store extracted data
#     items = []
#     values = []
#     notes = []
#     file_sources = []  # To track which file each row came from

#     # Regular expressions for matching
#     number_pattern = r'[\d,.]+'  # Matches numbers with commas and periods

#     # First notes pattern: Matches patterns like 2f, 2v, or 2v,2g
#     notes_pattern_1 = r'\d+[a-zA-Z](?:,\s?[a-zA-Z\d]+)*'  # Matches patterns like 2v, 2g, 3f, or 2v,2g

#     # Second notes pattern: Matches just numbers (like 22)
#     notes_pattern_2 = r'\b\d{1,2}(?:,\s?\d{1,2})*\b'

#     for line in lines:
#         # Clean the line
#         line = line.strip()

#         # Try to identify components
#         notes_match_1 = re.search(notes_pattern_1, line)
#         notes_match_2 = re.search(notes_pattern_2, line)

#         # Extract numbers from the line
#         numbers = re.findall(number_pattern, line)
        
#         # Default value for note and value
#         note = ""
#         value = ""

#         # Case 1: If notes pattern 1 matches (like 29d, 2f)
#         if notes_match_1:
#             note = notes_match_1.group()  # Capture the notes (e.g., '29d')
#             # Find numbers after the note
#             notes_end_pos = line.find(note) + len(note)
#             line_after_notes = line[notes_end_pos:].strip()
#             numbers_after_notes = re.findall(number_pattern, line_after_notes)
#             value = numbers_after_notes[0] if numbers_after_notes else numbers[0]  # Value after notes, or first number

#         # Case 2: If notes pattern 2 matches (like 22, 12)
#         elif notes_match_2:
#             note = notes_match_2.group()  # Capture the notes (e.g., '22')
#             # Take the first number as the value
#             value = numbers[0] if numbers else ""

#         # Case 3: If no notes pattern matches, take the first number as the value
#         elif numbers:
#             value = numbers[0]

#         # Clean the item name by removing numbers and notes
#         item = re.sub(number_pattern, '', line)  # Remove all numbers from item
#         item = re.sub(f"{note}$", '', item).strip()  # Remove notes from item
#         item = re.sub(r'\s+', ' ', item).strip()  # Replace multiple spaces with a single space

#         # Clean up further
#         item = re.sub(r'\(\s*\).*', '', item).strip()  # Remove empty parentheses and text after them
#         item = re.sub(r'\s[a-zA-Z]{1,2}\s.*', '', item).strip()  # Remove single/double letters and text after them
#         item = re.sub(r'\s([A-Z][a-z][a-zA-Z]*)\b.*', '', item).strip()  # Remove words starting with uppercase letters

#         # Append the extracted values
#         values.append(value)
#         notes.append(note)

#         # Only include if we have all components and it's not a total
#         if item and not item.lower().startswith('total'):
#             items.append(item)
#             file_sources.append(filename)  # Add filename to track source

#         # Ensure lists remain the same length by filling any missing values
#         while len(items) < len(values):
#             items.append("")
#         while len(notes) < len(values):
#             notes.append("")
#         while len(file_sources) < len(values):
#             file_sources.append("")

#     # Create DataFrame
#     df = pd.DataFrame({
#         'Item': items,
#         'Notes': notes,
#         'Value': values,
#         'Source_File': file_sources
#     })
    
#     return df




def process_folder(folder_path, start_page=1, end_page=8):
    """
    Process all PDF files in a folder and combine results, limited to specific page range.
    """
    # List to store all DataFrames
    all_dfs = []

    # Get all PDF files in the folder
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return None

    # Process each PDF file
    for pdf_file in pdf_files:
        print(f"Processing {pdf_file}...")
        pdf_path = os.path.join(folder_path, pdf_file)

        # Extract text from PDF
        text_content = extract_text_from_pdf(pdf_path, start_page, end_page)

        if text_content:
            # Process the content
            df = extract_table_data(text_content, pdf_file)
            if not df.empty:
                all_dfs.append(df)
            else:
                print(f"No relevant data found in {pdf_file}")

    if all_dfs:
        # Combine all DataFrames
        combined_df = pd.concat(all_dfs, ignore_index=True)

        # Clean up the DataFrame
        combined_df['Item'] = combined_df['Item'].str.strip()
        combined_df['Notes'] = combined_df['Notes'].str.strip()
        # combined_df['Value'] = combined_df['Value'].str.strip()

        return combined_df
    else:
        print("No data was extracted from any PDF files")
        return None


def main(folder_path, start_page=1, end_page=8):
    """
    Main function to process folder and save results.
    """
    # Process the folder with specific page range
    df = process_folder(folder_path, start_page, end_page)

    if df is not None:
        # Display summary
        print(f"\nTotal rows extracted: {len(df)}")
        print(f"Files processed: {df['Source_File'].nunique()}")

        # Display sample of results
        print("\nSample of extracted data:")
        print(df.to_string())

    return df

# Example usage
if __name__ == "__main__":
    # Replace with your folder path
    folder_path = "C:/TI-Dwika/Semester 7/Pangkalan Data/file_pdf"
    start_page = 1
    end_page = 8
    df_data = main(folder_path, start_page, end_page)

Processing AGAR TW 2 KONSOL 2024.pdf...
Processing FKS Food Sejahtera TBK_billingual_31 Maret 2024.pdf...
Processing Lapkeu ABBA 31 Mar 2024 (1).pdf...
Processing LKA PT Samcro Hyosung Adilestari Tbk Maret 2024_.pdf...

Total rows extracted: 539
Files processed: 5

Sample of extracted data:
                                                                    Item           Notes          Value                                           Source_File
0                                                 PT ASIA SEJAHTERA MINA                                                            AGAR TW 2 KONSOL 2024.pdf
1                                                                 JUNI /              30           2024                             AGAR TW 2 KONSOL 2024.pdf
2                                                                   JUNE              30            202                             AGAR TW 2 KONSOL 2024.pdf
3                                             PT ASIA SEJAHTERA MINA TBK    

<h2><b>ESKTRAK IDENTIFIER</b></h2>

In [4]:
# Daftar bulan untuk pencocokan
bulan_list_id = [
    "Januari", "Februari", "Maret", "April", "Mei", "Juni",
    "Juli", "Agustus", "September", "Oktober", "November", "Desember"
]

bulan_list_en = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

# Fungsi untuk membaca teks dari file PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        # Ambil teks dari halaman pertama
        first_page = pdf.pages[0]
        text = first_page.extract_text()
    return text

# Fungsi untuk mengekstrak nama perusahaan dan bulan/tahun dari teks PDF
def find_company_and_month_year(text):
    # Pencocokan nama perusahaan yang dimulai dengan "PT" dan mengandung "Tbk" atau variasinya
    company_pattern = re.compile(r"(PT\s[\w\s]+,\s?Tbk|PT\s[\w\s]+Tbk)", re.IGNORECASE)
    company_match = company_pattern.search(text)
    
    # Pencocokan bulan dan tahun (baik bahasa Indonesia maupun Inggris)
    months_years = []
    for bulan in bulan_list_id + bulan_list_en:  # Menggabungkan bulan dalam bahasa Indonesia dan Inggris
        # Regex untuk menemukan bulan dan tahun setelahnya
        pattern = re.compile(rf"(\b{bulan}\b)\s(\d{{4}})", re.IGNORECASE)
        
        # Cari kecocokan dengan pola regex
        matches = pattern.findall(text)
        
        for match in matches:
            month, year = match
            months_years.append((month.strip(), int(year)))  # Simpan bulan dan tahun sebagai tuple

    # Menyimpan nama perusahaan dan bulan/tahun yang ditemukan
    company_name = company_match.group(1) if company_match else "Tidak ditemukan"
    
    return company_name, months_years

# Fungsi untuk mendapatkan bulan dan tahun terbaru dari list bulan/tahun
def get_latest_month_and_year(months_years):
    # Jika tidak ada bulan atau tahun yang ditemukan
    if not months_years:
        return None, None
    
    # Menemukan tahun terbesar dan bulan yang sesuai dengan tahun tersebut
    latest_year = max(months_years, key=lambda x: x[1])[1]  # Ambil tahun terbesar
    latest_month = next(month for month, year in months_years if year == latest_year)  # Ambil bulan yang sesuai dengan tahun terbesar
    
    return latest_month, latest_year

# Fungsi untuk mengonversi bulan menjadi kuartal
def convert_month_to_quarter(month):
    # Normalisasi nama bulan
    month = month.strip().capitalize()
    
    q1 = ["Januari", "February", "Maret", "March"]
    q2 = ["April", "Mei", "May", "Juni", "June"]
    q3 = ["Juli", "July", "Agustus", "August", "September"]
    q4 = ["Oktober", "October", "November", "Desember", "December"]
    
    if month in q1:
        return "Kuartal I/First Quarter"
    elif month in q2:
        return "Kuartal II/Second Quarter"
    elif month in q3:
        return "Kuartal III/Third Quarter"
    elif month in q4:
        return "Tahunan/Annual"
    else:
        return "Unknown"

# Fungsi untuk memproses banyak file PDF dalam folder dan menghasilkan DataFrame
def extract_data_from_folder(folder_path):
    data = []
    
    # Loop untuk memproses setiap file PDF dalam folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(pdf_path)
            
            # Temukan nama perusahaan dan bulan/tahun dari teks
            company_name, months_years = find_company_and_month_year(text)
            
            # Ambil bulan dan tahun terbaru
            latest_month, latest_year = get_latest_month_and_year(months_years)
            
            # Konversi bulan menjadi kuartal
            if latest_month and latest_year:
                quarter = convert_month_to_quarter(latest_month)
                data.append([company_name, quarter, latest_year, filename])  # Tambahkan nama file sebagai Source_File
    
    # Membuat DataFrame dari hasil ekstraksi
    df = pd.DataFrame(data, columns=["Nama Perusahaan", "Kuartal", "Tahun", "Source_File"])
    return df

# Ganti dengan path folder yang berisi file PDF
folder_path = "C:/TI-Dwika/Semester 7/Pangkalan Data/file_pdf"

# Ekstrak data dari folder dan tampilkan DataFrame
df_info = extract_data_from_folder(folder_path)
print(df_info)


                    Nama Perusahaan                    Kuartal  Tahun  \
0       PT ASIA SEJAHTERA MINA, Tbk  Kuartal II/Second Quarter   2024   
1         PT FKS Food Sejahtera Tbk    Kuartal I/First Quarter   2024   
2             PT ACSET INDONUSA Tbk  Kuartal II/Second Quarter   2024   
3               PT MAHAKA MEDIA TBK    Kuartal I/First Quarter   2024   
4  PT SAMCRO HYOSUNG ADILESTARI TBK    Kuartal I/First Quarter   2024   

                                         Source_File  
0                          AGAR TW 2 KONSOL 2024.pdf  
1  FKS Food Sejahtera TBK billingual 31 Maret 202...  
2             FS PT Acset Indonusa Tbk Juni 2024.pdf  
3                        Lapkeu ABBA 31 Mar 2024.pdf  
4  LKA PT Samcro Hyosung Adilestari Tbk Maret 202...  


In [None]:
# Fungsi untuk menggabungkan df_data dan df_file_info
# Gabungkan kedua DataFrame berdasarkan 'Source_File' yang menyimpan nama file PDF
merged_df = pd.merge(df_info, df_data, on='Source_File', how='left')

# Menghapus kolom 'Source_File' dari DataFrame yang telah digabungkan
merged_df = merged_df.drop(columns=['Source_File'])

merged_df


In [None]:
import os
import re
import time
import pandas as pd
from PyPDF2 import PdfReader
from sqlalchemy import create_engine
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler


# Daftar bulan untuk pencocokan
bulan_list_id = [
    "Januari", "Februari", "Maret", "April", "Mei", "Juni",
    "Juli", "Agustus", "September", "Oktober", "November", "Desember"
]

bulan_list_en = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]


def extract_text_from_pdf(pdf_path, start_page=1, end_page=None):
    try:
        reader = PdfReader(pdf_path)
        text_content = ""
        total_pages = len(reader.pages)

        # Adjust end_page if it's not provided or exceeds total pages
        if end_page is None or end_page > total_pages:
            end_page = total_pages

        # Extract text from the specified page range
        for page_num in range(start_page - 1, end_page):
            page = reader.pages[page_num]
            text_content += page.extract_text() + "\n"
        return text_content
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {str(e)}")
        return None


def extract_table_data(text_content, filename):
    # Split content into lines
    lines = text_content.split('\n')

    # Initialize lists to store extracted data
    items = []
    values = []
    notes = []
    file_sources = []  # To track which file each row came from

    # Regular expressions for matching
    number_pattern = r'\(?[\d,.]+\)?'  # Matches numbers with 6 or more digits

    # First notes pattern: Matches patterns like 2f, 2v, or 2v,2g
    notes_pattern_1 = r'\d+[a-zA-Z](?:,\s?[a-zA-Z\d]+)*'  # Matches patterns like 2v, 2g, 3f, or 2v,2g

    # Second notes pattern: Matches just numbers (like 22)
    notes_pattern_2 = r'\b\d{1,2}(?:,\s?\d{1,2})*\b'

    for line in lines:
        # Clean the line
        line = line.strip()

        # Try to identify components
        number_match = re.search(number_pattern, line)
        notes_match_1 = re.search(notes_pattern_1, line)
        notes_match_2 = re.search(notes_pattern_2, line)

        if number_match:
            # Extract the item name by removing numbers and notes
            item = re.sub(number_pattern, '', line)  # Remove numbers from item
            item = re.sub(notes_pattern_1, '', item)  # Remove notes from item
            item = re.sub(notes_pattern_2, '', item)  # Remove additional notes
            item = re.sub(r'\s+', ' ', item).strip()  # Replace multiple spaces with a single space

            # Remove empty parentheses and remaining text after them
            item = re.sub(r'\(\s*\).*', '', item).strip()

            # Remove single or double letters and all remaining text after them
            item = re.sub(r'\s[a-zA-Z]{1,2}\s.*', '', item).strip()

            # # Remove remaining words if they start with an uppercase letter
            # item = re.sub(r'\s[A-Z][a-zA-Z]*.*', '', item).strip()

            # Remove remaining words if they start with an uppercase letter, but not all caps
            item = re.sub(r'\s([A-Z][a-z][a-zA-Z]*)\b.*', '', item).strip()

            numbers = re.findall(number_pattern, line)
            if len(numbers) > 1:
                # General case: pick the second number, or use any logic to decide which number to take
                value = numbers[1]  # Take the second number (can be adjusted)
            elif len(numbers) == 1:
                value = numbers[0]
            else:
                # If no numbers found, return an empty string
                value = " "

            # Extract value
            # value = number_match.group()
            
            # Use the first notes pattern if it matches (allows comma-separated notes)
            if notes_match_1:
                note = notes_match_1.group()
            # Only use the second notes pattern if the first one didn't match
            elif notes_match_2 and not notes_match_1:
                note = notes_match_2.group()
            else:
                note = "" 

            # Directly append to lists without any additional condition
            values.append(value)
            notes.append(note)

            # Only include if we have all components and it's not a total
            if item and not item.lower().startswith('total'):
                items.append(item)
                file_sources.append(filename) 

            # Ensure lists remain the same length by filling any missing values
            while len(items) < len(values):
                items.append("")
            while len(notes) < len(values):
                notes.append("")
            while len(file_sources) < len(values):
                file_sources.append("")

    # Create DataFrame
    df = pd.DataFrame({
        'Item': items,
        'Notes': notes,
        'Value': values,
        'Source_File': file_sources
    })
    
    return df

def find_company_and_month_year(text):
    # Company name pattern
    company_pattern = re.compile(r"(PT\s[\w\s]+,\s?Tbk|PT\s[\w\s]+Tbk)", re.IGNORECASE)
    company_match = company_pattern.search(text)
    
    # Month and year matching
    months_years = []
    for bulan in bulan_list_id + bulan_list_en:
        pattern = re.compile(rf"(\b{bulan}\b)\s(\d{{4}})", re.IGNORECASE)
        
        # Find matches with regex pattern
        matches = pattern.findall(text)
        
        for match in matches:
            month, year = match
            months_years.append((month.strip(), int(year)))

    # Get company name
    company_name = company_match.group(1) if company_match else "Tidak ditemukan"
    
    return company_name, months_years

def get_latest_month_and_year(months_years):
    """
    Get the latest month and year from the list
    """
    if not months_years:
        return None, None
    
    latest_year = max(months_years, key=lambda x: x[1])[1]
    latest_month = next(month for month, year in months_years if year == latest_year)
    
    return latest_month, latest_year

def convert_month_to_quarter(month):
    # Normalize month name
    month = month.strip().capitalize()
    
    q1 = ["Januari", "February", "Maret", "March"]
    q2 = ["April", "Mei", "May", "Juni", "June"]
    q3 = ["Juli", "July", "Agustus", "August", "September"]
    q4 = ["Oktober", "October", "November", "Desember", "December"]
    
    if month in q1:
        return "Kuartal I/First Quarter"
    elif month in q2:
        return "Kuartal II/Second Quarter"
    elif month in q3:
        return "Kuartal III/Third Quarter"
    elif month in q4:
        return "Tahunan/Annual"
    else:
        return "Unknown"

def filter_dataframe(df):
    if df is None:
        return None
    
    # Function to check if a value is valid
    def is_valid_value(value):
        # Convert to string to handle various input types
        str_value = str(value).strip()
        
        unwanted_patterns = [
            r'^[\.]+$',  # hanya dots
            r'^2024$',   # hanya 2024
            r'^2023$',   # hanya 2023
            r'^[\.]{5,}$'  # 5 atau lebih dots
        ]
        
        # Check if value matches any unwanted pattern
        for pattern in unwanted_patterns:
            if re.match(pattern, str_value):
                return False
        
        # Check if value has at least 5 digits
        digit_count = len(re.findall(r'\d', str_value))
        
        return digit_count >= 1
    
    # Filter the DataFrame
    filtered_df = df[df['Value'].apply(is_valid_value)]
    
    # Print out how many rows were removed
    original_rows = len(df)
    filtered_rows = len(filtered_df)
    removed_rows = original_rows - filtered_rows
    print(f"Filtered out {removed_rows} rows (from {original_rows} to {filtered_rows} rows)")
    
    return filtered_df

def process_pdf_folder(folder_path, start_page=1, end_page=8):
    # Lists to store DataFrames
    table_dfs = []
    info_dfs = []

    # Get all PDF files in the folder
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return None

    # Process each PDF file
    for pdf_file in pdf_files:
        print(f"Processing {pdf_file}...")
        pdf_path = os.path.join(folder_path, pdf_file)

        # Extract text from PDF
        text_content = extract_text_from_pdf(pdf_path, start_page, end_page)

        if text_content:
            # Process table data
            table_df = extract_table_data(text_content, pdf_file)
            if not table_df.empty:
                table_dfs.append(table_df)
            
            # Process file info
            company_name, months_years = find_company_and_month_year(text_content)
            latest_month, latest_year = get_latest_month_and_year(months_years)
            
            if latest_month and latest_year:
                quarter = convert_month_to_quarter(latest_month)
                info_df = pd.DataFrame({
                    'Nama Perusahaan': [company_name],
                    'Kuartal': [quarter],
                    'Tahun': [latest_year],
                    'Source_File': [pdf_file]
                })
                info_dfs.append(info_df)

    # Combine DataFrames
    if table_dfs and info_dfs:
        combined_table_df = pd.concat(table_dfs, ignore_index=True)
        combined_info_df = pd.concat(info_dfs, ignore_index=True)
        
        # Merge the two DataFrames
        merged_pdf = pd.merge(combined_info_df, combined_table_df, on='Source_File', how='left')
        
        # Drop the Source_File column
        merged_pdf = merged_pdf.drop(columns=['Source_File'])
        
        # Filter the merged DataFrame
        filtered_pdf = filter_dataframe(merged_pdf)
        
        return filtered_pdf
    else:
        print("No data was extracted from any PDF files")
        return None

def get_merged_pdf_data(folder_path):
    merged_pdf = process_pdf_folder(folder_path)
    
    if merged_pdf is not None:
        print("\nMerged and Filtered PDF Data:")
        print(merged_pdf)
        return merged_pdf
    else:
        print("Tidak ada data yang berhasil diekstrak.")
        return None


def export_to_mysql(merged_df, table_name='tb_pdf', 
                    mysql_user='root', 
                    mysql_password='Dika007!#', 
                    mysql_host='localhost', 
                    mysql_port='3306', 
                    db_name='db_calk'):
    try:
        # Rename columns to match database schema
        column_mapping = {
            'Nama Perusahaan': 'nama_perusahaan_pdf',
            'Kuartal': 'kuartal_pdf',
            'Tahun': 'tahun_pdf',
            'Item': 'item_pdf',
            'Notes': 'notes_pdf',
            'Value': 'value_pdf',
        }
        
        # Rename columns using the mapping
        merged_df = merged_df.rename(columns=column_mapping)
        # Create SQLAlchemy engine
        engine = create_engine(f'mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{db_name}')
        # Export DataFrame to MySQL
        merged_df.to_sql(table_name, con=engine, if_exists='append', index=False)
        
        print(f"\nData successfully exported to MySQL database '{db_name}', table '{table_name}'.")
    except Exception as e:
        print(f"Error exporting to MySQL: {e}")


# Kelas untuk menangani event file baru
class MyHandler(FileSystemEventHandler):
    def __init__(self, folder_path):
        self.folder_path = folder_path

    def on_created(self, event):
        if event.is_directory:
            return  # Skip folder
        if event.src_path.lower().endswith('.pdf'): 
            processed_file = event.src_path + '.processed'
            if not os.path.exists(processed_file):
                print(f"\nFile baru ditemukan: {event.src_path}")
                process_and_export(event.src_path)
            else:
                print(f"File {event.src_path} sudah diproses sebelumnya, melewatkan...")

# Fungsi untuk memproses dan mengekspor data dari file PDF
def process_and_export(pdf_path):
    # Mengambil data dari PDF
    merged_pdf = get_merged_pdf_data(pdf_path)
    
    if merged_pdf is not None:
        # Export ke MySQL
        export_to_mysql(
            merged_pdf,
            table_name='tb_pdf', 
            mysql_user='root', 
            mysql_password='Dika007!#', 
            db_name='db_calk' 
        )
        # Tandai file sudah diproses
        processed_file = pdf_path + '.processed'
        with open(processed_file, 'w') as f:
            f.write('Processed') 
        print(f"File {pdf_path} berhasil diproses dan diekspor ke database.")
    else:
        print(f"Gagal memproses file {pdf_path}")

# Fungsi untuk memproses semua file yang sudah ada
def process_existing_files(folder_path):
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_path.lower().endswith('.pdf'): 
            processed_file = file_path + '.processed'
            if not os.path.exists(processed_file): 
                print(f"\nMemproses file yang sudah ada: {file_path}")
                process_and_export(file_path)
            else:
                print(f"File {file_path} sudah diproses sebelumnya, melewatkan...")

# Membuat observer untuk memantau folder
def start_monitoring(folder_path):
    event_handler = MyHandler(folder_path)
    observer = Observer()
    observer.schedule(event_handler, folder_path, recursive=False)
    
    # Proses file yang sudah ada di folder
    process_existing_files(folder_path)
    
    # Mulai memantau folder
    observer.start()
    print(f"Memantau folder: {folder_path} untuk file baru...")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
        print("\nPemantauan dihentikan.")
    observer.join()

# Main function
if __name__ == "__main__":
    folder_path = 'C:/TI-Dwika/Semester 7/Pangkalan Data/file_pdf'
    start_monitoring(folder_path)

In [None]:
merged_df.info()

In [1]:
import os
import re
import time
import pandas as pd
from PyPDF2 import PdfReader
from sqlalchemy import create_engine
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler


# Daftar bulan untuk pencocokan
bulan_list_id = [
    "Januari", "Februari", "Maret", "April", "Mei", "Juni",
    "Juli", "Agustus", "September", "Oktober", "November", "Desember"
]

bulan_list_en = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]


def extract_text_from_pdf(pdf_path, start_page=1, end_page=None):
    try:
        reader = PdfReader(pdf_path)
        text_content = ""
        total_pages = len(reader.pages)

        # Adjust end_page if it's not provided or exceeds total pages
        if end_page is None or end_page > total_pages:
            end_page = total_pages

        # Extract text from the specified page range
        for page_num in range(start_page - 1, end_page):
            page = reader.pages[page_num]
            text_content += page.extract_text() + "\n"
        return text_content
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {str(e)}")
        return None


def extract_table_data(text_content, filename):
    # Split content into lines
    lines = text_content.split('\n')

    # Initialize lists to store extracted data
    items = []
    values = []
    notes = []
    file_sources = []  # To track which file each row came from

    # Regular expressions for matching
    number_pattern = r'\(?[\d,.]+\)?'  # Matches numbers with 6 or more digits

    # First notes pattern
    notes_pattern_1 = r'\d+[a-zA-Z](?:,\s?[a-zA-Z\d]+)*' 

    # Second notes pattern
    notes_pattern_2 = r'\b\d{1,2}(?:,\s?\d{1,2})*\b'

    for line in lines:
        # Clean the line
        line = line.strip()

        # Try to identify components
        number_match = re.search(number_pattern, line)
        notes_match_1 = re.search(notes_pattern_1, line)
        notes_match_2 = re.search(notes_pattern_2, line)

        if number_match:
            # Extract the item name by removing numbers and notes
            item = re.sub(number_pattern, '', line)  
            item = re.sub(notes_pattern_1, '', item)  
            item = re.sub(notes_pattern_2, '', item) 
            item = re.sub(r'\s+', ' ', item).strip() 

            # Remove empty parentheses and remaining text after them
            item = re.sub(r'\(\s*\).*', '', item).strip()

            # Remove single or double letters and all remaining text after them
            item = re.sub(r'\s[a-zA-Z]{1,2}\s.*', '', item).strip()

            # # Remove remaining words if they start with an uppercase letter
            # item = re.sub(r'\s[A-Z][a-zA-Z]*.*', '', item).strip()

            # Remove remaining words if they start with an uppercase letter, but not all caps
            item = re.sub(r'\s([A-Z][a-z][a-zA-Z]*)\b.*', '', item).strip()

            numbers = re.findall(number_pattern, line)
            if len(numbers) > 1:
                # General case: pick the second number, or use any logic to decide which number to take
                value = numbers[1]
            elif len(numbers) == 1:
                value = numbers[0]
            else:
                # If no numbers found, return an empty string
                value = " "

            # Extract value
            # value = number_match.group()
            
            # Use the first notes pattern if it matches (allows comma-separated notes)
            if notes_match_1:
                note = notes_match_1.group()
            # Only use the second notes pattern if the first one didn't match
            elif notes_match_2 and not notes_match_1:
                note = notes_match_2.group()
            else:
                note = ""  # Default case if no notes are found

            # Modify values: remove commas, periods, and handle parentheses for negative values
            if value.startswith('(') and value.endswith(')') or value.startswith('('):
                # Remove parentheses, then make it negative
                value = '-' + value[1:-1].replace('.', '').replace(',', '')
            else:
                # Just remove commas and periods
                value = value.replace('.', '').replace(',', '')

            
            # Directly append to lists without any additional condition
            values.append(value)
            notes.append(note)

            # Only include if we have all components and it's not a total
            if item and not item.lower().startswith('total'):
                items.append(item)
                file_sources.append(filename)  # Add filename to track source

            # Ensure lists remain the same length by filling any missing values
            while len(items) < len(values):
                items.append("")
            while len(notes) < len(values):
                notes.append("")
            while len(file_sources) < len(values):
                file_sources.append("")

    # Create DataFrame
    df = pd.DataFrame({
        'Item': items,
        'Notes': notes,
        'Value': values,
        'Source_File': file_sources
    })
    
    return df


def find_company_and_month_year(text):
    # Company name pattern
    company_pattern = re.compile(r"(PT\s[\w\s]+,\s?Tbk|PT\s[\w\s]+Tbk)", re.IGNORECASE)
    company_match = company_pattern.search(text)
    
    # Month and year matching
    months_years = []
    for bulan in bulan_list_id + bulan_list_en:
        pattern = re.compile(rf"(\b{bulan}\b)\s(\d{{4}})", re.IGNORECASE)
        
        # Find matches with regex pattern
        matches = pattern.findall(text)
        
        for match in matches:
            month, year = match
            months_years.append((month.strip(), int(year)))

    # Get company name
    company_name = company_match.group(1) if company_match else "Tidak ditemukan"
    
    return company_name, months_years


def get_latest_month_and_year(months_years):
    if not months_years:
        return None, None
    
    latest_year = max(months_years, key=lambda x: x[1])[1]
    latest_month = next(month for month, year in months_years if year == latest_year)
    
    return latest_month, latest_year


def convert_month_to_quarter(month):
    # Normalize month name
    month = month.strip().capitalize()
    
    q1 = ["Januari", "February", "Maret", "March"]
    q2 = ["April", "Mei", "May", "Juni", "June"]
    q3 = ["Juli", "July", "Agustus", "August", "September"]
    q4 = ["Oktober", "October", "November", "Desember", "December"]
    
    if month in q1:
        return "Kuartal I / First Quarter"
    elif month in q2:
        return "Kuartal II / Second Quarter"
    elif month in q3:
        return "Kuartal III / Third Quarter"
    elif month in q4:
        return "Tahunan / Annual"
    else:
        return "Unknown"


def filter_dataframe(df):
    if df is None:
        return None
    
    # Function to check if a value is valid
    def is_valid_value(value):
        # Convert to string to handle various input types
        str_value = str(value).strip()
        
        unwanted_patterns = [
            r'^[\.]+$',  # hanya dots
            r'^2024$',   # hanya 2024
            r'^2023$',   # hanya 2023
            r'^[\.]{5,}$'  # 5 atau lebih dots
        ]
        
        # Check if value matches any unwanted pattern
        for pattern in unwanted_patterns:
            if re.match(pattern, str_value):
                return False
        
        # Check if value has at least 1 digits
        digit_count = len(re.findall(r'\d', str_value))
        
        return digit_count >= 1
    
    # Filter the DataFrame
    filtered_df = df[df['Value'].apply(is_valid_value)]
    
    # Print out how many rows were removed
    original_rows = len(df)
    filtered_rows = len(filtered_df)
    removed_rows = original_rows - filtered_rows
    print(f"Filtered out {removed_rows} rows (from {original_rows} to {filtered_rows} rows)")
    
    return filtered_df


def process_pdf_folder(folder_path, start_page=1, end_page=8):
    # Lists to store DataFrames
    table_dfs = []
    info_dfs = []

    # Normalize the folder path to use os.path methods
    folder_path = os.path.normpath(folder_path)

    # Get all PDF files in the folder
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return None

    # Process each PDF file
    for pdf_file in pdf_files:
        print(f"Processing {pdf_file}...")
        # Use os.path.join to create the full file path
        pdf_path = os.path.join(folder_path, pdf_file)

        # Extract text from PDF
        text_content = extract_text_from_pdf(pdf_path, start_page, end_page)

        if text_content:
            # Process table data
            table_df = extract_table_data(text_content, pdf_file)
            if not table_df.empty:
                table_dfs.append(table_df)
            
            # Process file info
            company_name, months_years = find_company_and_month_year(text_content)
            latest_month, latest_year = get_latest_month_and_year(months_years)
            
            if latest_month and latest_year:
                quarter = convert_month_to_quarter(latest_month)
                info_df = pd.DataFrame({
                    'Nama Perusahaan': [company_name],
                    'Kuartal': [quarter],
                    'Tahun': [latest_year],
                    'Source_File': [pdf_file]
                })
                info_dfs.append(info_df)

    # Combine DataFrames
    if table_dfs and info_dfs:
        combined_table_df = pd.concat(table_dfs, ignore_index=True)
        combined_info_df = pd.concat(info_dfs, ignore_index=True)
        
        # Merge the two DataFrames
        merged_pdf = pd.merge(combined_info_df, combined_table_df, on='Source_File', how='left')
        
        # Drop the Source_File column
        merged_pdf = merged_pdf.drop(columns=['Source_File'])
        merged_pdf['Nama Perusahaan'] = merged_pdf['Nama Perusahaan'].str.upper().str.replace(', TBK', 'TBK')
        # Filter the merged DataFrame
        filtered_pdf = filter_dataframe(merged_pdf)
        
        return filtered_pdf
    else:
        print("No data was extracted from any PDF files")
        return None


def get_merged_pdf_data(folder_path):
    merged_pdf = process_pdf_folder(folder_path)
    
    if merged_pdf is not None:
        print("\nMerged and Filtered PDF Data:")
        print(merged_pdf)
        return merged_pdf
    else:
        print("Tidak ada data yang berhasil diekstrak.")
        return None


def export_to_mysql(merged_df, table_name='tb_pdf', 
                    mysql_user='root', 
                    mysql_password='Dika007!#', 
                    mysql_host='localhost', 
                    mysql_port='3306', 
                    db_name='db_calk'):
    try:
        # Rename columns to match database schema
        column_mapping = {
            'Nama Perusahaan': 'nama_perusahaan_pdf',
            'Kuartal': 'kuartal_pdf',
            'Tahun': 'tahun_pdf',
            'Item': 'item_pdf',
            'Notes': 'notes_pdf',
            'Value': 'value_pdf',
        }
        
        # Rename columns using the mapping
        merged_df = merged_df.rename(columns=column_mapping)
        # Create SQLAlchemy engine
        engine = create_engine(f'mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{db_name}')
        # Export DataFrame to MySQL
        merged_df.to_sql(table_name, con=engine, if_exists='append', index=False)
        
        print(f"\nData successfully exported to MySQL database '{db_name}', table '{table_name}'.")
    except Exception as e:
        print(f"Error exporting to MySQL: {e}")


# Kelas untuk menangani event file baru
class MyHandler(FileSystemEventHandler):
    def __init__(self, folder_path):
        self.folder_path = os.path.normpath(folder_path)

    def on_created(self, event):
        if event.is_directory:
            return  # Skip folder
        
        # Normalize the file path
        event_src_path = os.path.normpath(event.src_path)
        
        if event_src_path.lower().endswith('.pdf'): 
            processed_file = event_src_path + '.processed'
            if not os.path.exists(processed_file):
                print(f"\nFile baru ditemukan: {event_src_path}")
                process_and_export(event_src_path)
            else:
                print(f"File {event_src_path} sudah diproses sebelumnya, melewatkan...")


# Fungsi untuk memproses dan mengekspor data dari file PDF
def process_and_export(pdf_path):
    # Pastikan pdf_path dinormalisasi
    pdf_path = os.path.normpath(pdf_path)
    
    # Ambil direktori dari pdf_path
    pdf_folder = os.path.dirname(pdf_path)
    
    # Mengambil data dari PDF
    merged_pdf = get_merged_pdf_data(pdf_folder)
    
    if merged_pdf is not None:
        # Export ke MySQL
        export_to_mysql(
            merged_pdf,
            table_name='tb_pdf', 
            mysql_user='root', 
            mysql_password='Dika007!#', 
            db_name='db_calk' 
        )
        # Tandai file sudah diproses
        processed_file = pdf_path + '.processed'
        with open(processed_file, 'w') as f:
            f.write('Processed') 
        print(f"File {pdf_path} berhasil diproses dan diekspor ke database.")
    else:
        print(f"Gagal memproses file {pdf_path}")


# Fungsi untuk memproses semua file yang sudah ada
def process_existing_files(folder_path):
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_path.lower().endswith('.pdf'): 
            processed_file = file_path + '.processed'
            if not os.path.exists(processed_file): 
                print(f"\nMemproses file yang sudah ada: {file_path}")
                process_and_export(file_path)
            else:
                print(f"File {file_path} sudah diproses sebelumnya, melewatkan...")

# Membuat observer untuk memantau folder
def start_monitoring(folder_path):
    event_handler = MyHandler(folder_path)
    observer = Observer()
    observer.schedule(event_handler, folder_path, recursive=False)
    
    # Proses file yang sudah ada di folder
    process_existing_files(folder_path)
    
    # Mulai memantau folder
    observer.start()
    print(f"Memantau folder: {folder_path} untuk file baru...")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
        print("\nPemantauan dihentikan.")
    observer.join()

# Main function
if __name__ == "__main__":
    folder_path = 'C:/TI-Dwika/Semester 7/Pangkalan Data/file_pdf'
    start_monitoring(folder_path)


Memproses file yang sudah ada: C:/TI-Dwika/Semester 7/Pangkalan Data/file_pdf\AGAR TW 2 KONSOL 2024.pdf
Processing AGAR TW 2 KONSOL 2024.pdf...
Processing FKS Food Sejahtera TBK billingual 31 Maret 2024.pdf...
Processing Lapkeu ABBA 31 Mar 2024.pdf...
Processing LKA PT Samcro Hyosung Adilestari Tbk Maret 2024.pdf...
Filtered out 172 rows (from 512 to 340 rows)

Merged and Filtered PDF Data:
                      Nama Perusahaan                    Kuartal  Tahun  \
2          PT ASIA SEJAHTERA MINA TBK  Kuartal I / First Quarter   2024   
7          PT ASIA SEJAHTERA MINA TBK  Kuartal I / First Quarter   2024   
8          PT ASIA SEJAHTERA MINA TBK  Kuartal I / First Quarter   2024   
9          PT ASIA SEJAHTERA MINA TBK  Kuartal I / First Quarter   2024   
10         PT ASIA SEJAHTERA MINA TBK  Kuartal I / First Quarter   2024   
..                                ...                        ...    ...   
506  PT SAMCRO HYOSUNG ADILESTARI TBK  Kuartal I / First Quarter   2024   
507  