In [17]:
from pdf2docx import Converter
from docx import Document
import pandas as pd
import re
from datetime import datetime

def convert_pdf_to_docx(pdf_file, docx_file):
    """Convert a PDF file into a DOCX file with pdf2docx.

    Args:
        pdf_file: Path of the PDF file to read.
        docx_file: Path of the DOCX file to write.
    """
    cv = Converter(pdf_file)
    try:
        # start=0 / end=None: no upper page limit, conversion starts at the
        # first page.
        cv.convert(docx_file, start=0, end=None)
    finally:
        # Always close the converter, even when the conversion raises.
        cv.close()

def data_from_docx(docx_file, column_index):
    """Collect the text of one column from every table of a DOCX file.

    All tables of the document are walked in order, row by row. Rows that
    have no cell at ``column_index`` are skipped.

    Args:
        docx_file: Path of the DOCX file to open.
        column_index: Zero-based position of the cell to read in each row.

    Returns:
        A list with the whitespace-stripped text of each matching cell.
    """
    doc = Document(docx_file)
    return [
        row.cells[column_index].text.strip()
        for table in doc.tables
        for row in table.rows
        if len(row.cells) > column_index
    ]

def extract_price(column_data, indices=(50, 52, 54)):
    """Pick selected integer values out of the numeric tokens of a column.

    Every cell text is split on whitespace and each token that consists
    solely of decimal digits is converted to ``int``. The resulting flat
    list of numbers is then indexed at ``indices``.

    Args:
        column_data: Iterable of cell texts (strings).
        indices: Positions in the flat list of extracted numbers to return.
            The default keeps the positions that used to be hard-coded.

    Returns:
        A list of ``int`` with one entry per position in ``indices``.

    Raises:
        IndexError: If a requested position does not exist because too few
            numbers were extracted.
    """
    # A cell that is only a number is simply a single-token cell, so one code
    # path handles both cases and the result is always int (no str/int mix).
    # isdecimal() is used instead of isdigit() because isdigit() also accepts
    # characters (e.g. superscript digits) that int() cannot parse.
    numbers = [
        int(token)
        for item in column_data
        for token in item.split()
        if token.isdecimal()
    ]
    try:
        return [numbers[i] for i in indices]
    except IndexError:
        raise IndexError(
            f"expected numbers at positions {list(indices)}, "
            f"but only {len(numbers)} numeric values were found"
        ) from None

def extract_date(column_data):
    """Return the text after the last '|' of the final column entry.

    Args:
        column_data: Sequence of cell texts.

    Returns:
        The whitespace-stripped part of the last entry that follows its last
        '|' (the whole stripped entry when it contains no '|'), or ``None``
        when ``column_data`` is empty or falsy.
    """
    if not column_data:
        return None
    # rpartition yields the full string as the tail when '|' is absent.
    _, _, tail = column_data[-1].rpartition('|')
    return tail.strip()

def main(pdf_file, docx_file, column_index_data, column_index_date,
         output_file='gewünschte_daten.xlsx'):
    """Convert a PDF price list to DOCX and export three prices to Excel.

    Args:
        pdf_file: Path of the PDF file to convert.
        docx_file: Path of the intermediate DOCX file to write and read back.
        column_index_data: Zero-based table column holding the price values.
        column_index_date: Zero-based table column whose last cell holds the
            date text.
        output_file: Path of the Excel file to write.

    Raises:
        ValueError: If the date column yields no cells, so no date is found.
        IndexError: Propagated from ``extract_price`` when the price column
            contains fewer numbers than it selects.
    """
    #  PDF -> DOCX
    convert_pdf_to_docx(pdf_file, docx_file)

    # DOCX data
    column_data = data_from_docx(docx_file, column_index_data)
    column_date = data_from_docx(docx_file, column_index_date)

    # Format date
    date = extract_date(column_date)
    if date is None:
        # Fail with a clear message instead of a TypeError inside re.sub.
        raise ValueError(
            f"no date found in column {column_index_date} of {docx_file!r}"
        )
    # Reorder DD?MM?YYYY into YYYY.MM.DD. The unescaped '.' in the pattern
    # matches any single separator character; the output always uses '.'.
    formatted_date = re.sub(r'(\d{2}).(\d{2}).(\d{4})', r'\3.\2.\1', date)

    desired_values = extract_price(column_data)

    # DF in Excel
    data = {
        'symbol': [
            'Aluminium-Legierung 226 G-Al Si9 Cu3',
            'Aluminium-Legierung 231 G-Al Si12 (Cu)',
            'Aluminium-Legierung 233 G-Al Si10 Mg (Cu)',
        ],
        'boerse': ['-'] * 3,
        'datum': [formatted_date] * 3,
        'letzter': desired_values,
    }
    df = pd.DataFrame(data)
    df.to_excel(output_file, index=False)
    print("Excel-Datei wurde erstellt.")

    # `display` only exists when running under IPython/Jupyter; fall back to
    # print so a plain script run does not end with a NameError.
    try:
        show = display
    except NameError:
        show = print
    show(df)

if __name__ == "__main__":
    # Input PDF and the DOCX file the conversion writes.
    pdf_file, docx_file = 'MPL_202408.pdf', 'MPL_202408.docx'
    # Zero-based table columns that hold the prices and the date text.
    column_index_preis, column_index_date = 2, 1
    main(
        pdf_file=pdf_file,
        docx_file=docx_file,
        column_index_data=column_index_preis,
        column_index_date=column_index_date,
    )



[INFO] Start to convert MPL_202408.pdf
[INFO] [1/4] Opening document...
[INFO] [2/4] Analyzing document...
[INFO] [3/4] Parsing pages...
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] [4/4] Creating pages...
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] Terminated in 9.05s.


Excel-Datei wurde erstellt.


Unnamed: 0,symbol,boerse,datum,letzter
0,Aluminium-Legierung 226 G-Al Si9 Cu3,-,2024.02.21,331
1,Aluminium-Legierung 231 G-Al Si12 (Cu),-,2024.02.21,339
2,Aluminium-Legierung 233 G-Al Si10 Mg (Cu),-,2024.02.21,348
