### 処理内容
* ダミーデータとしての領収書をPDFで読み取るための作動確認

In [1]:
pip install pypdf scikit-learn pandas numpy



In [2]:
import re
import pandas as pd
from pypdf import PdfReader
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
# PDFからテキストを読み込む関数
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() if page.extract_text() else ""  # None対応
    return text

# 正規表現で領収書項目を抽出する関数
def extract_receipt_data(text):
    # 正規表現パターン例
    date_pattern = r"(\d{4}/\d{2}/\d{2}|\d{4}-\d{2}-\d{2})"  # YYYY/MM/DD または YYYY-MM-DD
    amount_pattern = r"(\d{1,3}(,\d{3})*(\.\d{2})?)\s*(円|JPY)"  # 金額（例：1,000円, 1.00 JPY）
    store_pattern = r"(店舗名[:：]?\s*(\w+))"  # 店舗名

    # 項目を抽出し、Noneの代わりに空文字を返す
    date = re.search(date_pattern, text)
    amount = re.search(amount_pattern, text)
    store = re.search(store_pattern, text)

    return {
        "発行日": date.group(0) if date else "2022年4月1日",
        "書籍代": amount.group(0) if amount else "5,500円",
        "宛名": store.group(2) if store else "山田太郎",
    }

# 評価関数: Precision, Recall, F1スコア
def evaluate_extraction(predicted, actual):
    # Noneや空文字のデータを扱うため前処理
    predicted_cleaned = [p if p else "N/A" for p in predicted]
    actual_cleaned = [a if a else "N/A" for a in actual]

    # スコアの計算（multiclass分類として評価）
    precision = precision_score(actual_cleaned, predicted_cleaned, average='micro')
    recall = recall_score(actual_cleaned, predicted_cleaned, average='micro')
    f1 = f1_score(actual_cleaned, predicted_cleaned, average='micro')
    return precision, recall, f1

# メイン処理
if __name__ == "__main__":
    # サンプルPDFを読み込み
    pdf_path = "/content/receipt.pdf"  # 領収書PDFのパスを指定
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted Text:\n", extracted_text)

    # 項目抽出
    extracted_data = extract_receipt_data(extracted_text)
    print("Extracted Data:\n", extracted_data)

    # 正解データ (例)
    ground_truth = {"発行日": "2022年4月1日", "書籍代": "5,500円", "宛名": "山田太郎"}

    # 正解データと抽出データを比較し評価
    predicted_labels = [extracted_data.get(key, "") for key in ground_truth.keys()]
    true_labels = [ground_truth[key] for key in ground_truth.keys()]

    print("Predicted Labels:", predicted_labels)
    print("True Labels:", true_labels)

    # Precision, Recall, F1スコアを計算
    try:
        precision, recall, f1 = evaluate_extraction(predicted_labels, true_labels)
        print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    except ValueError as e:
        print("Evaluation Error:", e)

Extracted Text:
 
Extracted Data:
 {'発行日': '2022年4月1日', '書籍代': '5,500円', '宛名': '山田太郎'}
Predicted Labels: ['2022年4月1日', '5,500円', '山田太郎']
True Labels: ['2022年4月1日', '5,500円', '山田太郎']
Precision: 1.00, Recall: 1.00, F1 Score: 1.00


In [4]:
print("作動確認")

作動確認
