In [6]:
!pip install openai-whisper




In [7]:
import whisper
# Load the "medium" Whisper model once; the transcription cell below reuses `model`.
model = whisper.load_model("medium")


Format: [Nama/Aku/Saya] [Tanggal] membeli [Items] di [Nama Toko] dengan total [Total Harga]

In [255]:
# Path of the recording to transcribe (an absolute path in the notebook runtime).
audio_path = "/content/Recording (12).m4a"

# Transcribe the recording, passing Indonesian as the language; `result["text"]`
# is the transcript that the extraction cell further down parses.
result = model.transcribe(audio_path, language="indonesian")
print("Transkripsi:", result["text"])




Transkripsi:  Amir 5 hari yang lalu membeli mayones, sate, ayam goreng, dan kecap di Indo-Maret dengan total 20 ribu.


In [256]:
import re
from datetime import datetime, timedelta

# Indonesian month names (lower-case) mapped to their two-digit month number.
MONTHS = {
    "januari": "01",
    "februari": "02",
    "maret": "03",
    "april": "04",
    "mei": "05",
    "juni": "06",
    "juli": "07",
    "agustus": "08",
    "september": "09",
    "oktober": "10",
    "november": "11",
    "desember": "12",
}


def _build_date(day, month, year):
    """Return ``DD-MM-YYYY`` if day/month/year form a real calendar date, else ``None``."""
    try:
        # Constructing a datetime is only used as a validity check
        # (rejects e.g. 31 February or month 13).
        datetime(int(year), int(month), int(day))
    except ValueError:
        return None
    return f"{int(day):02d}-{int(month):02d}-{year}"


def parse_date(phrase):
    """Convert an Indonesian date phrase into a ``DD-MM-YYYY`` string.

    Recognised forms, checked in this order (matching is case-insensitive):

    * ``hari ini`` (today) and ``kemarin`` (yesterday)
    * ``N hari lalu`` / ``N hari yang lalu`` (N days ago)
    * ``D <month name> YYYY`` and ``D <month name>`` (current year assumed),
      where the month name must be a key of ``MONTHS``
    * numeric ``D-M-YYYY`` (``/`` is accepted as separator as well)

    Args:
        phrase: Text containing the date expression.

    Returns:
        The date formatted as ``DD-MM-YYYY``, or ``None`` when no supported
        form is found, the month word is unknown, or the date does not exist.
    """
    today = datetime.today()
    # Lower-case once so keyword checks and month lookups ignore capitalisation.
    text = phrase.lower()

    if "hari ini" in text:
        return today.strftime("%d-%m-%Y")
    if "kemarin" in text:
        return (today - timedelta(days=1)).strftime("%d-%m-%Y")
    if match := re.search(r"(\d+)\s*hari(?:\s+yang)?\s+lalu", text):
        try:
            return (today - timedelta(days=int(match.group(1)))).strftime("%d-%m-%Y")
        except OverflowError:
            # The offset reaches before the earliest representable date.
            return None

    named_month_patterns = (
        r"(\d{1,2})\s+(\w+)\s+(\d{4})",  # day, month name, year
        r"(\d{1,2})\s+(\w+)",            # day, month name (year omitted)
    )
    for pattern in named_month_patterns:
        match = re.search(pattern, text)
        if not match:
            continue
        day, month_name, *year_part = match.groups()
        month = MONTHS.get(month_name)
        if month is None:
            # Not a month word (e.g. "12 apel"); do not guess January.
            continue
        year = year_part[0] if year_part else today.year
        return _build_date(day, month, year)

    # Fully numeric form, e.g. "tanggal 12-05-2024".
    if match := re.search(r"(\d{1,2})[-/](\d{1,2})[-/](\d{4})", text):
        return _build_date(*match.groups())

    return None


# Multipliers for the Indonesian magnitude words accepted after a number.
_TOTAL_UNITS = {
    "ribu": 1_000,
    "juta": 1_000_000,
    "miliar": 1_000_000_000,
    "triliun": 1_000_000_000_000,
}

# One amount: an integer part (plain digits, or groups of three separated by
# "." or ","), an optional 1-2 digit decimal part, and an optional magnitude word.
_TOTAL_AMOUNT = re.compile(
    r"\s*(\d{1,3}(?:[.,]\d{3})+(?!\d)|\d+)"
    r"(?:[.,](\d{1,2})(?!\d))?"
    r"(?:\s*(ribu|juta|miliar|triliun))?"
)


def parse_total(total_str):
    """Convert a spoken or written rupiah amount into an integer.

    The first amount found in ``total_str`` is parsed; text around it (such
    as ``Rp``, ``rupiah`` or a trailing full stop) is ignored. Examples:
    ``"20 ribu."`` -> 20000, ``"Rp 1.500.000"`` -> 1500000,
    ``"1,5 juta"`` -> 1500000, ``"2 juta 500 ribu"`` -> 2500000.

    A separator followed by exactly three digits is read as a thousands
    separator; one followed by one or two digits is read as a decimal mark.
    Amounts that follow each other separated only by whitespace are summed
    as long as each magnitude is smaller than the previous one.

    Args:
        total_str: Text containing the amount.

    Returns:
        The amount as an ``int`` (any fractional remainder is truncated).

    Raises:
        ValueError: If ``total_str`` contains no number.
    """
    text = total_str.lower()
    match = _TOTAL_AMOUNT.search(text)
    if match is None:
        raise ValueError(f"no numeric amount found in {total_str!r}")

    total = 0
    previous_multiplier = None
    while match is not None:
        whole, fraction, unit = match.groups()
        multiplier = _TOTAL_UNITS.get(unit, 1)
        # Only a strictly smaller magnitude can continue a compound amount.
        if previous_multiplier is not None and multiplier >= previous_multiplier:
            break
        total += int(re.sub(r"[.,]", "", whole)) * multiplier
        if fraction:
            # Integer arithmetic keeps e.g. "1,5 juta" exact (no float rounding).
            total += int(fraction) * multiplier // 10 ** len(fraction)
        previous_multiplier = multiplier
        # The next part must start right after this one (whitespace allowed).
        match = _TOTAL_AMOUNT.match(text, match.end())
    return total

def extract_transcribed_data(text):
    """Extract purchase details from one transcribed sentence.

    The sentence is expected to look like
    ``<name> <date> membeli <items> di <store> dengan total <amount>``.

    Args:
        text: The transcribed sentence.

    Returns:
        A dict with the keys ``"Company"`` (store name), ``"Date"`` (result of
        ``parse_date``; may be ``None``), ``"Items"`` (list of item names) and
        ``"Total"`` (result of ``parse_total``), or ``None`` when the sentence
        uses a rejected date wording, does not match the expected format, or
        its amount cannot be parsed.
    """
    # Date wordings such as "12 bulan 5 tahun 2024" are rejected outright.
    invalid_date_patterns = [
        r"\d{1,2}\s+bulan\s+\d{1,2}\s+tahun\s+\d{4}",
        r"\d{1,2}\s+bulan\s+\d{1,2}\s+\d{4}",
    ]
    if any(re.search(p, text, re.IGNORECASE) for p in invalid_date_patterns):
        return None

    # Groups: 1 = date phrase, 2 = items, 3 = store, 4 = amount.
    pattern = re.compile(
        r"(?:\w+)\s+(hari ini|kemarin|\d+\s*hari(?:\s+yang)?\s+lalu|tanggal\s*\d{1,2}-\d{1,2}-\d{4}|\d{1,2}\s+\w+\s+\d{4}|\d{1,2}\s+\w+|\d{1,2}\s+bulan\s+\d{1,2}\s+tahun\s+\d{4})"
        r"\s+(?:membeli|beli|dibeli)\s+(.+?)\s+di\s+(.+?)\s+dengan total\w*\s+([\w\s,.]+)",
        re.IGNORECASE,
    )
    match = pattern.search(text)
    if not match:
        return None

    raw_date, items, company, raw_total = match.groups()

    date = parse_date(raw_date)
    try:
        total = parse_total(raw_total)
    except ValueError:
        # An unparseable amount means the sentence does not fit the format;
        # report that the same way as any other mismatch instead of crashing.
        return None

    # Split on commas (optionally followed by the word "dan") or on a
    # standalone "dan". Requiring "dan" to be a whole word keeps items such as
    # "sandal" or "udang" intact.
    items_list = [
        item.strip()
        for item in re.split(
            r"\s*,\s*(?:dan\b\s*)?|\s+dan\s+", items, flags=re.IGNORECASE
        )
        if item.strip()
    ]

    return {
        "Company": company,
        "Date": date,
        "Items": items_list,
        "Total": total,
    }

# Parse the transcript produced by the transcription cell and print each field.
transcribed_text = result["text"]
extracted_data = extract_transcribed_data(transcribed_text)

if not extracted_data:
    # Nothing could be extracted from the sentence.
    print("Kalimat tidak sesuai format.")
else:
    print("Company:", extracted_data["Company"])
    print("Date:", extracted_data["Date"])
    print("Items:")
    for item in extracted_data["Items"]:
        print(f"- {item}")
    print("Total:", extracted_data["Total"])


Company: Indo-Maret
Date: 27-11-2024
Items:
- mayones
- sate
- ayam goreng
- kecap
Total: 20000
