In [235]:
import pymupdf
import spacy
import re
from datetime import datetime

In [236]:
# Load the small English spaCy pipeline once and keep it in the notebook-global
# `nlp`. The 'en_core_web_sm' model package must already be installed, otherwise
# spacy.load fails.
# NOTE(review): none of the cells visible here reference `nlp`; confirm whether
# a later cell needs it before keeping this load.
nlp = spacy.load('en_core_web_sm')

In [237]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the document, passed unchanged to ``pymupdf.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, joined
        without any separator. An empty document yields ``""``.

    Raises:
        Whatever ``pymupdf.open`` raises when the file cannot be opened.
    """
    # The context manager closes the document (and its file handle) even when
    # text extraction fails part-way through.
    with pymupdf.open(file_path) as doc:
        # Joining once avoids rebuilding a growing string for every page.
        return "".join(page.get_text() for page in doc)

In [238]:
def extract_date(text):
    """Extract the start date of the agreement term from the term clause.

    The clause is looked up in its Indonesian wording ("Jangka waktu Nota
    Kesepahaman ini berlaku untuk jangka waktu ... sejak <date> sampai") and
    in its English wording ("The term of this Memorandum of Understanding is
    valid for a period of ... from <date> to").

    Args:
        text: Raw document text. ``None`` or an empty string is accepted.

    Returns:
        A list with the captured start-date strings, Indonesian match first and
        English match second; each is present only if its clause was found.
        Captures are stripped of surrounding whitespace.
    """
    if not text:
        return []

    # The patterns separate most words with a single literal space, so a clause
    # that wraps onto several lines in the extracted text would never match.
    # Collapsing every whitespace run to one space makes the layout irrelevant.
    flat_text = re.sub(r"\s+", " ", text)

    # Order matters: it defines the order of the returned list.
    clause_patterns = (
        # Indonesian
        r"Jangka waktu Nota Kesepahaman ini\s*berlaku untuk jangka waktu\s*[\w\s(),]+\s*sejak\s([\d\s\w]+)\s*sampai",
        # English
        r"The term of this Memorandum of\s*Understanding is valid for a period of\s*[\w\s(),]+\s*from\s([\d\w\s]+)\sto",
    )

    dates = []
    for pattern in clause_patterns:
        match = re.search(pattern, flat_text)
        if match:
            # The greedy capture can swallow the space in front of "sampai".
            dates.append(match.group(1).strip())

    return dates

In [239]:
def extract_letter_number(text):
    """Return the letter number that follows a "Number:" or "Nomor:" label.

    The English label is tried first; the Indonesian label is only used when
    the English one is absent. The number itself is the run of word
    characters, hyphens and slashes right after the colon.

    Returns:
        The captured number as a string, or ``None`` when neither label occurs.
    """
    label_patterns = (
        r"Number\s*:\s*([\w\-\/]+)",  # English
        r"Nomor\s*:\s*([\w\-\/]+)",   # Indonesian
    )

    for label_pattern in label_patterns:
        found = re.search(label_pattern, text)
        if found:
            return found.group(1)

    return None

In [240]:
def extract_person_in_charge(text):
    """Extract the FIRST PARTY and SECOND PARTY descriptions from the preamble.

    Each party is recognised by a fixed English or Indonesian sentence
    template. The second party is tried both as an individual ("speaker":
    name and address) and as an organisation ("partner": company, address,
    representative and position).

    Args:
        text: Raw document text. ``None`` or an empty string is accepted.

    Returns:
        A tuple ``(first_party, second_party)`` of lists. Every matching
        template contributes one dict of its named groups, in template order:
        English then Indonesian for the first party; speaker English, speaker
        Indonesian, partner English, partner Indonesian for the second party.
    """
    if not text:
        return [], []

    # The templates separate words with single literal spaces and repeat the
    # company name through a backreference, so any line break inside the
    # sentence (normal for PDF-extracted text) used to prevent a match.
    # Collapsing whitespace runs makes the sentence layout irrelevant.
    flat_text = re.sub(r"\s+", " ", text)

    first_party_patterns = (
        # English
        r"(?P<company_name>[\w\s]+), a (?P<company_desc>[\w\s]+), which is located at (?P<address>[\w\s,]+) represented by (?P<pic>[\w\s]+) as (?P<pic_position>[\w\s]+) and therefore authorized to act for and on behalf of (?P=company_name), hereinafter referred to as the FIRST PARTY",
        # Indonesian
        r"(?P<company_name>[\w\s]+), sebuah (?P<company_desc>[\w\s]+), beralamat (?P<address>[\w\s,]+), dalam hal ini diwakili oleh (?P<pic>[\w\s]+), Jabatan sebagai (?P<pic_position>[\w\s]+) yang bertindak dalam jabatannya untuk dan atas nama (?P=company_name), untuk selanjutnya disebut sebagai PIHAK PERTAMA",
    )

    second_party_patterns = (
        # Speaker (individual), English
        r"(?P<name>[\w\s]+), who is located in (?P<address>[\w\s,]+), hereinafter referred to as the SECOND PARTY.",
        # Speaker (individual), Indonesian
        r"(?P<name>[\w\s]+), yang berkedudukan di (?P<address>[\w\s,]+), untuk selanjutnya disebut sebagai PIHAK KEDUA.",
        # Partner (organisation), English
        r"(?P<company_name>[\w\s]+), a (?P<company_desc>[\w\s]+) established and existing with having domicile in (?P<address>[\w\s,]+), in this matter represented by (?P<pic>[\w\s]+), in this matter acting in her capacity as (?P<pic_position>[\w\s]+), hereinafter referred to as the SECOND PARTY",
        # Partner (organisation), Indonesian
        r"(?P<company_name>[\w\s]+), suatu (?P<company_desc>[\w\s]+) berkedudukan di (?P<address>[\w\s,]+), dalam hal ini diwakili oleh (?P<pic>[\w\s]+), yang dalam hal ini bertindak dalam kapasitasnya sebagai Kuasa (?P<pic_position>[\w\s]+), untuk selanjutnya disebut sebagai PIHAK KEDUA.",
    )

    def _collect(patterns):
        # One groupdict per template that matches, preserving template order.
        found = (re.search(pattern, flat_text) for pattern in patterns)
        return [match.groupdict() for match in found if match]

    return _collect(first_party_patterns), _collect(second_party_patterns)

In [241]:
def extract_stakeholder_data(text):
    """Extract the contact person each party designates in the agreement.

    A contact block is a header sentence ("... the FIRST PARTY designates:" /
    "PIHAK PERTAMA menunjuk:" and the SECOND PARTY / PIHAK KEDUA equivalents)
    followed by labelled Name, Position, Telp/fax, Email and Address fields
    (Nama, Jabatan, Telp/fax, Email, Alamat in Indonesian).

    Args:
        text: Raw document text. ``None`` or an empty string is accepted.

    Returns:
        A tuple ``(first_party_data, second_party_data)`` of lists. Each list
        holds one dict with the keys ``name``, ``position``, ``tel``, ``email``
        and ``address`` per matching block, English before Indonesian.
    """
    if not text:
        return [], []

    # The header phrases contain literal single spaces, so a header wrapped
    # across lines in the extracted text used to block the whole match.
    # Collapsing whitespace runs removes that dependency on the layout.
    flat_text = re.sub(r"\s+", " ", text)

    # The labelled fields are identical for both parties; only the header
    # sentence differs, so the field part is written once per language.
    # NOTE(review): the trailing address class is greedy and unanchored; it
    # keeps consuming words, spaces and commas until the first other character.
    contact_fields_english = (
        r"Name\s*:\s*(?P<name>[\w\s]+)\s+"
        r"Position\s*:\s*(?P<position>[\w\s]+)\s+"
        r"Telp/fax\s*:\s*(?P<tel>[\w\s/-]+)\s+"
        r"Email\s*:\s*(?P<email>[\w\s@.]+)\s+"
        r"Address\s*:\s*(?P<address>[\w\s,]+)"
    )
    contact_fields_indonesian = (
        r"Nama\s*:\s*(?P<name>[\w\s]+)\s+"
        r"Jabatan\s*:\s*(?P<position>[\w\s]+)\s+"
        r"Telp/fax\s*:\s*(?P<tel>[\w\s/-]+)\s+"
        r"Email\s*:\s*(?P<email>[\w\s@.]+)\s+"
        r"Alamat\s*:\s*(?P<address>[\w\s,]+)"
    )

    first_party_patterns = (
        r"In this Agreement, the FIRST PARTY\s+designates:\s+" + contact_fields_english,
        r"PIHAK PERTAMA\s+menunjuk:\s+" + contact_fields_indonesian,
    )
    second_party_patterns = (
        r"and the SECOND PARTY\s+designates:\s+" + contact_fields_english,
        r"dan PIHAK KEDUA\s+menunjuk:\s+" + contact_fields_indonesian,
    )

    def _collect(patterns):
        # One groupdict per block that matches, preserving pattern order.
        found = (re.search(pattern, flat_text) for pattern in patterns)
        return [match.groupdict() for match in found if match]

    return _collect(first_party_patterns), _collect(second_party_patterns)

In [242]:
def extract_supply(text):
    """Find which FIRST PARTY obligation sentences appear in the document.

    Every obligation is identified by one fixed English sentence template.

    Args:
        text: Raw document text. ``None`` or an empty string is accepted.

    Returns:
        A dict with one entry per known obligation, in a fixed key order. The
        value is the matched sentence (with whitespace runs collapsed to single
        spaces) or ``None`` when the sentence is not present.
    """
    # NOTE(review): the '.' characters in these templates are unescaped, so
    # each one matches any single character, not only a full stop.
    patterns = {
        'logo_placement': r"FIRST PARTY responsibilities to place a logo placement of SECOND PARTY in official poster event FIRST PARTY, and LPJ internal FIRST PARTY.",
        'data_transparency': r"FIRST PARTY responsibilities to inform all things needed related the partnership with SECOND PARTY.",
        'tracking_cooperation': r"FIRST PARTY responsibilities to keep track of cooperation in order running well and according to the agreement",
        'regulation_obeyment': r"FIRST PARTY responsibilities to obey entirely regulation which has been agreed.",
        'certificate_newsletter': r"FIRST PARTY responsibilities to give a certificate and newsletter report to SECOND PARTY.",
        'pre_event_article': r"FIRST PARTY responsibilities to including SECOND PARTY in pre-event article",
        'selling_space': r"FIRST PARTY responsibilities to conduct selling space of SECOND PARTY product which will be held for (\d+) minutes.",
        'company_research_survey': r"FIRST PARTY responsibilities to fulfill SECOND PARTY Research Survey which in total of (\d+) Participant.",
        'live_ad_libs': r"FIRST PARTY responsibilities to conduct Ad-Libs of SECOND PARTY when the event is on going.",
        'company_video_promotion': r"FIRST PARTY responsibilities to play Company Video Promotion of SECOND PARTY when the event is on going.",
        'instagram_story_post': r"FIRST PARTY responsibilities to post 1 \(one\) Story for SECOND PARTY with 20.000\+ Account Follower on Instagram."
    }

    # Derive the result skeleton from the templates so the two can never get
    # out of sync; every obligation starts out as "not found".
    supply = dict.fromkeys(patterns)
    if not text:
        return supply

    # The templates are whole sentences with literal single spaces between the
    # words; in PDF-extracted text those sentences wrap across lines, so
    # whitespace is collapsed before searching.
    flat_text = re.sub(r"\s+", " ", text)

    for key, pattern in patterns.items():
        match = re.search(pattern, flat_text)
        if match:
            supply[key] = match.group(0)

    return supply

In [243]:
def extract_demand(text):
    """Collect the demand-side clauses found in the document text.

    This is still a placeholder: no demand patterns are registered, so the
    returned dict is always empty. Once patterns are added, each key whose
    pattern matches maps to the full matched text.
    """
    demand_patterns = {
        # 'key_name': r'regex_pattern'
        # Add your demand patterns here
    }

    demand = {
        # Add possible demand keys here
        # e.g. 'key_name': None
    }

    # Pair every key with its search result, then keep only the hits.
    searched = ((key, re.search(pattern, text)) for key, pattern in demand_patterns.items())
    demand.update({key: hit.group(0) for key, hit in searched if hit})

    return demand

In [244]:
def extract_duration(text):
    """Extract the length, start date and end date of the agreement term.

    The term clause is recognised in English ("... valid for a period of
    <duration> from <start> to <end>.") and in Indonesian ("... berlaku untuk
    jangka waktu <duration>, sejak <start> sampai dengan <end>.").

    Args:
        text: Raw document text. ``None`` or an empty string is accepted.

    Returns:
        A dict with the keys ``duration_time``, ``start_date`` and
        ``end_date``, or an empty dict when no clause is found. When both
        language versions are present the Indonesian one wins, because it is
        evaluated last.
    """
    if not text:
        return {}

    # The clause templates use literal single spaces, so a clause that wraps
    # across lines in the extracted text used to go undetected.
    flat_text = re.sub(r"\s+", " ", text)

    # duration_time also accepts parentheses ("1 (one) month"), in line with
    # the character class extract_date uses for the same part of the clause.
    clause_patterns = (
        # English
        r"The term of this Memorandum of Understanding is valid for a period of "
        r"(?P<duration_time>[\w\s()]+) from (?P<start_date>[\w\s,]+) to (?P<end_date>[\w\s,]+)\.",
        # Indonesian (kept last so it overrides an English match)
        r"Jangka waktu Nota Kesepahaman ini berlaku untuk jangka waktu "
        r"(?P<duration_time>[\w\s()]+), sejak (?P<start_date>[\w\s,]+) sampai dengan (?P<end_date>[\w\s,]+)\.",
    )

    duration_info = {}
    for pattern in clause_patterns:
        match = re.search(pattern, flat_text)
        if match:
            duration_info = match.groupdict()

    return duration_info

In [245]:
def calculate_roi(supply, demand):
    """Return a placeholder ROI figure for the extracted supply and demand.

    Example logic only (replace it with the actual ROI calculation): when
    either argument is empty or otherwise falsy the result is ``0.0``;
    otherwise it is the number of supply entries multiplied by the number of
    demand entries. Entries are counted with ``len``, whatever their values.
    """
    # Nothing to relate when one side is missing.
    if not (supply and demand):
        return 0.0

    supply_count = len(supply)
    demand_count = len(demand)
    return supply_count * demand_count

In [246]:
def process_document(file_path):
    """Run every extractor on one PDF and gather the results in a report dict.

    The document text is read once with ``extract_text_from_pdf`` and handed
    to each extractor in turn; the ROI figure is computed from the supply and
    demand results.

    Returns:
        A dict keyed by human-readable field names ("Date of Agreement",
        "Letter Number", the person-in-charge and stakeholder entries for both
        parties, "Supply", "Demand", "Duration" and "ROI").
    """
    text = extract_text_from_pdf(file_path)

    report = {
        "Date of Agreement": extract_date(text),
        "Letter Number": extract_letter_number(text),
    }

    # Both of these extractors return a (first party, second party) pair.
    pic_first, pic_second = extract_person_in_charge(text)
    contact_first, contact_second = extract_stakeholder_data(text)
    report["First Party Person in Charge"] = pic_first
    report["First Party Stakeholder Data"] = contact_first
    report["Second Party Person in Charge"] = pic_second
    report["Second Party Stakeholder Data"] = contact_second

    report["Supply"] = extract_supply(text)
    report["Demand"] = extract_demand(text)
    report["Duration"] = extract_duration(text)
    report["ROI"] = calculate_roi(report["Supply"], report["Demand"])

    return report

In [247]:
# Run the whole extraction pipeline on one sample MoU and print the raw result.
# The path is relative, so the PDF has to sit in the kernel's working directory.
file_path = "MoU Watery Nation.pdf"
result = process_document(file_path)
print(result)

{'Date of Agreement': [], 'Letter Number': '001/UNSRI/EwA/LoA/VII/2023', 'First Party Person in Charge': [], 'First Party Stakeholder Data': [], 'Second Party Person in Charge': [], 'Second Party Stakeholder Data': [], 'Supply': {'logo_placement': None, 'data_transparency': None, 'tracking_cooperation': None, 'regulation_obeyment': None, 'certificate_newsletter': None, 'pre_event_article': None, 'selling_space': None, 'company_research_survey': None, 'live_ad_libs': None, 'company_video_promotion': None, 'instagram_story_post': None}, 'Demand': {}, 'Duration': {}, 'ROI': 0.0}
