In [29]:
# pip install python-docx==1.1.2

import os
import json 

from openai import OpenAI
from docx import Document

In [9]:
# put your openai key here
openai_key = '...'
model = 'gpt-4o-2024-08-06'

In [22]:
def list_word_documents(folder_path):
    """
    List all Word documents in the specified folder and return their full paths.

    :param folder_path: Path to the folder where Word documents are located
    :return: List of full paths to Word document files
    """
    # List to store the full paths of Word documents
    word_documents = []

    # Iterate through all files in the specified folder
    for file_name in os.listdir(folder_path):
        # Check if the file is a Word document
        if file_name.endswith('.docx') or file_name.endswith('.doc'):
            # Get the full path and append it to the list
            full_path = os.path.join(folder_path, file_name)
            word_documents.append(full_path)
    
    return word_documents

In [23]:
documents = list_word_documents(r"Contracts")
documents

['Contracts/Lionel Messi.docx', 'Contracts/Cristiano Ronaldo.docx']

In [42]:
client = OpenAI(api_key= openai_key)

# this is the chat function, takes our system instruction, user message and chatbot memory
def chat(user):

    # this is the instruction that is persistently passed to the model 
    system = """You are a helpful assistant. You only need to find the requested data, if present, in the given text, and return it in this json format:\n
            ```json
                name: 
                surname:
                phone_number:
                email_address:
                city_of_residence:
                role:
                contracted_days: (i.e., these are the days the employee may be asked to work, usually this is either weekdays only or flexible, icluding weekends)
                contracted_hours: (i.e., these are the hours the employee has been contracted for, usually 20 or 40)
                salary_type: (i.e., hourly, daily, weekly, monthly)
                salary_amount: 
            ``` 
                """
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    )

    message = completion.choices[0].message
    return message

def extract_text_from_docx(docx_path):
    """
    Extracts text from a .docx file.

    :param docx_path: Path to the .docx file
    :return: Extracted text as a string
    """
    doc = Document(docx_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)


def process_contracts(docx_files):
    """
    Processes a list of Word documents, extracts text, and uses the chat function to get the desired output.

    :param system: The persistent system instruction
    :param docx_files: A list of paths to Word documents
    :return: A list of outputs for each document
    """
    results = []  # List to store the results

    for docx_file in docx_files:
        text = extract_text_from_docx(docx_file)  # Extract text from the Word document
        output = chat(text)  # Process the extracted text with the chat function
        results.append(output)

    return results

In [43]:
# Process the messages and get the results
results = process_contracts(documents)

In [44]:
for r in results:
    print(r.content)

```json
{
    "name": "Lionel",
    "surname": "Messi",
    "phone_number": "(305) 555-1234",
    "email_address": "Lionel.Messi@inter-miami.com",
    "city_of_residence": "Miami",
    "role": "Full-Time Cook",
    "contracted_days": "flexible, including weekends",
    "contracted_hours": "40",
    "salary_type": "monthly",
    "salary_amount": "60,000 dollars"
}
```
```json
{
    "name": "Cristiano",
    "surname": "Ronaldo",
    "phone_number": "+966 55 123 4567",
    "email_address": "Cristiano.Ronaldo@ai-nassar.com",
    "city_of_residence": "Dubai",
    "role": "Full-Time Waiter",
    "contracted_days": "flexible, including weekends",
    "contracted_hours": "40",
    "salary_type": "monthly",
    "salary_amount": "50,000 dollars"
}
```
