### Firstly, load the scraped website data that was saved in JSON format.


In [47]:
# import all libraries
import json
from openai import OpenAI
import tiktoken

In [48]:
# Read the scraped policies from the JSON file. The result is a mapping of
# policy type -> list of policy records.
# NOTE(review): the name `dict` shadows the Python builtin; it is kept here only
# because later cells read this global under that name.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open("../output/otago_policies_20250206102803.json", encoding="utf-8") as f:
    dict = json.load(f)

# Print every policy type (the top-level keys of the loaded mapping).
for key in dict:
    print(key)

Guidelines
Plans & Strategies
Policies
Procedures
Regulations
Statutes


In [49]:
# Read the API credentials stored in the local api.json file.
with open("../api.json") as f:
    api_dict = json.loads(f.read())

# Key used to authenticate against the DeepSeek API.
DEEPSEEK_API_KEY = api_dict["deepseek-api-key"]
# The OpenAI key is not loaded at the moment; uncomment the line below to load it.
# OPENAI_API_KEY = api_dict['openai-api-key']


In [50]:
# Create count_tokens function which takes in a text and encoding name as input and returns the number of tokens in the text.
def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """
    Return how many tokens `text` produces under the given tiktoken encoding.

    Args:
        text (str): The text to tokenize.
        encoding_name (str): Name of the tiktoken encoding to look up.
            Defaults to "cl100k_base".

    Returns:
        int: Length of the token sequence produced by encoding `text`.
    """
    # Look up the encoding, tokenize, and report the length in one step.
    return len(tiktoken.get_encoding(encoding_name).encode(text))


In [51]:
# get policy type list and policy
def get_policy_type_details(policy_type: str, policies=None) -> list:
    """
    Collect the name, scope and content of every policy of one policy type.

    Args:
        policy_type (str): Key of the policy type to look up
            (e.g. "Guidelines").
        policies: Optional mapping of policy type -> list of policy records.
            When omitted, the notebook-level variable `dict` (the loaded
            policies JSON, which shadows the builtin of the same name) is used.

    Returns:
        list: One dictionary per policy record, each holding only the
        "name", "scope" and "content" fields of the original record.

    Raises:
        KeyError: If `policy_type` is not present in the mapping, or a record
            lacks one of the three copied fields.
    """
    # Fall back to the notebook global so existing single-argument calls keep working.
    source = dict if policies is None else policies

    # Copy only the fields the LLM prompt needs; any other keys are dropped.
    return [
        {
            "name": item["name"],
            "scope": item["scope"],
            "content": item["content"],
        }
        for item in source[policy_type]
    ]

### Secondly, prepare all the questions to ask the LLM.

In [52]:
# Prompts sent to the LLM; each one is followed by the policy data when it is sent.

# Q1: policies whose text contains unnecessary content such as cross-references.
q1 = "From below list of JSON data, please list all policies that contain unnecessary content such as cross-references to other policies (there's a space for this in the policy documents, but not in the words of the policy itself)?"

# Q2: policies that contain procedural instructions rather than rules or principles.
q2 = "From below list of JSON data, please list all policies that contain parts that are actually Procedures, not Policies (e.g. they contain instructions on how to do something, rather than rules or principles)?"

# Q3: policies with inconsistencies or structural flaws.
q3 = "From below list of JSON data, please list all policies that contain inconsistencies or structural flaws. For example, if a policy is missing a section that is present in other policies, or if a policy has a section that is not relevant to the policy type?"

# Q4: redundant or overlapping policies and weaknesses in the overall structure.
q4 = "From below list of JSON data, please list all redundancy or incorrect structure of policy. It could be that there's a better way to structure the policy database, or that some policies are redundant or overlapping?"


### Then define a function that calls the DeepSeek LLM using the API key


In [53]:
def call_llm_about_policy(questions, data, deepseek_model="deepseek-chat"):
    """
    Ask the DeepSeek chat API one or more questions about a list of policies.

    All questions are asked in a single conversation. The first user message
    carries the first question followed by the policy data; each later user
    message carries only its own question, preceded in the history by the
    assistant's previous answer, so the data is sent once.

    Args:
        questions: Iterable of question strings, asked in order.
        data: The policy records; interpolated into the first user message
            via its string representation.
        deepseek_model (str): Model name passed to the chat completions call.

    Returns:
        list | None: The raw chat-completion response objects, one per
        question in order, or None when `questions` is empty. Callers read an
        answer with `responses[i].choices[0].message.content`.
    """
    system_content = "I will send you data in any python list format. Each item in the data list contains one policy document record. Each record contains the following fields: 'name', 'scope', and 'content'. Please only answer question once you have read all the data."

    client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")

    # Conversation history; grows by one assistant + one user message per follow-up.
    messages = [{"role": "system", "content": system_content}]
    responses = []
    for question_number, question in enumerate(questions, start=1):
        if question_number == 1:
            # Only the first question carries the policy data.
            user_question_content = f"{question}\n: {data}"
        else:
            # Record the previous answer, then ask the *current* question.
            # (Previously the first question was re-sent for every follow-up.)
            messages.append(responses[-1].choices[0].message)
            user_question_content = question
        messages.append({"role": "user", "content": user_question_content})

        response = client.chat.completions.create(
            model=deepseek_model, messages=messages, max_tokens=5000
        )
        responses.append(response)
        print(
            f"Question {question_number} answered and {response.usage.prompt_tokens} prompt tokens are used as input and {response.usage.completion_tokens} complete tokens to generate the answer."
        )

    if not responses:
        print("No response to return.")
        return None

    print("Returning output response:")
    return responses

In [54]:
regulation_list = get_policy_type_details("Plans & Strategies")
# Report how many records were collected ...
item_total = len(regulation_list)
print(f"There are {item_total} items in the list.")
# ... and how many tokens their string representation takes up.
token_total = count_tokens(str(regulation_list))
print(f"There are {token_total} tokens in the policy list.")


There are 2 items in the list.
There are 87 tokens in the policy list.


In [59]:
# question = "From below list of JSON data, please answer following 4 questions: 1. list all policies that contain unnecessary content such as cross-references to other policies (there's a space for this in the policy documents, but not in the words of the policy itself)? 2. please list all olicies that contain parts that are actually Procedures, not Policies (e.g. they contain instructions on how to do something, rather than rules or principles)? 3. please list all policies that contain inconsistencies or structural flaws. For example, if a policy is missing a section that is present in other policies, or if a policy has a section that is not relevant to the policy type? 4. please list all redundancy or incorrect structure of policy. It could be that there's a better way to structure the policy database, or that some policies are redundant or overlapping?:"
responses_q1 = call_llm_about_policy(
    questions=[q1], data=regulation_list, deepseek_model="deepseek-chat"
)
# call_llm_about_policy returns a list with one response per question,
# so pick the (only) response before reading its answer text.
print(responses_q1[0].choices[0].message.content)


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [58]:
responses_q2 = call_llm_about_policy(
    questions=[q2], data=regulation_list, deepseek_model="deepseek-chat"
)
# call_llm_about_policy returns a list with one response per question,
# so pick the (only) response before reading its answer text.
print(responses_q2[0].choices[0].message.content)

KeyboardInterrupt: 

In [None]:
responses_q3 = call_llm_about_policy(
    questions=[q3], data=regulation_list, deepseek_model="deepseek-chat"
)
# call_llm_about_policy returns a list with one response per question,
# so pick the (only) response before reading its answer text.
print(responses_q3[0].choices[0].message.content)


In [None]:
responses_q4 = call_llm_about_policy(
    questions=[q4], data=regulation_list, deepseek_model="deepseek-chat"
)
# call_llm_about_policy returns a list with one response per question,
# so pick the (only) response before reading its answer text.
print(responses_q4[0].choices[0].message.content)

In [None]:
# Alternative model: "deepseek-reasoner"
deepseek_model = "deepseek-chat"
question = "From below list of JSON data, please list all policies that contain unnecessary content such as cross-references to other policies (there's a space for this in the policy documents, but not in the words of the policy itself)?:"
data = regulation_list
system_content = "I will send you data in any python list format. Each item in the data list contains one policy document record. Each record contains the following fields: 'name', 'scope', and 'content'. Please only answer question once you have read all the data."
# The question goes first, then the string form of the policy data.
user_question_content = f"{question}\n: {data}"
# `https://api.deepseek.com/v1` is also accepted as `base_url` for backward compatibility.
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")

# Single-turn conversation: one system instruction followed by one user question.
chat_messages = [
    {"role": "system", "content": system_content},
    {"role": "user", "content": user_question_content},
]
response = client.chat.completions.create(
    model=deepseek_model,
    messages=chat_messages,
    max_tokens=1024,
    temperature=0.7,
    stream=False,
)

# Show only the text of the first returned choice.
print(response.choices[0].message.content)

In [None]:
# 
# Plans & Strategies
# Policies
# Procedures
# Regulations
# Statutes
regulation_list = get_policy_type_details("Guidelines")
# Report how many records were collected ...
item_total = len(regulation_list)
print(f"There are {item_total} items in the list.")
# ... and how many tokens their string representation takes up.
token_total = count_tokens(str(regulation_list))
print(f"There are {token_total} tokens in the policy list.")
