### Firstly, load the scrapped website data that was saved in json format.


In [5]:

# import all libraries
import json
import ollama
import tiktoken
import sys
import regex as re
# import sanitize_filename from miscellaneous.py in ../utilities folder
sys.path.insert(0, "../utilities/")
from miscellaneous import sanitize_filename
from transformers import pipeline

# Load a pre-trained sentiment analysis model
sentiment_analyzer = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

In [7]:
dict = {}
# read json file and save in policy_dict
with open("../output/otago_policies_20250206102803.json") as f:
    dict = json.load(f)

# then print all policy type in the dictionary key
for key in dict.keys():
    print(key)
    
# define selected llm model
selected_llm = "llama3.2"

Guidelines
Plans & Strategies
Policies
Procedures
Regulations
Statutes


In [8]:
# Create count_tokens function which takes in a text and encoding name as input and returns the number of tokens in the text.
def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """
    Count the number of tokens in a given text using the specified encoding.

    Args:
        text (str): The input text to count tokens for.
        encoding_name (str): The name of the encoding to use. Default is "cl100k_base" (used by GPT-4 and similar models).

    Returns:
        int: The number of tokens in the text.
    """
    # Get the encoding
    encoding = tiktoken.get_encoding(encoding_name)

    # Tokenize the text and count the tokens
    tokens = encoding.encode(text)
    return len(tokens)


In [9]:
# get policy type list and policy
def get_policy_type_details(policy_type: str) -> dict:
    """
    Get the details of a policy type from the Otago University policies.

    Args:
        policy_type (str): The name of the policy type to get details for.

    Returns:
        dict: The details of the policy type.
    """
    # Get the policy type details
    policy_list = []

    for item in dict[policy_type]:
        policy_dict = {}
        policy_dict["name"] = item["name"]
        policy_dict["scope"] = item["scope"]
        policy_dict["content"] = item["content"]
        # append the policy_dict to policy_list
        policy_list.append(policy_dict)
    return policy_list

### Secondly, prepare all questions need to ask LLM model.

In [10]:
q1 = "Below is the policy data showing in JSON format, please list all policies that contain unnecessary content such as cross-references to other policies (there's a space for this in the policy documents, but not in the words of the policy itself)?"

q2 = "Below is the policy data showing in JSON format, please list all policies that contain parts that are actually Procedures, not Policies (e.g. they contain instructions on how to do something, rather than rules or principles)?"

q3 = "Below is the policy data showing in JSON format, please list all policies that contain inconsistencies or structural flaws. For example, if a policy is missing a section that is present in other policies, or if a policy has a section that is not relevant to the policy type?"

q4 = "Below is the policy data showing in JSON format, please list all redundancy or incorrect structure of policy. It could be that there's a better way to structure the policy database, or that some policies are redundant or overlapping?"


### Then design functions to call Deepseek LLM model with api key


In [11]:
def call_llm_about_policy(question, data, model="deepseek-r1:14b"):
    # check number of questions
    system_content = "Below is a JSON format data contains one policy document record. Each record contains the following fields: 'name', 'scope', and 'content'. Please only answer question once you have read all the data."
    
    user_question_content = f"{question}\n: {str(data)}"
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_question_content},
    ]
    response = ollama.chat(
        model=model, messages=messages
    )
    

    # Extract and print the final response
    if response:
        return response
    else: 
        # raise an exception
        raise Exception("No response from the model")
        return None

In [12]:
def update_policy_list(policy_name: str, model_name):    
    policy_list = get_policy_type_details(policy_name)
    # show number of policy_list
    print(f"There are {len(policy_list)} items in the list.")
    # show number of policy_list tokens
    print(f"There are {count_tokens(str(policy_list))} tokens in the policy list.")
    total_policies = len(policy_list)
    processed_document = 0
    for policy in policy_list:
        # ask question 1
        response = call_llm_about_policy(q1, policy, model=model_name)
        match = re.search(r"</think>\n*(.*)", response.message.content)
        policy_list[processed_document]["q1_response"] = match.group(1) if match else response.message.content
        
        # ask question 2
        response = call_llm_about_policy(q2, policy, model=model_name)
        match = re.search(r"</think>\n*(.*)", response.message.content)
        policy_list[processed_document]["q2_response"] = match.group(1) if match else response.message.content
        
        
        # ask question 3
        response = call_llm_about_policy(q3, policy, model=model_name)
        match = re.search(r"</think>\n*(.*)", response.message.content)
        policy_list[processed_document]["q3_response"] = match.group(1) if match else response.message.content
        
        
        # ask question 4
        response = call_llm_about_policy(q4, policy, model=model_name)
        match = re.search(r"</think>\n*(.*)", response.message.content)
        policy_list[processed_document]["q4_response"] = match.group(1) if match else response.message.content
        
        processed_document += 1
        
        print(f"Processed {processed_document} of {total_policies} documents.")
    # printing out updated policy_list in JSON
    with open(f"./{sanitize_filename(f"{policy_name}_{model_name}")}.json", "w") as f:
        json.dump(policy_list, f, indent=4)
    print(
        f"Updated policy list has been saved to {policy_name}_{model_name}_updated.json"
    )

In [8]:
update_policy_list("Guidelines", model_name=selected_llm)


There are 54 items in the list.
There are 75481 tokens in the policy list.
Processed 1 of 54 documents.
Processed 2 of 54 documents.
Processed 3 of 54 documents.
Processed 4 of 54 documents.
Processed 5 of 54 documents.
Processed 6 of 54 documents.
Processed 7 of 54 documents.
Processed 8 of 54 documents.
Processed 9 of 54 documents.
Processed 10 of 54 documents.
Processed 11 of 54 documents.
Processed 12 of 54 documents.
Processed 13 of 54 documents.
Processed 14 of 54 documents.
Processed 15 of 54 documents.
Processed 16 of 54 documents.
Processed 17 of 54 documents.
Processed 18 of 54 documents.
Processed 19 of 54 documents.
Processed 20 of 54 documents.
Processed 21 of 54 documents.
Processed 22 of 54 documents.
Processed 23 of 54 documents.
Processed 24 of 54 documents.
Processed 25 of 54 documents.
Processed 26 of 54 documents.
Processed 27 of 54 documents.
Processed 28 of 54 documents.
Processed 29 of 54 documents.
Processed 30 of 54 documents.
Processed 31 of 54 documents.
Proc

In [9]:
update_policy_list("Plans & Strategies", model_name=selected_llm)

There are 2 items in the list.
There are 87 tokens in the policy list.
Processed 1 of 2 documents.
Processed 2 of 2 documents.
Updated policy list has been saved to Plans & Strategies_llama3.2_updated.json


In [10]:
update_policy_list("Policies", model_name=selected_llm)


There are 119 items in the list.
There are 148288 tokens in the policy list.
Processed 1 of 119 documents.
Processed 2 of 119 documents.
Processed 3 of 119 documents.
Processed 4 of 119 documents.
Processed 5 of 119 documents.
Processed 6 of 119 documents.
Processed 7 of 119 documents.
Processed 8 of 119 documents.
Processed 9 of 119 documents.
Processed 10 of 119 documents.
Processed 11 of 119 documents.
Processed 12 of 119 documents.
Processed 13 of 119 documents.
Processed 14 of 119 documents.
Processed 15 of 119 documents.
Processed 16 of 119 documents.
Processed 17 of 119 documents.
Processed 18 of 119 documents.
Processed 19 of 119 documents.
Processed 20 of 119 documents.
Processed 21 of 119 documents.
Processed 22 of 119 documents.
Processed 23 of 119 documents.
Processed 24 of 119 documents.
Processed 25 of 119 documents.
Processed 26 of 119 documents.
Processed 27 of 119 documents.
Processed 28 of 119 documents.
Processed 29 of 119 documents.
Processed 30 of 119 documents.
Pr

In [11]:
update_policy_list("Procedures", model_name=selected_llm)

There are 61 items in the list.
There are 82778 tokens in the policy list.
Processed 1 of 61 documents.
Processed 2 of 61 documents.
Processed 3 of 61 documents.
Processed 4 of 61 documents.
Processed 5 of 61 documents.
Processed 6 of 61 documents.
Processed 7 of 61 documents.
Processed 8 of 61 documents.
Processed 9 of 61 documents.
Processed 10 of 61 documents.
Processed 11 of 61 documents.
Processed 12 of 61 documents.
Processed 13 of 61 documents.
Processed 14 of 61 documents.
Processed 15 of 61 documents.
Processed 16 of 61 documents.
Processed 17 of 61 documents.
Processed 18 of 61 documents.
Processed 19 of 61 documents.
Processed 20 of 61 documents.
Processed 21 of 61 documents.
Processed 22 of 61 documents.
Processed 23 of 61 documents.
Processed 24 of 61 documents.
Processed 25 of 61 documents.
Processed 26 of 61 documents.
Processed 27 of 61 documents.
Processed 28 of 61 documents.
Processed 29 of 61 documents.
Processed 30 of 61 documents.
Processed 31 of 61 documents.
Proc

In [12]:
update_policy_list("Regulations", model_name=selected_llm)

There are 9 items in the list.
There are 13124 tokens in the policy list.
Processed 1 of 9 documents.
Processed 2 of 9 documents.
Processed 3 of 9 documents.
Processed 4 of 9 documents.
Processed 5 of 9 documents.
Processed 6 of 9 documents.
Processed 7 of 9 documents.
Processed 8 of 9 documents.
Processed 9 of 9 documents.
Updated policy list has been saved to Regulations_llama3.2_updated.json


### After policies are analyzed by the LLM mode, we will perform a sentiment analysis on each policy's 4 questions' response and then mark down the positive and negative sentiment score for each policy.

In [None]:
policy_name = "Regulations"
with open(f"./{sanitize_filename(f'{policy_name}_{selected_llm}')}.json", "r") as f:
    policy_list = json.load(f)

total_policies = len(policy_list)
processed_document = 0
for policy in policy_list:
    # determine the sentiment of the q1_response
    q1_polarity = TextBlob(policy["q1_response"]).sentiment.polarity
    policy_list[processed_document]["q1_positiveness"] = (
        "positive" if q1_polarity > 0 else "negative" if q1_polarity < 0 else "neutral"
    )
    q2_polarity = TextBlob(policy["q2_response"]).sentiment.polarity
    policy_list[processed_document]["q2_positiveness"] = (
        "positive" if q2_polarity > 0 else "negative" if q2_polarity < 0 else "neutral"
    )
    q3_polarity = TextBlob(policy["q3_response"]).sentiment.polarity
    policy_list[processed_document]["q3_positiveness"] = (
        "positive" if q3_polarity > 0 else "negative" if q3_polarity < 0 else "neutral"
    )
    q4_polarity = TextBlob(policy["q4_response"]).sentiment.polarity
    policy_list[processed_document]["q4_positiveness"] = (
        "positive" if q4_polarity > 0 else "negative" if q4_polarity < 0 else "neutral"
    )
    processed_document += 1
    
print(f"Processed {processed_document} of {total_policies} documents.")

# printing out updated policy_list in JSON
# with open(f"./{sanitize_filename(f'{policy_name}_{selected_llm}')}_updated.json", "w") as f:
#     json.dump(policy_list, f, indent=4)
print(policy_list)

Processed 9 of 9 documents.
