In [None]:

import os
from dotenv import load_dotenv
import pandas as pd
import openai
import tiktoken
from tqdm import tqdm
import numpy as np
from openai.embeddings_utils import distances_from_embeddings


# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Read the API key from the environment instead of hard-coding it in the notebook.
# os.getenv returns None when OPENAI_API_KEY is unset, so later API calls would fail.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
%load_ext autoreload
%autoreload 2

In this notebook we explore using the Chat API instead of the Completion API, which should improve performance and reduce cost.

In [None]:

def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Build a context string for a question from the most similar texts in the dataframe.

    Args:
        question: Text to embed and compare against the stored embeddings.
        df: DataFrame with 'text', 'n_tokens' and 'embeddings' columns. It is
            not modified by this function.
        max_len: Token budget for the context. Each text costs its 'n_tokens'
            plus 4 (allowance for the separator).
        size: Accepted for backward compatibility; not used by this function.

    Returns:
        The selected texts, closest first, joined by "\\n\\n###\\n\\n". Empty
        string when even the closest text does not fit the budget.
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Rank on a copy: assign() returns a new frame, so the caller's df does not
    # silently gain (or have overwritten) a 'distances' column on every call.
    ranked = df.assign(
        distances=distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
    ).sort_values('distances', ascending=True)

    returns = []
    cur_len = 0

    # Walk the texts from closest to furthest until the token budget is exhausted
    for text, n_tokens in zip(ranked['text'], ranked['n_tokens']):

        # Add the length of the text to the current length
        cur_len += n_tokens + 4

        # Stop at the first text that would push the context over budget
        if cur_len > max_len:
            break

        returns.append(text)

    # Return the context
    return "\n\n###\n\n".join(returns)

In [None]:


def machine_risk_assessment(
    question,
    df,
    model="text-davinci-003",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Grade the risk for a missing person using the most similar context from the dataframe texts.

    Args:
        question: Free-text description of the missing person's circumstances.
        df: DataFrame passed through to create_context.
        model: Completion model name.
        max_len: Token budget passed to create_context.
        size: Passed through to create_context.
        debug: When True, print the question and the retrieved context.
        max_tokens: Maximum number of tokens in the completion.
        stop_sequence: Optional stop sequence(s) for the completion.

    Returns:
        A tuple (answer, context). `answer` is the stripped completion text,
        or "" if the completion request raised; `context` is the retrieved
        context in both cases, so callers can always unpack two values.
    """

    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )

    # No trailing comma after the closing quotes: a comma here would turn the
    # prompt into a one-element tuple instead of a string.
    missing_risk_prompt = f""" Using the information below on a missing person, decide on the appropriate risk grading for the person, from either
- High risk
- Medium risk
- Low risk
- no apparent risk

Return your answer in the format: 'Graded as X risk, because of the below risk factors:\n - Y \n - Z \n'

Where X is your risk grading (high, medium, low, or no apparent risk) and Y and Z are a few sentences explaining the most important risks you have identified.

if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:"""

    # If debug, print the question and the retrieved context
    if debug:
        print("Question:\n" + question)
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=missing_risk_prompt,
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )

        answer = response["choices"][0]["text"].strip()

        return answer, context
    except Exception as e:
        # Best effort: report the failure but keep the (answer, context) shape so
        # `answer, context = machine_risk_assessment(...)` does not raise on unpacking.
        print(e)
        return "", context

In [None]:
# Load the embeddings table; the retrieval functions in this notebook read its
# 'text', 'n_tokens' and 'embeddings' columns.
df = pd.read_parquet('processed/embeddings.parquet')
df

Unnamed: 0,text,n_tokens,embeddings
4431,.police.uk app public order core principles an...,393,"[0.002413947368040681, 0.016671480610966682, -..."
849,To co-create the training with practitioner st...,419,"[-0.010183705948293209, 0.008118431083858013, ..."
6612,The goal of problem analysis is to help you id...,482,"[0.010752184316515923, 0.018526222556829453, 0..."
6068,"Police data, investigation files, and intervie...",384,"[0.01356875617057085, 0.009414355270564556, 0...."
3432,"Of a racialist nature means consisting of, or ...",486,"[-0.012264551594853401, -0.008601714856922626,..."
...,...,...,...
7261,.police.uk app armed policing deployment autho...,405,"[-0.002368964720517397, 0.004855205304920673, ..."
6941,.police.uk cdn cgi l email protection#2d485f47...,203,"[-0.0013092802837491035, -0.01627718284726143,..."
8758,.police.uk article neighbourhood policing week...,484,"[0.003850934561342001, 0.014110091142356396, 0..."
6055,The neighbourhood role also enabled me to prot...,488,"[0.005274541676044464, 0.006226117257028818, -..."


In [None]:
# Example missing-person report used as the test case throughout this notebook.
about_yannik = """ Yannik is a 15 year old boy. He has recently been down, and was reported missing by his parents as he did not return home from school today.

His friends are worried he may be depressed, and when he apparently told one a few days ago 'if it doesn't get any better, I'm going to end it soon'
"""

# Baseline: grade the report with the completion-based function (returns answer and context).
yannik_answer, yannik_context = machine_risk_assessment(about_yannik, df)
yannik_answer

'Graded as High risk, because of the below risk factors: \n- Yannik is a 15 year old boy who has been reported missing by his parents\n- His friends are worried he may be depressed\n- He has expressed suicidal ideation to one of his friends'

Now let's modify our code to use the Chat API. Below is a test API call.

In [None]:
# Note: openai.ChatCompletion requires OpenAI Python v0.27.0 or later;
# NOTE(review): presumably not available in the 1.x client — confirm the pinned version.
# openai is already imported in the first cell, so this re-import is redundant but harmless.
import openai

# Minimal multi-turn example to check that the chat endpoint works.
test_response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": "Where was it played?"}
    ]
)

test_response

<OpenAIObject chat.completion id=chatcmpl-734PYCs3fvPxVf68NVINq4wOIu6xK> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "The 2020 World Series was played at Globe Life Field in Arlington, Texas.",
        "role": "assistant"
      }
    }
  ],
  "created": 1680966296,
  "id": "chatcmpl-734PYCs3fvPxVf68NVINq4wOIu6xK",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 17,
    "prompt_tokens": 57,
    "total_tokens": 74
  }
}

So, we need to decide how to fill in our bits of context. Of note, the OpenAI docs say that system messages aren't always picked up, so we can experiment. Let's start by testing the use of the system message to define the role and response format.

In [None]:
# System message for the chat model: defines the CopBot role, the four risk
# gradings with their definitions, and the required answer format.
# NOTE(review): "information provide" looks like a typo for "provided"; left
# unchanged because this text is sent to the model verbatim.
copbot_chat_content = '''
You are CopBot, an assistant designed to help police officers risk assess missing persons.

Using the information provide on a missing person, you will decide on the appropriate risk grading for the person, from either
- No apparent risk (when there is no apparent risk of harm to either the subject or the public.)
- Low risk (when the risk of harm to the subject or the public is assessed as possible but minimal)
- Medium risk (when the risk of harm to the subject or the public is assessed as likely but not serious.)
- High risk (when the risk of serious harm to the subject or the public is assessed as very likely.)

Risk assessment should be guided by the College of Policing Risk principles.

Return your answer in the format: 'Graded as X risk, because of the below risk factors:\n - Y \n - Z \n'

Where X is your risk grading (high, medium, low, or no apparent risk) and Y and Z are a few sentences explaining the most important risks you have identified.

if the question can't be answered based on the context, say \"I don't know\"'''


It looks like our content will be what was previously our context, so we'll modify our code to use that.

In [None]:

def create_chat_assistant_content(
    question, df, max_len=1800, size="ada"
):
    """
    Build the guidance message for the chat model from the texts most similar to the question.

    Args:
        question: Text to embed and compare against the stored embeddings.
        df: DataFrame with 'text', 'n_tokens' and 'embeddings' columns. It is
            not modified by this function.
        max_len: Token budget for the retrieved texts. Each text costs its
            'n_tokens' plus 4; the introductory header line is not counted.
        size: Accepted for backward compatibility; not used by this function.

    Returns:
        The header line followed by the selected texts, closest first, joined
        by "\\n\\n###\\n\\n". Only the header when no text fits the budget.
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Rank on a copy: assign() returns a new frame, so the caller's df does not
    # silently gain (or have overwritten) a 'distances' column on every call.
    ranked = df.assign(
        distances=distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
    ).sort_values('distances', ascending=True)

    returns = ["Here is some relevant guidance and documentation from the College of Policing"]
    cur_len = 0

    # Walk the texts from closest to furthest until the token budget is exhausted
    for text, n_tokens in zip(ranked['text'], ranked['n_tokens']):

        # Add the length of the text to the current length
        cur_len += n_tokens + 4

        # Stop at the first text that would push the context over budget
        if cur_len > max_len:
            break

        returns.append(text)

    # Return the context
    return "\n\n###\n\n".join(returns)

In [None]:
# Build the retrieved-guidance message for the Yannik report and inspect it.
yannik_assistant_content = create_chat_assistant_content(about_yannik, df)
yannik_assistant_content

"Here is some relevant guidance and documentation from the College of Policing\n\n###\n\n First published 22 November 2016  Updated 15 March 2023   Latest changes  Written by College of Policing  Missing persons  30 mins read   Implications for the UK leaving the European Union are currently under review – please see\xa0APP\xa0on international investigation\xa0for latest available detail on specific areas, for example: Schengen Information System Europol INTERPOL Joint Investigation Teams This section provides additional information to aid the investigation based on the vulnerability of the individual and the circumstances in which they are missing. Missing children Safeguarding young and vulnerable people is a responsibility of the police service and partner agencies (see\xa0Children Act 2004). When the police are notified that a child is missing, there is a clear responsibility on them to prevent the child from coming to harm. Where appropriate, a strategy meeting may be held. For fu

Right, that seems to work.  We now have user settings and content defined. Let's try our first API call.

In [None]:
# Show the report text that will be sent as the user message.
about_yannik

" Yannik is a 15 year old boy. He has recently been down, and was reported missing by his parents as he did not return home from school today.\n\nHis friends are worried he may be depressed, and when he apparently told one a few days ago 'if it doesn't get any better, I'm going to end it soon'\n"

In [None]:
# Note: openai.ChatCompletion requires OpenAI Python v0.27.0 or later;
# NOTE(review): presumably not available in the 1.x client — confirm the pinned version.
# openai is already imported in the first cell, so this re-import is redundant but harmless.
import openai

# System message sets the role and format, the user message carries the report,
# and the retrieved guidance is supplied as an assistant message.
test_response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
        {"role": "system", "content": copbot_chat_content},
        {"role": "user", "content": about_yannik},
        {"role": "assistant", "content": yannik_assistant_content},
    ]
)

test_response

<OpenAIObject chat.completion id=chatcmpl-734cnqHtj5vmtx1RxZtZT8RhUkgbQ> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Graded as high risk because of the below risk factors:\n\n- The missing person is a 15-year-old boy who might be suffering from depression.\n- He was reported missing by his parents, who may have concerns about his mental state.\n- According to one of his friends, he expressed suicidal thoughts a few days before going missing.\n\nAny missing person report involving a potential mental health emergency should be treated as high risk. The fact that Yannik is a teenager and may not have fully developed coping mechanisms adds to the seriousness of the situation. As such, every possible measure should be taken to locate him as quickly as possible, and his current state of mind should be taken into account in any interactions with him.",
        "role": "assistant"
      }
    }
  ],
  "created": 168096711

I think that worked! Let's build it into a function.

In [None]:
# Note: openai.ChatCompletion requires OpenAI Python v0.27.0 or later;
# NOTE(review): presumably not available in the 1.x client — confirm the pinned version.
# openai is already imported in the first cell, so this re-import is redundant but harmless.
import openai

# Same request as the previous cell, run again; test_response is overwritten
# and is read by the next cell.
test_response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
        {"role": "system", "content": copbot_chat_content},
        {"role": "user", "content": about_yannik},
        {"role": "assistant", "content": yannik_assistant_content},
    ]
)

test_response

<OpenAIObject chat.completion id=chatcmpl-734i2Qfm8STXFjPV66rQYhFcRUqmV> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Graded as high risk, because of the following risk factors:\n- Yannik has been reported missing after expressing suicidal ideation.\n- Yannik is a minor and his mental health is at risk, and he may not have the capacity to make decisions that ensure his safety. \n- The risk of harm to the public is low, but the risk of harm to the subject, Yannik, is very high.\n- It is imperative the police issue an immediate alert and work with Yannik's parents, school and other agencies who may have knowledge of his current location to swiftly locate him and provide mental health support, to help ensure his safety.",
        "role": "assistant"
      }
    }
  ],
  "created": 1680967442,
  "id": "chatcmpl-734i2Qfm8STXFjPV66rQYhFcRUqmV",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {


In [None]:
# The answer text is the message content of the first choice in the response.
test_response['choices'][0]['message']['content']

"Graded as high risk, because of the following risk factors:\n- Yannik has been reported missing after expressing suicidal ideation.\n- Yannik is a minor and his mental health is at risk, and he may not have the capacity to make decisions that ensure his safety. \n- The risk of harm to the public is low, but the risk of harm to the subject, Yannik, is very high.\n- It is imperative the police issue an immediate alert and work with Yannik's parents, school and other agencies who may have knowledge of his current location to swiftly locate him and provide mental health support, to help ensure his safety."

I'm also going to increase our context length, given that we're using the cheaper gpt-3.5-turbo model.

In [None]:

def create_chat_assistant_content(
    question, df, max_len=3600, size="ada"
):
    """
    Build the guidance message for the chat model from the texts most similar to the question.

    A function with the same name is defined earlier in this notebook with
    max_len=1800; once this cell runs, this definition replaces it.

    Args:
        question: Text to embed and compare against the stored embeddings.
        df: DataFrame with 'text', 'n_tokens' and 'embeddings' columns. It is
            not modified by this function.
        max_len: Token budget for the retrieved texts. Each text costs its
            'n_tokens' plus 4; the introductory header line is not counted.
        size: Accepted for backward compatibility; not used by this function.

    Returns:
        The header line followed by the selected texts, closest first, joined
        by "\\n\\n###\\n\\n". Only the header when no text fits the budget.
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Rank on a copy: assign() returns a new frame, so the caller's df does not
    # silently gain (or have overwritten) a 'distances' column on every call.
    ranked = df.assign(
        distances=distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
    ).sort_values('distances', ascending=True)

    returns = ["Here is some relevant guidance and documentation from the College of Policing"]
    cur_len = 0

    # Walk the texts from closest to furthest until the token budget is exhausted
    for text, n_tokens in zip(ranked['text'], ranked['n_tokens']):

        # Add the length of the text to the current length
        cur_len += n_tokens + 4

        # Stop at the first text that would push the context over budget
        if cur_len > max_len:
            break

        returns.append(text)

    # Return the context
    return "\n\n###\n\n".join(returns)

def copbot_chat_risk_assessment(individual_circumstances, df, show_return_details=False, show_context=False):
    """
    Risk-assess a missing person from a free-text description of their circumstances.

    Retrieves guidance relevant to the description, sends it to gpt-3.5-turbo
    together with the CopBot system message and the description itself, and
    returns the text of the model's reply.

    Args:
        individual_circumstances: Description of the missing person, sent as the user message.
        df: DataFrame forwarded to create_chat_assistant_content.
        show_return_details: When True, print the full API response.
        show_context: When True, print the retrieved guidance.

    Returns:
        The message content of the first choice in the chat response.
    """
    retrieved_guidance = create_chat_assistant_content(individual_circumstances, df)

    # Role and format first, then the report, then the retrieved guidance
    conversation = [
        {"role": "system", "content": copbot_chat_content},
        {"role": "user", "content": individual_circumstances},
        {"role": "assistant", "content": retrieved_guidance},
    ]
    chat_completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=conversation)

    # Optional diagnostics, each followed by a blank spacer
    spacer = '\n\n\n'
    if show_context:
        print(retrieved_guidance)
        print(spacer)
    if show_return_details:
        print(chat_completion)
        print(spacer)

    first_choice = chat_completion['choices'][0]
    return first_choice['message']['content']

    

Let's test that on some of our previous examples.

In [None]:
# Example reports used to exercise copbot_chat_risk_assessment.
# NOTE(review): the texts contain typos and inconsistencies ("accomodation",
# "15 year old adult male", "They that this is not"); left unchanged because
# they are sent to the model verbatim — confirm whether they are intentional.
margaret_risk_profile = """ Margaret is a 97 year old woman with severe dementia from Twickenham. She lives in supported accomodation, but regularly goes missing, as she walks out when left unsupervised.

She has been missing 6 hours, and it is now 2200.  It is getting dark, and staff are saying she is rarely missing this long"""

jason_risk_profile = """ Jason is a 15 year old adult male, who has gone missing from his care home in Southwark. His carer has contacted the school, which has said he was not in today.
They that this is not the first time, and that Jason has been seen hanging out with older boys, who may be involved in crime and drugs."""

john_risk_profile = """John Smith is a 31 year old man who has been missing for 5 hours. He went to work as normal this morning, and has not returned home, and his partner is concerned. There are no signs of foul play, no vulnerabilities. John is in good health, the weather is good, and there are no concerns for his welfare."""


In [None]:
# Run the chat-based assessment on Margaret's report, printing the retrieved
# guidance and the full API response before the answer is displayed.
# NOTE(review): the answer recorded below mentions heavy rain, which does not
# appear in margaret_risk_profile — check answers against the input report.
copbot_chat_risk_assessment(margaret_risk_profile, df, show_return_details=True, show_context=True)

Here is some relevant guidance and documentation from the College of Policing

###

.police.uk research projects maximizing effectiveness police scotland investigations when people living dementia go missing.            Maximizing the effectiveness of Police Scotland investigations when people living with dementia go missing | College of Policing             Sorry, you need to enable JavaScript to visit this website.    Skip to content Jump to search         Menu      Secondary navigation About us News & views Contact us  Search Search     Main navigation Policing guidance Research Career & learning Support for forces Ethics     Breadcrumb Home Research Research projects map           Maximizing the effectiveness of Police Scotland investigations when people living with dementia go missing            Maximizing the effectiveness of Police Scotland investigations when people living with dementia go missing         On this page     This research aims to explore the effectiveness of searc

"Graded as high risk, because of the following factors:\n\nMargaret, a 97-year-old woman with severe dementia, has been missing for six hours, and it is now getting dark. Staff report that she is rarely missing for this long, and given her vulnerable state and her history of walking out when left unsupervised, there is a significant risk of harm to herself. Additionally, Margaret's lack of awareness due to her dementia renders her liable to poor decision-making, and she could unknowingly put herself in danger, especially since it has been raining heavily. Immediate urgent action needs to be taken to locate Margaret as soon as possible to prevent any potential harm."