# Generating Knowledge Graphs from HKLII Verdicts

This notebook contains sample code and the actual prompts needed to download verdicts from the HKLII database and extract them into a specific schema, so that they can be loaded into a graph database such as Neo4j.


In [1]:
import json
import os

from langchain.document_loaders import SeleniumURLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import Chroma

# The API key is read from the environment instead of being pasted into the
# notebook, so it cannot leak through a shared or committed copy of this file.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

# Path for a Chroma store; no cell shown in this notebook reads it.
persist_directory = './docs/chroma/'

# Completion model used by every extraction prompt below. temperature=0 is
# presumably chosen to keep the extracted JSON as repeatable as possible.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Scratch lists; not referenced by the cells shown in this notebook.
answer = []
prompt = []



# SUMMARIZATION

### Option #1: Function to summarize the verdict with LangChain 


In [6]:

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document

def summarize(text_to_be_summarized):
    """Summarize a verdict with LangChain's "stuff" summarization chain.

    The text is wrapped in a single Document and passed to a chain built on
    the notebook-level `llm`.

    Args:
        text_to_be_summarized: the verdict text to summarize.

    Returns:
        Whatever the chain's `run` call returns for that single document.
    """
    summarize_chain = load_summarize_chain(llm, chain_type="stuff")
    wrapped_verdict = Document(page_content=text_to_be_summarized)
    return summarize_chain.run([wrapped_verdict])

### Option #2: Prompt to summarize the verdict

In [7]:
# Alternative to summarize(): a plain instruction prompt asking for a
# 1500-word summary. The verdict text is not part of this string, and no cell
# shown in this notebook uses the variable -- presumably the caller appends
# the verdict after it.
summarize_prompt = """

Summarize the JUDICIAL VERDICT below in 1500 words, with details
such as the defendants and victims name, specifics of the crime, etc.  
"""

# PROMPTS

### CrimePrompt


In [8]:
# Instruction prompt for the crime-level properties (case id, location,
# weapon, evidence, type, ...). The extraction loop concatenates it with the
# verdict text and closing_prompt; the trailing "###" opens the delimiter that
# closing_prompt closes. The ".." entries in the JSON template are
# placeholders for the model to fill in.
crime_prompt = """

From the Hongkong JUDICIAL VERDICT, extract the properties of the crime as instructed below:

"caseid" : from the verdict, extract the caseid,
"case_datetime": what was the date and time of the case,
"location": from the verdict, extract the location of the crime,
"is_remote": from the verdict, extract whether the location of the crime was secluded. return yes or no,
"crime_datetime": from the verdict, extract the the date and time of the crime,
"weapon": from the verdict, extract the weapons used in the crime ,
"more_killed": from the verdict, extract whether the crime had more than 1 victim, return yes or no,
"more_wounded": from the verdict, extract whether the crime had more than 1 victim wounded, return yes or no,
"Evidence_collected": the list of the evidences in this crime,
"type": the nature of the crime, like robbery, homicide, murder, etc.

Format the output in the the following JSON structure


{
  "crime": {

"caseid":  "..", 
"case_datetime":  "..", 
"location":  "..",
"is_remote":  "..",
..
  }
}

JUDICIAL VERDICT: ###

"""


In [9]:
# Instruction prompt for the narrative crime details: the timeline of events
# and the presiding judge. The extraction loop concatenates it with the
# verdict text and closing_prompt; the trailing "###" opens the delimiter that
# closing_prompt closes.
# The JSON template below must itself be valid JSON: the model tends to mirror
# the example it is shown, and the loop validates the answer with json.loads.
crime_details_prompt = """

From the Hongkong JUDICIAL VERDICT, extract the properties of the crime as instructed below:

"timeline": from the verdict,  list the timeline of events of the crime,
"judge" : from the verdict, extract the presiding judge of the case. 

Format the output in the the following JSON structure


{
  "crime details": {

"timeline":  "..", 
"judge": ".."
  }
}

JUDICIAL VERDICT: ###

"""


### Victim Prompt


In [10]:
# Instruction prompt for the victim entity (identity, health, drug use,
# relationship to the suspect, employment). The extraction loop concatenates
# it with the verdict text and closing_prompt.
# NOTE(review): the text asks for the properties of every victim, but the JSON
# template shows a single "victim" object -- confirm which shape the model is
# expected to return when there are several victims.
victim_prompt = """
From the Hongkong JUDICIAL VERDICT, extract the properties of the victim entity as instructed below. 
If there are multiple victims, then extract these properties for each victim. 

"name": from the verdict, extract  full name of the victim.  respond with the name, or unknown.
"age": from the verdict, extract the age of the victim at the time of the crime. If unavailable, get any mentioned age of the victim. 
"gender" : from the verdict, extract the gender of the victim as male or female.
"origin" : from the verdict, extract the origin location of the victim.
"marital_status": from the verdict, extract marital status of the victim, return married or single or divorced.
"is_ill": from the verdict, extract whether the victim was chronically ill at the time of the crime. return yes or no.
"illness_indicators": from the verdict, extract list of key phrases from the verdict that describe the health of the victim.
"on_drugs": from the verdict, extract whether the victim was abusing drugs at the time of the crime. return yes or no.
"drug_indicators": from the verdict, extract list of key phrases from the verdict that describe drug abuse of the victim.
"is_related": from the verdict, extract whether the victim is related to the suspect, return yes or no.
"relationship": from the verdict, extract the relationship between victim and suspect.
"any_employment": from the verdict, extract whether the victim was employed at the time of the crime. return yes or no.
"employment_indications": from the verdict, extract list of phrases that describe employment of the victim.


Format the output in the the following JSON structure


{
  "victim": {
  "name" : "..",
  "age" : "..",
  ...
  }
}

JUDICIAL VERDICT: ###

"""

### Suspect Prompt


In [11]:
# Instruction prompt for the suspect entity (identity, health, drug and
# alcohol use, employment, finances, prior record, motive, suicide, and
# whether the suspect knew the victim). The extraction loop concatenates it
# with the verdict text and closing_prompt.
# NOTE(review): "details of prior criminal record" is the only key containing
# spaces; every other key is snake_case -- confirm downstream consumers expect
# this exact key.
suspect_prompt = """
From the Hongkong JUDICIAL VERDICT, extract the properties of the suspect entity as instructed below:

"name": from the verdict text, extract the full name of the suspect.
"age": from the verdict text, extract the age of the suspect at the time of the crime. If unavailable, get any mentioned age of the suspect.
"gender" : from the verdict text, extract the gender of the suspect as male or female.
"origin" :  from the verdict text, extract the origin location of the suspect.
"marital_status": from the verdict text, extract the marital status of the suspect, return married or single or divorced or unknown.
"is_ill": from the verdict text, extract whether the suspect was chronically sick at the time of the crime. return yes or no or unknown.
"illness_indicators": extract list of key phrases from the verdict text that describe the health of the suspect.
"on_drugs": From the verdict, extract whether the suspect was abusing drugs at the time of the crime. return yes or no or unknown.
"drug_indicators": extract list of key phrases from the verdict that describe drug abuse of the suspect.
"on_alcohol": from the verdict text, extract whether the suspect was under the influence of alcohol at the time of the crime. return yes or no or unknown.
"alcohol_indicators": from the verdict text,extract list of key phrases that describe alcohol abuse of the suspect.
"employment": from the verdict text, extract whether the suspect was employed at the time of the crime. return yes or no or unknown. 
"employment_indicators": extract list of key phrases that describe employment of the suspect.
"money_trouble": from the verdict text, extract whether the suspect had any financial problems at the time of the crime. return yes or no or unknown. 
"money_trouble_indicators": from the verdict text, extract list of key phrases from the verdict that describe the financial problems of the suspect.
"has_prior_records": From the verdict, extract whether the suspect had any prior criminal record at the time of the crime. return yes or no or unknown. 
"details of prior criminal record": details on the prior criminal record of the suspect.
"motive": From the verdict, extract the key motive for the crime.
"motive_indicator": From the verdict, extract list of key phrases from the verdict that indicate the motive for the crime.
"committed_suicide":  from the verdict, extract whether the suspect committed suicide after the crime. return yes or no or unknown.
"suicide_indicator":  extract list of key phrases from the verdict that indicate if the suspect committed suicide.
"know_victim": did the suspect know the victim before the crime, return yes or no or unknown. 




Format the output in the the following JSON structure


{
  "suspect": {
  "name" : "..",
  "age" : "..",
  ...
  }
}

JUDICIAL VERDICT: ###

"""

# Suffix appended after the verdict text in every extraction prompt: it closes
# the "###" delimiter each prompt ends with and cues the model to answer with JSON.
closing_prompt = " ### \n OUTPUT JSON: "


## The set of URLs where the judicial verdicts are available

In [None]:
# Verdict pages to download, all from the Court of First Instance (hkcfi)
# section of the site. Each reference is "<year>/<case number>".
_HKCFI_BASE_URL = "https://www.hklii.hk/en/cases/hkcfi"
_HKCFI_CASE_REFS = (
    "2018/2859",
    "2016/1974",
    "2018/209",
    "2016/1193",
    "2016/1189",
    "2015/2390",
    "2016/1413",
    "2018/2311",
    "2018/2857",
    "2013/1669",
    "2017/2213",
    "2018/604",
    "2019/1151",
    "2017/525",
    "2015/2305",
    "2016/364",
    "2019/223",
    "2020/2634",
)
urls = ["{}/{}".format(_HKCFI_BASE_URL, case_ref) for case_ref in _HKCFI_CASE_REFS]



### Load the data using Selenium

In [13]:

# Load every page listed in `urls` through Selenium. `data` holds the loaded
# documents; the extraction loop reads each one's `.page_content` and
# `.metadata['source']`.
loader = SeleniumURLLoader(urls)
data = loader.load()


## Run through all the loaded verdicts and prompt the model to get the structured data in JSON format

In [14]:
import pandas as pd

# Optional cap on how many characters of each verdict are sent to the model.
# None sends the whole text; a value such as 3500 * 4 truncates long verdicts,
# which is one way to keep a prompt within the model's context window.
MAX_CASE_CHARS = None


def _json_status(text):
    """Return "Good" when `text` parses as JSON, otherwise "Bad"."""
    try:
        json.loads(text)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string input.
        return "Bad"
    return "Good"


# Result columns. A verdict contributes one entry to every list below, or to
# none of them, so all lists stay the same length and line up row by row.
case = []
case_url = []

cp = []
crime = []
crime_json = []

cdp = []
crime_details = []
crime_details_json = []

vp = []
victim = []
victim_json = []

sp = []
suspect = []
suspect_json = []

# One row per extraction:
# (prompt template, prompts sent, raw model answers, "Good"/"Bad" JSON flags).
_extractions = (
    (crime_prompt, cp, crime, crime_json),
    (crime_details_prompt, cdp, crime_details, crime_details_json),
    (victim_prompt, vp, victim, victim_json),
    (suspect_prompt, sp, suspect, suspect_json),
)

for index, document in enumerate(data):
    # Resolve the source first so a failure message can always name the verdict.
    source = document.metadata.get('source', "document #" + str(index))

    try:
        usable_case_text = document.page_content[:MAX_CASE_CHARS]
        requests = [
            template + usable_case_text + closing_prompt
            for template, _, _, _ in _extractions
        ]
        # The four model calls run in the same order as the rows of _extractions.
        responses = [llm.predict(request) for request in requests]
    except Exception as error:
        # Skip this verdict but report the real cause. An over-long prompt is
        # only one possible failure; network or API errors land here too.
        print("Skipping " + str(source) + " (" + type(error).__name__ + "): " + str(error))
        continue

    # Everything succeeded -- so now we can append.
    case.append(usable_case_text)
    case_url.append(source)
    for (_, prompts_sent, answers, statuses), request, response in zip(
        _extractions, requests, responses
    ):
        prompts_sent.append(request)
        answers.append(response)
        statuses.append(_json_status(response))


# Collect the per-verdict results into one table and write it out for review.
case_file = {'case': case, 'url' : case_url,'crime':crime, 'crime_json':crime_json,'crime_details':crime_details,
             'crime_details_json':crime_details_json,'victim':victim, 'victim_json':victim_json,
             'suspect':suspect, 'suspect_json':suspect_json}
df = pd.DataFrame(case_file)
df.to_csv("NODES23.csv")
print("DONE")

DONE


### Validate the CSV file to ensure the output JSONs are correct;

#### if not, update and correct them / tune the prompt appropriately.

------------