In [7]:
from docx import Document
def read_docx(file_path):
    """Return the text of every paragraph in a .docx file, one paragraph per line.

    Args:
        file_path: Location of the Word document; passed unchanged to ``Document``.

    Returns:
        One string in which the ``text`` of each paragraph, in document order,
        is separated from the next by a newline character.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

# Absolute Windows path to one example report (machine-specific; adjust before running elsewhere).
# NOTE(review): this file name contains spaces, whereas the file indexed later is
# "KCCACT121523Notfatal.docx" — confirm which spelling exists on disk.
file_path = r"Y:\Yoshi\MIDS\Datasci209\Projects\FinalProject\Data\ExampleReports\KCCA CT 12 15 23 Not fatal.docx"
# Read the whole report into one newline-joined string.
# `text` is not referenced by any later cell shown in this notebook.
text = read_docx(file_path)

In [1]:
import logging
import sys

# Send INFO-and-above log records to stdout so they appear in the cell output.
# basicConfig() already attaches a stdout StreamHandler to the root logger when
# it has none; adding a second StreamHandler by hand made every record print
# twice (once with the "LEVEL:name:" prefix and once bare), so only
# basicConfig is used here.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from IPython.display import Markdown, display
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from guardrails.validators import ValidRange, ValidChoices



  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Load the example report with SimpleDirectoryReader; `input_files` names the
# single file to read (absolute, machine-specific path).
documents = SimpleDirectoryReader(
input_files = [r"Y:\Yoshi\MIDS\Datasci209\Projects\FinalProject\Data\ExampleReports\KCCACT121523Notfatal.docx"]
# Alternate example report; swap it with the line above to process that file instead.
#input_files = [r"Y:\Yoshi\MIDS\Datasci209\Projects\FinalProject\Data\ExampleReports\KCCA CT JL 2 25 22 Not Fatal.docx"]
).load_data()

In [15]:
# Embedding model passed to the vector index below: the "BAAI/bge-base-en-v1.5"
# model, loaded through the HuggingFaceEmbedding wrapper.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

In [16]:
# Build the vector index over the loaded report, embedding with `embed_model`.
# `chunk_size` is not a parameter of `from_documents`; passed as a bare keyword
# it is swallowed by **kwargs and the default chunking is used instead. The
# 512-token chunk size is therefore applied through an explicit
# SentenceSplitter transformation.
from llama_index.core.node_parser import SentenceSplitter

index = VectorStoreIndex.from_documents(
    documents,
    transformations=[SentenceSplitter(chunk_size=512)],
    embed_model=embed_model,
)

In [17]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
# Make the Ollama-served "mistral" model the global default LLM in Settings,
# with request_timeout set to 30.0.
Settings.llm = Ollama(model="mistral", request_timeout=30.0)

In [18]:
from llama_index.output_parsers.guardrails import GuardrailsOutputParser
from llama_index.legacy.llm_predictor import StructuredLLMPredictor
# Wrap a second Ollama "mistral" client (same arguments as Settings.llm) in the
# legacy StructuredLLMPredictor; its `.llm.complete` is later handed to the
# Guardrails output parser, and the predictor itself to the query engine.
llm_predictor = StructuredLLMPredictor(llm=Ollama(model="mistral", request_timeout=30.0))

  llm_predictor = StructuredLLMPredictor(llm=Ollama(model="mistral", request_timeout=30.0))


In [19]:
from llama_index.legacy.prompts import PromptTemplate
from llama_index.legacy.prompts.default_prompts import (
    DEFAULT_TEXT_QA_PROMPT_TMPL,
    DEFAULT_REFINE_PROMPT_TMPL,
)
from pydantic import BaseModel, Field
import guardrails as gd


In [20]:
# You can either define a RailSpec and initialise a Guard object from_rail_string()
# OR define Pydantic classes and initialise a Guard object from_pydantic()
# For more info: https://docs.guardrailsai.com/defining_guards/pydantic/
# Guardrails recommends Pydantic

class Point(BaseModel):
    # Output schema for one car-theft report. Each field's `description` is the
    # question posed for that field in the generated prompt.
    # Only `fatal` carries a validator; the other fields are unconstrained and
    # could be given validators in the same way.
    fatal: str = Field(
        description="did the car theft result in a fatality?",
        # Restrict the answer to 'Yes'/'No'; on_fail='reask' is the Guardrails
        # failure policy chosen for a non-matching value.
        validators=[ValidChoices(choices=['Yes','No'], on_fail='reask')]
    )
    date: str = Field(
        description="what date did the car theft occur?"
    )
    city: str = Field(
        description="what city did the car theft occur in?"
    )
    state: str = Field(
        description="what state did the car theft occur in?"
    )
    location: str = Field(
        description="business name of where the car theft occurred?"
    )
    victim_injuries: str = Field(
        description="did the victim sustain any injuries?"
    )
    victim_injury_type: str = Field(
        description="if the victim was injured, what type of injuries did the victim sustain?"
    )
    victim_count: int = Field(
        description="how many victims were there?"
    )
    victim_sex: str = Field(
        description="what was the sex of the victim?"
    )
    # NOTE(review): the field name is spelled `victime_ages`. It is also the key
    # in the model's JSON answer and the resulting DataFrame/CSV column, so
    # renaming it changes downstream output.
    victime_ages: str = Field(
        description="what were the ages of the victims?"
    )
    notes: str = Field(
        description="summary of what happened along with details of where the car theft occurred and how the child was alone in the car"
    )



# Prompt text handed to Guard.from_pydantic together with the Point schema.
# This is a plain (non-f) string, so the ${...} placeholders are not expanded by
# Python; presumably Guardrails substitutes the field schema and its JSON
# output instructions for them — confirm against the Guardrails docs.
prompt = """
Extract the information from the news article.

${output_schema}

${gr.json_suffix_prompt_v2_wo_none}
"""

In [21]:
# Build the Guardrails guard from the Point schema and the extraction prompt.
guard = gd.Guard.from_pydantic(output_class=Point, prompt=prompt)

# Create the LlamaIndex output parser around that guard; it is given the
# predictor's underlying `llm.complete` callable as its LLM.
output_parser = GuardrailsOutputParser(guard, llm=llm_predictor.llm.complete)

In [22]:
# A single output parser is shared by the QA prompt and the refine prompt
# (separate parsers per prompt would also work).
# Formatting a default template with the parser adds the output-format
# instructions to it.
fmt_qa_tmpl, fmt_refine_tmpl = (
    output_parser.format(template)
    for template in (DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL)
)

# Wrap each formatted template in a PromptTemplate bound to the same parser.
qa_prompt, refine_prompt = (
    PromptTemplate(formatted, output_parser=output_parser)
    for formatted in (fmt_qa_tmpl, fmt_refine_tmpl)
)

In [23]:
# Show the QA template after the output parser added its formatting
# instructions, to check what will be sent to the model.
print(fmt_qa_tmpl)

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 

ng name="fatal" description="did the car theft result in a fatality?" format="valid-choices: choices=['Yes', 'No']"/>
    <string name="date" description="what date did the car theft occur?"/>
    <string name="city" description="what city did the car theft occur in?"/>
    <string name="state" description="what state did the car theft occur in?"/>
    <string name="location" description="business name of where the car theft occurred?"/>
    <string name="victim_injuries" description="did the victim sustain any injuries?"/>
    <string name="victim_injury_type" description="if the victim was injured, what type of injuries did the victim sustain?"/>
    <integer name="victim_count" description="how many victims were there?"/>
    <string name="victim_sex" description="what was the sex of the victim?"

In [24]:
# Build a query engine over the index that uses the Guardrails-formatted QA and
# refine prompts.
# NOTE(review): the index comes from llama_index.core while `llm_predictor` and
# the prompt templates come from llama_index.legacy — confirm that
# `as_query_engine` in the installed version still honours `llm_predictor`.
query_engine = index.as_query_engine(
    text_qa_template=qa_prompt,
    refine_template=refine_prompt,
    llm_predictor=llm_predictor,
)
# Ask all extraction questions in a single query against the indexed report.
response = query_engine.query(
    "What city and state did the car theft occur in? Did the car theft result in a fatality? What date did the car theft occur? Where did the car theft occur? Did the victim sustain any injuries? What type of injuries did the victim sustain? How many victims were there",
)

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


In [25]:
# Print the query response to inspect the extracted fields.
print(response)

 {
"fatal": "No",
"date": "Dec. 16, 2023",
"city": "San Francisco",
"state": "California",
"location": "Cool Guys Market",
"victim_injuries": "N/A",
"victim_injury_type": "N/A",
"victim_count": 1,
"victim_sex": "Female",
"victime_ages": "2 years old",
"notes": "Someone stole a parked SUV with a sleeping toddler in the back seat from outside Cool Guys Market. The child was found crying in the vehicle 11 minutes later and appeared to be unharmed."
}


In [31]:
# Take the answer text from the response object; at this point it is still an
# unparsed string.
# Earlier attempt, kept for reference:
#structured_data = response.metadata
structured_data = response.response

In [32]:
# Check the type of the extracted answer before parsing it.
type(structured_data)

str

In [33]:
# Parse the model's JSON answer into a dictionary.
# Parsing from `response.response` rather than from `structured_data` itself
# keeps this cell re-runnable: the previous `json.loads(structured_data)`
# overwrote its own input, so a second run passed a dict to json.loads and
# raised TypeError.
import json
structured_data = json.loads(response.response)

In [34]:
# Turn the parsed dictionary into a one-row DataFrame: each key becomes a
# column, and index=[0] supplies the single row label.
import pandas as pd
df = pd.DataFrame(structured_data, index=[0])
# Display the row as the cell output.
df.head()

Unnamed: 0,fatal,date,city,state,location,victim_injuries,victim_injury_type,victim_count,victim_sex,victime_ages,notes
0,No,"Feb 25, 2022",Alameda,California,,,,1,,,The victim was a child who was found safe and ...


In [35]:
# Write the extracted row next to the report it came from.
# The output name is derived from the loaded document's own path, so it cannot
# drift from the input: the previous hard-coded name referred to the
# "JL 2 25 22" report while the reader was loading "KCCACT121523Notfatal.docx".
from pathlib import Path

# SimpleDirectoryReader records the source path in each document's metadata.
csv_path = Path(documents[0].metadata["file_path"]).with_suffix(".csv")
df.to_csv(csv_path, index=False)