In [1]:
!pip install sentence-transformers
!pip install datasets
!pip install pinecone-client
!pip install cohere
!pip install tqdm




In [2]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


In [3]:
 # Credentials are read from the environment so that no secret is stored in the notebook.
 # Export PINECONE_API_KEY and COHERE_API_KEY before starting the kernel.
 # SECURITY: the literal keys that used to live in this cell were committed in clear text
 # and must be considered leaked -- revoke/rotate them in the Pinecone and Cohere consoles.
 # A missing variable yields None here; the client constructors further down are the
 # place where an absent key will surface as an error.
 PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
 COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

In [4]:

# Name of the sentence-transformers checkpoint used for every embedding in this notebook.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Instantiate the encoder once; this `model` object is passed explicitly to the
# dataset-embedding and prompt-augmentation helpers in later cells.
model = SentenceTransformer(EMBEDDING_MODEL)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
def load_and_embedd_dataset(
        dataset_name: str = 'yvonne90190/NBA_salary_advanced_stats',
        split: str = 'train',
        model: "SentenceTransformer | None" = None,
        text_field: str = 'notes',
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embed the first `rec_num` values of its text field.

    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The sentence-transformer model used for embedding. When omitted,
            an 'all-MiniLM-L6-v2' model is created on the first call that needs it
            (it is no longer built as a side effect of defining this function).
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embed
    Returns:
        tuple: `(dataset, embeddings)` -- the *full* loaded dataset and the
        embeddings of its first `rec_num` `text_field` values, as returned by
        `model.encode`.
    """
    # Local import (kept from the original, now only once) so the function does
    # not depend on which notebook cells have already been executed.
    from datasets import load_dataset

    if model is None:
        # Build the default encoder lazily: a default argument would be evaluated
        # at definition time and load a second copy of the model eagerly.
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('all-MiniLM-L6-v2')

    print("Loading and embedding the dataset")

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)
    print(dataset)

    # Slice the rows first and then pick the column, so that only `rec_num`
    # records are read instead of the entire text column being materialised
    # before the slice is taken.
    texts = dataset[:rec_num][text_field]
    embeddings = model.encode(texts)

    print("Done!")
    return dataset, embeddings

In [6]:
# Hugging Face dataset that serves as the knowledge base for this notebook.
DATASET_NAME = "vinaykudari/acled-information-extraction"

# Embed only the first 40 records with the shared `model`; `dataset` is the full split.
dataset, embeddings = load_and_embedd_dataset(dataset_name=DATASET_NAME, model=model, rec_num=40)

# Kept for later cells: shape[1] is used as the index dimension.
shape = embeddings.shape

Loading and embedding the dataset


Downloading metadata:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.96M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/82758 [00:00<?, ? examples/s]

Dataset({
    features: ['location', 'fatalities', 'notes', 'target_text'],
    num_rows: 82758
})
Done!


In [7]:
# Convert the loaded dataset to a pandas DataFrame for a quick look at its columns.
pd_dataset = dataset.to_pandas()
# Show the first five rows only -- never dump the whole frame.
print(pd_dataset.head(5))
# Sanity check: (number of embedded records, embedding width).
print(f"The embeddings shape: {embeddings.shape}")

    location  fatalities                                              notes  \
0  Jacobabad           3  Three people were killed while 27 others injur...   
1     Baidoa           0  Government security forces opened fire at a pr...   
2      Hudur          11  Al Shabaab forces attacked government forces b...   
3     Bamako           0  Agreement: Rebels resumed peace talks with the...   
4    Ganjoni           0  Unidentified assailants launched an attack tar...   

                          target_text  
0  location: Jacobabad, fatalities: 3  
1     location: Baidoa, fatalities: 0  
2     location: Hudur, fatalities: 11  
3     location: Bamako, fatalities: 0  
4    location: Ganjoni, fatalities: 0  
The embeddings shape: (40, 384)


In [8]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        ready_timeout: float = 120.0,
):
    """
    Create a pinecone index if it does not exist and wait until it is ready
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        ready_timeout: Maximum number of seconds to wait for the index to report
            that it is ready before giving up
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    Raises:
        TimeoutError: If the index is still not ready after `ready_timeout` seconds
    """
    import time

    from pinecone import Pinecone, ServerlessSpec

    print("Creating a Pinecone index...")
    # PINECONE_API_KEY is the module-level constant defined in the credentials cell.
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )

    # Index creation is asynchronous: poll until the index reports ready so that
    # the caller can upsert immediately without racing against provisioning.
    deadline = time.monotonic() + ready_timeout
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after {ready_timeout} seconds"
            )
        time.sleep(1)

    print("Done!")
    return pc

In [9]:
# Name of the Pinecone index that stores the embedded notes.
INDEX_NAME = 'acled-information-extraction'

# Create the vector database (a no-op if an index with this name already exists).
# The index dimension must equal the embedding width, i.e. shape[1] of the
# embeddings array computed above.
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


In [10]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'notes',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index handle (as returned by `pc.Index(name)`); it
            must expose `upsert(vectors=...)`
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; `dataset[text_field]` must
            yield at least as many texts as there are embedding rows, in the
            same order
        text_field: The field whose text is stored as metadata next to each vector
        batch_size: The batch size to use for upserting
    Returns:
        The same index object that was passed in, after all batches were upserted
    Raises:
        ValueError: If `batch_size` is not positive, or if the dataset provides
            fewer texts than there are embeddings (previously such vectors were
            silently dropped)
    """
    from itertools import islice

    print("Upserting the embeddings to the Pinecone index...")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    num_vectors = embeddings.shape[0]

    # Only the first `num_vectors` texts have an embedding, so stop there instead
    # of building a metadata dict for every row of the dataset.
    texts = list(islice(dataset[text_field], num_vectors))
    if len(texts) != num_vectors:
        raise ValueError(
            f"dataset provides {len(texts)} '{text_field}' values "
            f"but {num_vectors} embeddings were given"
        )

    # create list of (id, vector, metadata) tuples to be upserted;
    # vectors are converted to plain lists of Python floats
    to_upsert = [
        (str(row), embeddings[row].tolist(), {text_field: text})
        for row, text in enumerate(texts)
    ]

    for start in tqdm(range(0, num_vectors, batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [11]:
# Upsert the embeddings to the Pinecone index
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 1/1 [00:00<00:00,  1.07it/s]


In [16]:
# Verify the upsert: the stats report the index dimension and the vector count per namespace.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 40}},
 'total_vector_count': 40}

In [24]:
import cohere

# Baseline: ask the LLM the questions directly, without any retrieved context.
#First lets write a query for the LLM
query1 = "how much people were killed when a Peshawar-bound train hit a bomb planted by unidentified militants on railway tracks in Tul town in Jacobabad district in Sindh?"
query2 = "When did An ULFA (I) militant was killed in an encounter with security forces in Assam's Udalguri district?"
query3 = "where did FRPC-MPC and UPC fighters have engaged in clashes between Feb.4-7, resulting in at least 20 people killed (fatalities coded across 4 events). The UPC fighters allegedly withdrew from the areas to Bamodo, but later took the cities back?"
query4 = "Where did Two people have been killed and eight others wounded after Puntland police engaged in a gunfight with unidentified gunmen?"
queries = [query1, query2, query3, query4]

# Create the client once instead of once per query. This also guarantees that `co`
# exists for the later RAG cell even if `queries` were empty.
co = cohere.Client(api_key=COHERE_API_KEY)
for q in queries:
    response = co.chat(
        model='command-r-plus',
        message=q,
    )
    print(response.text)

At least 17 people were killed and over 40 others injured when a Peshawar-bound train hit a bomb planted by unidentified militants on railway tracks in Tul town in Pakistan's Sindh province.
An ULFA (I) militant was killed in an encounter with security forces in Assam's Udalguri district on June 29, 2024.
Bam, Kongoya, and Bourou in Oudalan Province, Burkina Faso.
Two people were killed and eight others wounded in the Somali city of Galkayo when Puntland police engaged in a gunfight with unidentified gunmen on June 30, 2024. The incident occurred in the southern Galkayo district of Tudhow, which is part of the larger Puntland region in Somalia. The area has been the scene of recent clashes between rival clan militias, and the situation remains tense as security forces work to stabilize the region.


In [25]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
        top_k: int = 3,
        text_field: str = 'notes',
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The sentence-transformer model used to embed the query. When
            omitted, an 'all-MiniLM-L6-v2' model is created on demand (it is no
            longer built as a side effect of defining this function).
        index: The vectorstore object; must expose `query(...)`. Required.
        top_k: How many nearest matches to retrieve (3 by default)
        text_field: The metadata key holding the text of each match
    Returns:
        tuple: `(augmented_prompt, source_knowledge)` -- the prompt to send to
        the LLM and the retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is supplied
    """
    if index is None:
        # Fail early with a clear message rather than an AttributeError on None.
        raise ValueError("augment_prompt requires a vector index; pass index=<pinecone index>")

    if model is None:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The query vector is sent as a plain list of Python floats.
    query_vector = [float(val) for val in model.encode(query)]

    # get the top results from the knowledge base; only the metadata is used
    # below, so the stored vectors themselves are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][text_field] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [26]:
# Same four questions as the baseline run, now answered with retrieved context.
query1 = "how much people were killed when a Peshawar-bound train hit a bomb planted by unidentified militants on railway tracks in Tul town in Jacobabad district in Sindh?"
query2 = "When did An ULFA (I) militant was killed in an encounter with security forces in Assam's Udalguri district?"
query3 = "where did FRPC-MPC and UPC fighters have engaged in clashes between Feb.4-7, resulting in at least 20 people killed (fatalities coded across 4 events). The UPC fighters allegedly withdrew from the areas to Bamodo, but later took the cities back?"
query4 = "Where did Two people have been killed and eight others wounded after Puntland police engaged in a gunfight with unidentified gunmen?"
queries = [query1, query2, query3, query4]

for q in queries:
    # Retrieve the closest notes from the index and wrap them around the question.
    augmented_prompt, source_knowledge = augment_prompt(q, model=model, index=index)
    # `co` is the Cohere client created in the baseline cell.
    response = co.chat(model='command-r-plus', message=augmented_prompt)
    print(response.text)

Three people were killed in the train bombing in Tul town, Jacobabad district, Sindh.
The ULFA (I) militant was killed on December 1.
The FRPC-MPC and UPC fighters clashed in the Ndassima area between February 4 and 7, resulting in at least 20 fatalities.
The gunfight between Puntland police and unidentified gunmen, which resulted in two deaths and eight injuries, took place in Gaalkacyo town, central Somalia.
