# Context

In [1]:
!pip install transformers bitsandbytes accelerate datasets outlines scikit-learn
!pip install pykeen
!pip install fsspec==2024.6.0
!pip install -U transformers accelerate
!pip install faiss-cpu

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting outlines
  Downloading outlines-0.1.14-py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting interegular (from outlines)
  Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)
Collecting lark (from outlines)
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Collecting diskcache (from outlines)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting pycountry (from outlines)
  Downloading pycountry

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import (
    BitsAndBytesConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig
)

import torch

# Put your hugging face token here: https://huggingface.co/docs/hub/en/security-tokens
# You need to fill the access form with your huggingface account on this link: https://huggingface.co/mistralai/Ministral-8B-Instruct-2410
llm_name = "mistralai/Ministral-8B-Instruct-2410"

# We want to use 4bit quantization to save memory
quantization_config = BitsAndBytesConfig(
    load_in_8bit=False, load_in_4bit=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(llm_name, padding_side="left", token=hf_token)
# Prevent some transformers specific issues.
tokenizer.use_default_system_prompt = False
tokenizer.pad_token_id = tokenizer.eos_token_id

# Load LLM.
llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    quantization_config=quantization_config,
    device_map={"": 0}, # load all the model layers on GPU 0
    #device_map="auto",
    torch_dtype=torch.bfloat16, # float precision
    token=hf_token
)
# Set LLM on eval mode.
llm.eval()

tokenizer_config.json:   0%|          | 0.00/181k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(131072, 4096)
    (layers): ModuleList(
      (0-35): 36 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=12288, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=12288, bias=False)
          (down_proj): Linear4bit(in_features=12288, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mis

In [40]:
# Set up our generation configuration.
# We set max_new_token to 128 to reduce computation time (we may also lose some accuracy).
# We disable beamsearch to ensure reproducibility (we may lose some accuracy).
generation_config = GenerationConfig(
  max_new_tokens = 128,
  do_sample=False,
  eos_token_id=tokenizer.eos_token_id,
  pad_token_id=tokenizer.pad_token_id,
)

# Base CSKG dataset

In [None]:
from pykeen.datasets import CSKG
import pandas as pd

# Load the CSKG dataset
dataset = CSKG()

# Convert to Pandas DataFrame
triples = dataset.training.mapped_triples.numpy()

# Get entity and relation mappings
entities = {i: entity for i, entity in dataset.training.entity_id_to_label.items()}
relations = {i: relation for i, relation in dataset.training.relation_id_to_label.items()}

# Convert triples to human-readable format
knowledge_texts = [
    f"{entities[head]} {relations[rel]} {entities[tail]}."
    for head, rel, tail in triples
]

# Show sample triples in text format
df = pd.DataFrame(knowledge_texts, columns=["Commonsense Statement"])

INFO:pykeen.utils:Using opt_einsum
INFO:pykeen.datasets.base:downloading data from https://zenodo.org/record/4331372/files/cskg.tsv.gz to /root/.data/pykeen/datasets/cskg/cskg.tsv.gz


Downloading cskg.tsv.gz: 0.00B [00:00, ?B/s]

INFO:pykeen.datasets.base:reordering columns: ['node1', 'relation', 'node2']
INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [2683381, 574841, 574842]


In [6]:
# Create a prompt with a CSKG fact
prompt = f"""Here is a commonsense fact:
{knowledge_texts[0]}

Question: What does this imply?
"""

def generate(prompt, llm=llm, generation_config=generation_config):

  # Create turns with the given prompt
  turns = [
    {'role':'user', 'content':prompt}
  ]

  # Tokenize turns.
  input_ids = tokenizer.apply_chat_template(turns, return_tensors='pt').to('cuda')

  # Ensure we don't use gradient to save memory space and computation time.
  with torch.no_grad():
    outputs = llm.generate(
      input_ids,
      generation_config
    )

  # Recover and decode answer.
  answer_tokens = outputs[0, input_ids.shape[1]:-1]
  return tokenizer.decode(answer_tokens).strip()

generate(prompt)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'This commonsense fact implies that the empty set, denoted by the symbol ∅, is defined as a set that contains no elements. In other words, it is a set with no members. This is a fundamental concept in set theory and mathematics.'

In [9]:
!pip install faiss-cpu

Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [36]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Step 1: Load knowledge from the existing DataFrame (df)
knowledge_texts = df["Commonsense Statement"].tolist()[0:100000]

# Step 2: Convert knowledge statements into embeddings
embedder = SentenceTransformer("all-MiniLM-L6-v2")
knowledge_embeddings = np.array(embedder.encode(knowledge_texts))

# Step 3: Build FAISS index for retrieval
index = faiss.IndexFlatL2(knowledge_embeddings.shape[1])  # L2 distance
index.add(knowledge_embeddings)

# Step 4: Define retrieval function
def retrieve_knowledge(query, top_k=3):
    """Retrieve top-k relevant knowledge statements from FAISS."""
    query_embedding = np.array(embedder.encode([query]))  # Encode query
    distances, indices = index.search(query_embedding, top_k)  # Retrieve top-k
    return [knowledge_texts[i] for i in indices[0]]  # Return retrieved knowledge

In [11]:
# Function to generate response with RAG
def generate_response_with_rag(user_query):
    # Step 1: Retrieve relevant knowledge
    retrieved_facts = retrieve_knowledge(user_query, top_k=100)
    knowledge_context = "\n".join(retrieved_facts)

    # Step 2: Construct prompt with retrieved knowledge
    prompt = f"""You are a highly knowledgeable assistant. Only use the following facts to answer the user's question accurately.

    Retrieved Knowledge:
    {knowledge_context}

    Question: {user_query}
    Answer:"""

    # Step 3: Tokenize and generate response
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = llm.generate(**inputs, max_new_tokens=100)
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    return response

# Example query
query = "What are the common things between cats and dogs ?"
response = generate_response_with_rag(query)
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a highly knowledgeable assistant. Only use the following facts to answer the user's question accurately.

    Retrieved Knowledge:
    /c/en/animal /r/RelatedTo /c/en/dogs_cats.
/c/en/animal /r/RelatedTo /c/en/dog_cat.
/c/en/animal /r/RelatedTo /c/en/cat_dog.
/c/en/alley_cat/n/wn/animal /r/IsA /c/en/domestic_cat/n/wn/animal.
/c/en/ailurophobic/a /r/RelatedTo /c/en/having_or_relating_to_fear_or_hatred_of_cats.
/c/en/ailurophobia/n /r/RelatedTo /c/en/irrational_fear_or_hatred_of_cats_or_other_felines.
/c/en/animal /r/RelatedTo /c/en/generic_dog.
/c/en/all_cats /r/IsA /c/en/carnivores.
/c/en/ailurophilia/n /r/RelatedTo /c/en/fondness_or_love_for_cats_or_other_felines.
/c/en/1_pet/n /r/RelatedTo /c/en/2_pet.
/c/en/animal /r/RelatedTo /c/en/dog_pig.
/c/en/2_pet/n /r/RelatedTo /c/en/1_pet.
/c/en/andean_cats/n /r/FormOf /c/en/andean_cat.
/c/en/animals /r/ReceivesAction /c/en/used_as_pets.
/c/en/ailuromancy/n /r/RelatedTo /c/en/cat.
/c/en/achate /r/EtymologicallyRelatedTo /c/en/cates.


# Pykeen embedded Graph

In [12]:
# Step 1: Reload the embeddings from CSV
entity_df = pd.read_csv("/content/drive/MyDrive/LLM_Project/CSKG_Entity_Embeddings.csv")
relation_df = pd.read_csv("/content/drive/MyDrive/LLM_Project/CSKG_Relation_Embeddings.csv")

# Reload embeddings as numpy arrays
entity_embeddings_np = entity_df.iloc[:, 1:].values  # Exclude label column
relation_embeddings_np = relation_df.iloc[:, 1:].values  # Exclude label column

# Step 2: Reconstruct Entity and Relation Mappings
entities_reloaded = {i: entity_df.iloc[i, 0] for i in range(len(entity_df))}
relations_reloaded = {i: relation_df.iloc[i, 0] for i in range(len(relation_df))}

# Step 3: Reload CSKG dataset to retrieve triples
from pykeen.datasets import CSKG

dataset = CSKG()
triples = dataset.training.mapped_triples.numpy()  # Get original triples

# Step 4: Convert triples back into human-readable format
knowledge_texts = [
    f"{entities_reloaded[head]} {relations_reloaded[rel]} {entities_reloaded[tail]}."
    for head, rel, tail in triples
]

# Step 5: Store in DataFrame and Display
df_reconstructed = pd.DataFrame(knowledge_texts, columns=["Commonsense Statement"])
df_reconstructed.to_csv("/content/drive/MyDrive/LLM_Project/reconstructed_knowledge_texts.csv")

INFO:pykeen.datasets.base:reordering columns: ['node1', 'relation', 'node2']
INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [2683381, 574841, 574842]


In [51]:
entity_df.shape

(2087833, 51)

In [54]:
relation_df.head()

Unnamed: 0,Relation,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,/r/Antonym,-0.003184,-0.009346,-0.00874,0.016277,-0.021665,-0.021074,0.011963,-0.006814,0.000386,...,-0.008491,-0.009129,0.002675,-0.019211,-0.009548,-0.00259,0.044895,-0.000716,1.358356,0.005905
1,/r/AtLocation,-0.112103,-0.076837,-0.007118,-0.029166,0.858887,0.142359,-0.074548,0.041391,-0.196697,...,-0.079229,-0.080921,0.025651,0.288468,0.181327,0.132924,0.113443,-0.085928,1.068158,0.120661
2,/r/CapableOf,-0.04799,0.111392,-0.005297,0.178622,-0.084771,0.15145,1.197167,0.089098,-0.065905,...,0.378562,0.231751,0.075898,-0.239711,0.099923,0.694459,-0.022317,-0.045899,0.845781,-0.198628
3,/r/Causes,-0.013659,-0.108627,-0.006306,0.157546,0.303824,0.107846,-0.197402,-0.428333,0.059407,...,0.01954,-0.065133,0.118104,-0.141516,0.188693,0.626302,-0.205033,-0.174766,1.041573,0.385045
4,/r/CausesDesire,0.353154,0.055363,-0.181231,-0.357021,-0.136167,0.132361,0.032325,0.296691,0.029619,...,-0.23023,0.170822,0.06008,0.092216,-0.047291,-0.280302,0.056468,0.010544,-0.167761,-0.664228


In [37]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Step 1: Load knowledge from the existing DataFrame (df)
reconstructed_knowledge_texts = df_reconstructed["Commonsense Statement"].tolist()[0:100000]

# Step 2: Convert knowledge statements into embeddings
embedder = SentenceTransformer("all-MiniLM-L6-v2")
reconstructed_knowledge_embeddings = np.array(embedder.encode(reconstructed_knowledge_texts))

# Step 3: Build FAISS index for retrieval
reconstructed_index = faiss.IndexFlatL2(reconstructed_knowledge_embeddings.shape[1])  # L2 distance
reconstructed_index.add(reconstructed_knowledge_embeddings)

# Step 4: Define retrieval function
def reconstructed_retrieve_knowledge(query, top_k=3):
    """Retrieve top-k relevant knowledge statements from FAISS."""
    query_embedding = np.array(embedder.encode([query]))  # Encode query
    distances, indices = reconstructed_index.search(query_embedding, top_k)  # Retrieve top-k
    return [reconstructed_knowledge_texts[i] for i in indices[0]]  # Return retrieved knowledge

In [14]:
# Function to generate response with RAG
def reconstructed_generate_response_with_rag(user_query):
    # Step 1: Retrieve relevant knowledge
    retrieved_facts = reconstructed_retrieve_knowledge(user_query, top_k=100)
    knowledge_context = "\n".join(retrieved_facts)

    # Step 2: Construct prompt with retrieved knowledge
    prompt = f"""You are a highly knowledgeable assistant. Only use the following facts to answer the user's question accurately.

    Retrieved Knowledge:
    {knowledge_context}

    Question: {user_query}
    Answer:"""

    # Step 3: Tokenize and generate response
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = llm.generate(**inputs, max_new_tokens=100)
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    return response

# Example query
query = "What can dogs do ?"
reconstructed_response = reconstructed_generate_response_with_rag(query)
print(reconstructed_response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a highly knowledgeable assistant. Only use the following facts to answer the user's question accurately.

    Retrieved Knowledge:
    /c/en/assistance_dogs/n /r/FormOf /c/en/assistance_dog.
/c/en/attack_dog/n/wn/animal /r/IsA /c/en/watchdog/n/wn/animal.
/c/en/animal /r/RelatedTo /c/en/generic_dog.
/c/en/aid_dogs/n /r/FormOf /c/en/aid_dog.
/c/en/attack_dogs/n /r/FormOf /c/en/attack_dog.
/c/en/all_dogs /r/HasProperty /c/en/very_very_cute.
/c/en/attack_dogs /r/FormOf /c/en/attack_dog.
/c/en/americans /r/NotCapableOf /c/en/eat_dogs.
/c/en/back_yard /r/UsedFor /c/en/keeping_dog.
/c/en/ali /r/CapableOf /c/en/deal_with_dog.
/c/en/antidog /r/DerivedFrom /c/en/dog.
/c/en/animal /r/RelatedTo /c/en/dogs_cats.
/c/en/arson_dog/n /r/IsA /c/en/detection_dog.
/c/en/animal /r/RelatedTo /c/en/cat_dog.
/c/en/australian_shepherd/n /r/IsA /c/en/dog/n.
/c/en/american_staffordshire_terrier/n/wn/animal /r/IsA /c/en/bullterrier/n/wn/animal.
/c/en/animals /r/ReceivesAction /c/en/used_as_pets.
/c/en/ani

# Compare answers

In [59]:
query = "What do you know about dogs ?"
reconstructed_response = reconstructed_generate_response_with_rag(query)
response = generate_response_with_rag(query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [60]:
import re

answer_match = re.search(r"Answer:(.*)", response, re.DOTALL)

# If found, extract the answer text
if answer_match:
    response_cut = answer_match.group(1).strip()
else:
    response_cut = "No answer found."

answer_match = re.search(r"Answer:(.*)", reconstructed_response, re.DOTALL)

# If found, extract the answer text
if answer_match:
    reconstructed_response_cut = answer_match.group(1).strip()
else:
    reconstructed_response_cut = "No answer found."
print("Query \n" + query + "\n")
print("Response \n"+ str(response_cut)+"\n")
print("With Pykeen Response \n" + str(reconstructed_response_cut))

Query 
What do you know about dogs ?

Response 
Dogs are a type of animal that are often kept as pets. They are known for their loyalty and companionship. There are many different breeds of dogs, each with their own unique characteristics and temperaments. Some popular breeds include the Australian Shepherd, Australian Cattle Dog, and American Staffordshire Terrier. Dogs are also known for their ability to be trained to perform various tasks, such as search and rescue, detection of drugs or explosives, and assistance to people with disabilities. Additionally, dogs are often

With Pykeen Response 
Dogs are a type of animal that are often kept as pets. They are known for their loyalty and companionship. There are many different breeds of dogs, each with their own unique characteristics and temperaments. Some popular breeds include the Australian Shepherd, Australian Cattle Dog, and American Staffordshire Terrier. Dogs are also known for their ability to be trained to perform various task