# Note: This notebook was used for experimentation and for creating the FAISS index and the article vectors. To run it properly you need an OpenAI API key, which you can add in the 4th cell of this notebook.

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus used by clean_text
nltk.download('stopwords')

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the all-mpnet-base-v2 checkpoint: tokenizer first, then the encoder moved onto `device`
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)

# Filled on first use so the stop-word corpus is read once, not once per article.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Normalise raw text before it is tokenised.

    The steps run in this order: strip ``<...>`` tags, lowercase, drop every
    character that is neither a word character nor whitespace, drop digit
    runs, remove English stop words, and trim surrounding whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with remaining words joined by single spaces.
    """
    # Remove HTML tags (non-greedy, so each tag is matched on its own)
    text = re.sub(r'<.*?>', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    # NOTE(review): punctuation is stripped before stop-word filtering, so
    # contractions lose their apostrophe first — confirm this is intended.
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stop words using the cached set instead of rebuilding it per call
    stop_words = _english_stop_words()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Strip extra whitespace
    return text.strip()

def encode_articles(articles, batch_size=32, normalize=False):
    """Encode a list of texts into one embedding vector per text.

    Each text is passed through ``clean_text``, tokenised (padded to the
    longest item in its batch, truncated to 512 tokens) and run through the
    module-level ``model``. The sentence embedding is the mean of the last
    hidden states over the real tokens only: padding positions are excluded
    via the attention mask, so a text's vector does not depend on which other
    texts share its batch.

    Vectors produced by this function differ from ones produced with an
    unmasked mean; any index built from older vectors must be rebuilt.

    Args:
        articles: Sequence of strings to encode.
        batch_size: Number of texts sent through the model at once.
        normalize: When True, L2-normalise each vector so that an
            inner-product search ranks by cosine similarity. Defaults to
            False, which returns the raw pooled vectors.

    Returns:
        A NumPy array with one row per input text. For empty input the
        result has zero rows and ``model.config.hidden_size`` columns.
    """
    model.eval()

    # np.concatenate raises on an empty list, so answer empty input directly.
    if len(articles) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_vectors = []

    # Integrate tqdm progress bar
    for i in tqdm(range(0, len(articles), batch_size), desc="Encoding articles"):
        batch = [clean_text(article) for article in articles[i:i+batch_size]]
        with torch.no_grad():
            # Process batch and ensure it is on the same device as the model
            encoded_batch = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)
            outputs = model(**encoded_batch)
            token_states = outputs.last_hidden_state
            # Zero out padding positions and divide by the count of real tokens
            mask = encoded_batch['attention_mask'].unsqueeze(-1).to(token_states.dtype)
            vectors = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            if normalize:
                vectors = torch.nn.functional.normalize(vectors, p=2, dim=1)
            all_vectors.append(vectors.cpu().numpy())

    return np.concatenate(all_vectors, axis=0)

# Example usage: read the article table from disk
wiki_simple = pd.read_csv('wiki_simple_text.csv')

# One list entry per row of the 'text' column, in row order
articles = wiki_simple['text'].to_list()

# Embed every article (slow: the recorded run took roughly half an hour)
article_vectors = encode_articles(articles, batch_size=32)



[nltk_data] Downloading package stopwords to /home/ali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Encoding articles: 100%|██████████| 6417/6417 [31:38<00:00,  3.38it/s]


In [4]:
# Re-materialise the embeddings as a float32 ndarray and report its shape
article_vectors = np.asarray(article_vectors).astype(np.float32)
print(article_vectors.shape)

(205328, 768)


In [18]:
import faiss
def build_faiss_index(vectors):
    """Build a flat inner-product FAISS index holding ``vectors``.

    No ``train()`` call is made; the vectors are simply added to a
    ``faiss.IndexFlatIP``. Scores returned by the index are raw inner
    products, so they equal cosine similarity only if the vectors were
    L2-normalised beforehand.

    Args:
        vectors: 2-D array-like of shape (n_vectors, dimension).

    Returns:
        The populated ``faiss.IndexFlatIP``.

    Raises:
        ValueError: If ``vectors`` is not 2-dimensional.
    """
    # FAISS expects C-contiguous float32 input; coerce instead of failing deep inside FAISS.
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"vectors must be 2-D (n_vectors, dimension), got shape {vectors.shape}"
        )
    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(vectors)
    return index

# Index every article embedding so the corpus can be searched by similarity
faiss_index = build_faiss_index(vectors=article_vectors)


In [19]:
from openai import OpenAI

def process_query_with_chatgpt(query, openai_api_key, model="gpt-3.5-turbo"):
    """Ask a chat model to extract entities from ``query`` and summarise them.

    The system prompt requests a JSON object with the keys "summary",
    "entities" and "relation". The reply is not parsed here; the raw
    response object is returned.

    Args:
        query: The user's question, sent as the user message.
        openai_api_key: API key used to construct the OpenAI client.
        model: Chat model name; defaults to the previously hard-coded value.

    Returns:
        The chat completion response, or None if the request raised an
        exception (the error is printed).
    """
    client = OpenAI(api_key=openai_api_key)

    try:
        response = client.chat.completions.create(
            model=model,
            # Prompt fixes: "sentece" typo and the missing closing quote on "relation".
            messages=[{"role": "system", "content": '''You are a helpful assistant. You are given a query. 
                       Find the important entities in the query and write a 5 sentence summary definition of those entities. 
                       Afterward explain the query relation to the entities.
                       The output should be in a json format with the following keys: "summary", "entities", "relation"'''},
                      {"role": "user", "content": query}],
            max_tokens=300,
            temperature=0.7,
        )
        return response
    except Exception as e:
        # Best-effort: report the failure and let the caller handle None
        print(f"An error occurred: {type(e).__name__}, {str(e)}")
        return None

import os

# Example usage
# Prefer the OPENAI_API_KEY environment variable so the real key never has to be
# saved in the notebook; the placeholder is only a fallback to replace locally.
api_key = os.environ.get("OPENAI_API_KEY", "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
query = "Who is Joseph Biden?"
processed_query = process_query_with_chatgpt(query, api_key)

if processed_query:
    print(processed_query.choices[0].message)
else:
    print("Failed to process query.")



ChatCompletionMessage(content='{\n  "summary": "Joseph Biden is a politician and the 46th President of the United States.",\n  "entities": [\n    "Joseph Biden"\n  ],\n  "relation": "is"\n}', role='assistant', function_call=None, tool_calls=None)


In [21]:
import json

# process_query_with_chatgpt returns None when the API call fails; stop with a
# clear message instead of an AttributeError on `.choices`.
if processed_query is None:
    raise RuntimeError("No LLM response to parse: process_query_with_chatgpt returned None.")

# The message content is expected to hold the JSON document as a string
json_string = processed_query.choices[0].message.content

# Parse the JSON string into a Python dictionary
llm_output = json.loads(json_string)

# Accessing the different parts of the JSON
summary = llm_output['summary']
entities = llm_output['entities']
relation = llm_output['relation']
# Formatting (rather than `+`) also works if the model returns a non-string field
faiss_query = f"{relation} {summary}"


In [22]:
def search_with_faiss(query, index, articles, top_k=5):
    """Return the articles whose vectors score highest against ``query``.

    The query is embedded with ``encode_articles`` and searched in ``index``.
    ``articles`` must be in the same order in which their vectors were added
    to the index, because hits are mapped back by position.

    Args:
        query: Query text to embed.
        index: FAISS index to search.
        articles: Sequence of original articles, aligned with the index.
        top_k: Maximum number of articles to return.

    Returns:
        A list of up to ``top_k`` articles, best match first. Fewer are
        returned when the index holds fewer than ``top_k`` vectors.
    """
    # encode_articles already returns a (1, dim) matrix, the shape search() expects
    query_vectors = np.ascontiguousarray(encode_articles([query]), dtype=np.float32)
    _, indices = index.search(query_vectors, top_k)
    # FAISS pads missing results with -1; skip those so that articles[-1]
    # (the last article) is never returned by accident.
    return [articles[i] for i in indices[0] if i >= 0]

# Example usage
# `articles` must be in the same order in which its vectors were added to the FAISS index.
cleaned_faiss_query = clean_text(faiss_query)
similar_articles = search_with_faiss(cleaned_faiss_query, faiss_index, articles, top_k=5)
print(similar_articles)

Encoding articles: 100%|██████████| 1/1 [00:00<00:00, 41.06it/s]

['Joseph Robinette Biden Jr. (; born ) is an American politician and the 46th and current president of the United States since 2021. Biden was also the 47th vice president from 2009 through 2017 during the Barack Obama presidency. He is a member of the Democratic Party and is from Wilmington, Delaware. Before becoming vice president, Biden was a U.S. Senator from Delaware from 1973 to 2009. He had served in the Senate longer than any other President or Vice President.\n\nHe tried to become the Democratic candidate for president in 1988 and 2008 but did not win. During the 2008 election, then-Senator Barack Obama picked him to be his running mate. He is a Roman Catholic. Biden has received several awards. He has five honorary doctorates, including one from his alma mater and one from where he has taught law. He has also earned the "Best of Congress Award", an award from the Pakistani government, and the Presidential Medal of Freedom with distinction.\n\nAfter finishing his second term a




In [23]:
# Entity names come from the LLM, so treat them as literal text: escape regex
# metacharacters and ignore blank entries. A bare string is treated as one entity.
entity_names = [entities] if isinstance(entities, str) else entities
entity_patterns = [re.escape(str(name)) for name in entity_names if str(name).strip()]

if entity_patterns:
    regex_pattern = '|'.join(entity_patterns)
    entity_mask = wiki_simple['text'].str.contains(regex_pattern, case=False, na=False)
    # Keep only the first matching article
    articles_key_word = wiki_simple[entity_mask][:1]['text'].tolist()
else:
    # An empty pattern would match every row, so return no keyword article instead
    regex_pattern = ''
    articles_key_word = []
print(articles_key_word)

["Barack Obama, then a junior United States Senator from Illinois, announced his candidacy for the presidency of the United States in Springfield, Illinois, on February 10, 2007. On August 27, 2008, he was declared nominee of the Democratic Party for the 2008 presidential election. He was the first African American in history to be nominated on a major party ticket.\nOn August 23, 2008, Barack Obama's campaign announced that Senator Joe Biden of Delaware would be the Vice Presidential nominee.\n\nOn November 4, 2008, Obama won the election, making him the President-elect and the first African American elected President of the United States. He is the third sitting Senator, after Warren G. Harding and John F. Kennedy, to be elected President.\n\nHis constitutional election to the office was completed with the meeting of the Electoral College on December 15, 2008, and the subsequent certification of the college's vote by the Joint Session of the United States Congress on January 8, 2009.

In [24]:
def process_Faiss_output_with_chatgpt(query,list_pages, openai_api_key):
    """Ask the chat model to answer ``query`` using the retrieved pages.

    ``list_pages`` is interpolated into the system prompt, which asks for a
    JSON reply with the keys "answer" and "index" (the position in the list
    where the answer was found). The reply is not parsed here.

    Args:
        query: The user's question, sent as the user message.
        list_pages: The candidate pages shown to the model.
        openai_api_key: API key used to construct the OpenAI client.

    Returns:
        The chat completion response, or None if the request raised an
        exception (the error is printed).
    """
    openai_client = OpenAI(api_key=openai_api_key)

    try:
        system_message = {"role": "system", "content": f'''You are a helpful assistant. You are given a query. 
                       Please try your best to find the answer to the query in the following list:{list_pages}.
                        Also return the index in the list that you found the answer in. The output needs to be in json format with the following keys: "answer", "index"'''}
        user_message = {"role": "user", "content": query}
        return openai_client.chat.completions.create(
            model="gpt-3.5-turbo-16k",  # Verify this is the correct model
            messages=[system_message, user_message],
            max_tokens=300,
            temperature=0.7,
        )
    except Exception as err:
        # Report the failure type and message, then signal it with None
        print(f"An error occurred: {type(err).__name__}, {str(err)}")
        return None

# Example usage
#api_key = "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"  # Replace with your actual API key
#query = "What is the capital of France?"
knowledge_base = similar_articles #+ articles_key_word
processed_Faiss_output = process_Faiss_output_with_chatgpt(query,knowledge_base, api_key)

# Check the response of THIS call: testing `processed_query` (the earlier
# request) would let a failed call here crash on `None.choices`.
if processed_Faiss_output:
    print(processed_Faiss_output.choices[0].message.content)
else:
    print("Failed to process query.")

{
  "answer": "Joseph Robinette Biden Jr. is an American politician and the 46th and current president of the United States since 2021.",
  "index": 0
}


In [27]:
json_answer = processed_Faiss_output.choices[0].message.content
# Parse the reply once and read both fields from the same dictionary
parsed_answer = json.loads(json_answer)
answer = parsed_answer['answer']
index = parsed_answer['index']
print(f'Answer: {answer}')

# The index is model-generated: it may arrive as a string, be missing a usable
# value, or point outside knowledge_base. Validate it before indexing.
try:
    index = int(index)
except (TypeError, ValueError):
    index = None

if index is not None and 0 <= index < len(knowledge_base):
    print(f'Article: {knowledge_base[index]}')
else:
    print('Article: <model did not return a valid index into knowledge_base>')

Answer: Joseph Robinette Biden Jr. is an American politician and the 46th and current president of the United States since 2021.
Article: Joseph Robinette Biden Jr. (; born ) is an American politician and the 46th and current president of the United States since 2021. Biden was also the 47th vice president from 2009 through 2017 during the Barack Obama presidency. He is a member of the Democratic Party and is from Wilmington, Delaware. Before becoming vice president, Biden was a U.S. Senator from Delaware from 1973 to 2009. He had served in the Senate longer than any other President or Vice President.

He tried to become the Democratic candidate for president in 1988 and 2008 but did not win. During the 2008 election, then-Senator Barack Obama picked him to be his running mate. He is a Roman Catholic. Biden has received several awards. He has five honorary doctorates, including one from his alma mater and one from where he has taught law. He has also earned the "Best of Congress Award"

In [26]:
# Persist the embedding matrix as a .npy file
VECTORS_PATH = 'article_vectors_mpnet.npy'
np.save(VECTORS_PATH, article_vectors)
# Serialise the FAISS index to its own file
INDEX_PATH = 'faiss_index_mpnet.idx'
faiss.write_index(faiss_index, INDEX_PATH)

