In [None]:
!pip install pdfkit --quiet
!apt-get install -y wkhtmltopdf --quiet
!pip install PyPDF2 --quiet
!pip install tqdm --quiet
!pip install llama-index llama-parse --quiet
!pip install llama-index-vector-stores-pinecone --quiet
!pip install PyMuPDF --quiet
!pip install requests beautifulsoup4 pdfkit  --quiet
!pip install llama-index-agent-openai --quiet
!pip install llama-index-llms-openai --quiet
!pip install llama-index --quiet

# Setup

In [None]:
import os
import requests
import io
import pdfkit
import fitz
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
import random
import nest_asyncio
from llama_parse import LlamaParse
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import VectorStoreIndex, StorageContext, Document, SummaryIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter
from llama_index.llms.openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Credentials. Replace each placeholder with a real key before running, and
# never commit real keys with the notebook.
os.environ["OPENAI_API_KEY"] = "insert openai api key"

os.environ["PINECONE_API_KEY"] = "insert pinecone api key"

# Key for LlamaParse; the parser cell further down reads `llama_parse_key`.
os.environ["LLAMA_CLOUD_API_KEY"] = "insert llama cloud api key"
llama_parse_key = os.environ["LLAMA_CLOUD_API_KEY"]

# NOTE: `api_key` is the *Pinecone* key, not the OpenAI one.
api_key = os.environ["PINECONE_API_KEY"]

# Pinecone client. Later cells refer to it as `pc` when creating and opening
# the index, so it has to exist before they run.
pc = Pinecone(api_key=api_key)

# Loading

In [None]:
# Page titles to scrape. Each entry is the path segment that is appended to the
# wiki base URL; some entries are already percent-encoded (e.g. "%3D").
url_parts = [
    "8", "Aiah", "Anagolay_(Born_To_Win)", "BINI", "BINI/Gallery",
    "BINI:_The_Launch", "BINI_Wiki", "BLOOM_(Fandom)", "B_HU_U_R",
    "Born_To_Win", "Born_To_Win_(Album)", "Born_To_Win_(song)",
    "Born_To_Win_%3D_Bahasa_Indonesia_Version", "Born_To_Win_%3D_Japanese_Version",
    "Born_To_Win_%3D_Spanish_Version", "Born_To_Win_%3D_Thai_Version",
    "Born_To_Win_Maxi", "Colet", "Da_Coconut_Nut", "Feel_Good_(Album)",
    "G22", "Golden_Arrow", "Gwen", "Here_With_You", "Huwag_Muna_Tayong_Umuwi",
    "I_Feel_Good", "Jhoanna", "Kapit_Lang", "Karera", "Kinikilig",
    "Lagi", "Lian_Kyla", "List_of_BINI_Performances", "Main_Page", "Maloi",
    "Maloi/Gallery", "Mayari_(Born_To_Win)", "Mikha", "Na_Na_Na", "No_Fear",
    "One_Dream:_The_BINI-BGYO_Journey", "Pantropiko", "SAB", "Sheena",
    "Sodop_(Born_To_Win)", "Stacey", "Star_Hunt_Academy", "Star_Hunt_Trainee_TV",
    "Strings", "Strings_-_Dance_ver.", "Zero_World_(Born_To_Win)"
]

def generate_urls(base_url, parts):
    """Build one full URL per entry of ``parts``.

    Args:
        base_url: Prefix placed in front of every part.
        parts: Iterable of path segments to append to ``base_url``.

    Returns:
        A list with ``base_url`` followed by each part, in the order of ``parts``.
    """
    full_urls = []
    for suffix in parts:
        # str.format applies the same formatting to both values as an f-string would.
        full_urls.append("{}{}".format(base_url, suffix))
    return full_urls

# Common prefix shared by every page that is scraped
base_url = "https://bini.fandom.com/wiki/"

# Full URLs, one per entry of url_parts, in the same order
urls = generate_urls(base_url, url_parts)

In [None]:
# Create the directory for the per-page text files (no error if it already exists)
os.makedirs("text_data", exist_ok=True)

# Function to extract text from a URL using BeautifulSoup
def extract_text_from_url(url, timeout=30):
    """Download a page and return the text of its main content area.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error. Without it a stalled connection would block the
            notebook indefinitely.

    Returns:
        The text of the first ``<div class="mw-parser-output">`` element, with
        text fragments separated by newlines and surrounding whitespace
        stripped. An empty string is returned when the server responds with an
        error status or when the page has no such element.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # An error page must not be stored as if it were article text.
        print(f"Skipping {url}: HTTP {response.status_code}")
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main content area
    content_div = soup.find('div', {'class': 'mw-parser-output'})
    if content_div is None:
        return ""
    return content_div.get_text(separator='\n', strip=True)

# Stage 1: download every page and store its text as text_data/page_<n>.txt
# (n is the 1-based position of the URL in `urls`).
text_files = []
scrape_progress = tqdm(urls, desc="Scraping URLs for text")
for i, url in enumerate(scrape_progress):
    page_text = extract_text_from_url(url)
    text_filename = f"text_data/page_{i+1}.txt"
    with open(text_filename, 'w', encoding='utf-8') as out_file:
        out_file.write(page_text)
    text_files.append(text_filename)

# Stage 2: render each text file as data/page_<n>.pdf, keeping the numbering.
os.makedirs("data", exist_ok=True)
pdf_files = []
convert_progress = tqdm(text_files, desc="Converting text to PDFs")
for i, text_file in enumerate(convert_progress):
    pdf_filename = f"data/page_{i+1}.pdf"
    pdfkit.from_file(text_file, pdf_filename)
    pdf_files.append(pdf_filename)

# Loading with LlamaParse

In [None]:
# Apply nest_asyncio to allow nested event loops (useful in Jupyter/Colab notebooks)
nest_asyncio.apply()

# Build the LlamaParse client that the document-processing cell uses as `parser`.
# NOTE(review): `llama_parse_key` must already be defined when this cell runs
# (e.g. in the credentials cell); it is not assigned in this cell.
parser = LlamaParse(
    api_key=llama_parse_key,  # LlamaParse API key
    result_type="text",  # "markdown" and "text" are available
    verbose=False,  # False keeps the parser's logging quiet
    language="en",  # Optionally you can define a language, default=en
)

In [None]:
# Directory holding the PDFs to parse. It is kept relative so that it points at
# the same "data" folder the conversion step writes to, regardless of where the
# notebook's working directory is (the previous absolute path only matched on Colab).
data_directory = "data"

# Collect the PDF paths in sorted order; os.listdir() returns entries in
# arbitrary order, which would make runs non-reproducible.
document_paths = sorted(
    os.path.join(data_directory, file)
    for file in os.listdir(data_directory)
    if file.endswith(".pdf")
)

# Parsed documents from all PDFs, accumulated in one flat list
processed_documents = []

# Parse the PDFs one at a time so a single progress bar tracks the whole run
for document_path in tqdm(document_paths, desc="Processing documents"):
    document = parser.load_data([document_path])
    processed_documents.extend(document)

In [None]:
# Display the total number of documents
total_documents = len(processed_documents)
print(f"Total number of documents: {total_documents}")

# Show one randomly chosen document as a sanity check. The guard avoids the
# ValueError that picking from an empty range raises when nothing was parsed.
if total_documents:
    random_index = random.randrange(total_documents)
    print(f"Randomly selected document {random_index + 1}:\n")
    display(processed_documents[random_index])
else:
    print("No documents were parsed, so there is nothing to preview.")

# Preparing Storage

In [None]:
# Node parser created with its default settings
node_parser = SimpleNodeParser()

# Split the parsed documents into nodes; these are what gets indexed later
nodes = node_parser.get_nodes_from_documents(processed_documents)

In [None]:
# Report how many nodes the parser produced
print(f"Number of nodes parsed: {len(nodes)}")

# Show at most the first three nodes so their structure can be inspected
preview_count = min(3, len(nodes))  # Adjust the limit if needed
for i in range(preview_count):
    print(f"\nNode {i} details:")
    display(nodes[i])

In [None]:
# Function to create an index if it doesn't exist
def create_index_if_not_exists(index_name, dimension, metric, spec, client=None):
    """Create a Pinecone index unless one with the same name already exists.

    Args:
        index_name: Name of the index to look up and, if missing, create.
        dimension: Vector dimension passed to ``create_index``.
        metric: Similarity metric passed to ``create_index``.
        spec: Deployment spec passed to ``create_index``.
        client: Pinecone client to use. Defaults to the notebook-level ``pc``.

    Prints whether the index was created or already existed.
    """
    if client is None:
        client = pc

    # list_indexes() yields index descriptions, not plain strings, so the
    # membership test has to run against the list of names.
    existing_names = client.list_indexes().names()
    if index_name in existing_names:
        print(f"Index '{index_name}' already exists.")
        return

    client.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=spec
    )
    print(f"Index '{index_name}' created successfully.")

# Create the index used by this notebook if it is not there yet.
# NOTE(review): Pinecone index names are documented as lowercase alphanumerics
# and hyphens only; "bini_rag" contains an underscore and may be rejected.
# Confirm, and if it is renamed, update the cell that opens the index too.
create_index_if_not_exists(
    index_name="bini_rag",
    dimension=1536,
    metric="dotproduct",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [None]:
# Open a handle to the "bini_rag" index through the Pinecone client `pc`
pinecone_index = pc.Index("bini_rag")

In [None]:
# Wrap the Pinecone index as a LlamaIndex vector store.
# add_sparse_vector=True presumably stores sparse vectors alongside the dense
# ones, which the "hybrid" query mode used later relies on — TODO confirm.
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    add_sparse_vector=True,
)

# Indexing

In [None]:
# Name of the OpenAI embedding model used to embed the nodes.
# NOTE(review): the Pinecone index is created with dimension=1536; confirm this
# model's output size matches.
embedding_model = "text-embedding-3-small"  # Replace with the actual model name

# Authenticate with the OpenAI key. The notebook-level `api_key` variable holds
# the Pinecone key, which OpenAI would reject.
embed_model = OpenAIEmbedding(api_key=os.environ["OPENAI_API_KEY"], model=embedding_model)

# Storage context that routes vector reads/writes to the Pinecone-backed vector store
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build the index from the parsed nodes: they are embedded with `embed_model`
# and stored through `storage_context` (i.e. in the Pinecone vector store).
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embed_model,
)

In [None]:
# Re-create the index object on top of the vectors already stored in Pinecone.
# embed_model is passed explicitly so that queries are embedded with the same
# model as the stored nodes; without it LlamaIndex falls back to its default
# embedding model, and vectors from different models are not comparable.
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)

# Querying

In [None]:
# Set up the query engine, asking the vector store for "hybrid" query mode
query_engine = index.as_query_engine(vector_store_query_mode="hybrid")

In [None]:
# Example query: run one question through the query engine and print the answer
response = query_engine.query("What is Colet's full name?")
print(response)