# Installing Dependencies

In [None]:
!pip install requests beautifulsoup4 transformers sentence-transformers faiss-cpu pandas nltk chromadb
!pip install reportlab
!pip install langchain==0.0.187
!pip install unstructured
!pip install docx2txt
!pip install genai

In [None]:
import os
import chromadb
from langchain.document_loaders import UnstructuredURLLoader
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import VectorStoreIndex
from llama_index.llms.gemini import Gemini
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import SimpleDirectoryReader
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import os
import google.generativeai as genai
from google.colab import userdata
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


# Scraping Data from both sources

In [None]:
# Function to fetch content from a page
def fetch_page_content(url, visited, timeout=30):
    """Download ``url`` and return its text content as a single string.

    Args:
        url: Address of the page to download.
        visited: Mutable set of URLs already fetched. ``url`` is added to it
            after a successful download; URLs already present are skipped.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The space-joined text of the page's headings, paragraphs, list items,
        links and divs, or ``""`` if the URL was already visited or the
        request failed.
    """
    if url in visited:
        return ""

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return ""

    visited.add(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    text_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'a', 'div']
    # get_text() already includes the text of every descendant, so only the
    # outermost matching elements are kept. Collecting nested matches as well
    # (e.g. a <p> inside a <div>) would repeat the same text once per level.
    outermost = [
        element
        for element in soup.find_all(text_tags)
        if element.find_parent(text_tags) is None
    ]
    text_content = ' '.join(
        element.get_text(separator=' ', strip=True) for element in outermost
    )
    print(f"Extracted content from {url}")
    return text_content

# Function to get links for specific tabs from the sidebar
def get_specific_tab_links(base_url, tabs, timeout=30):
    """Collect same-site links on ``base_url`` whose link text is in ``tabs``.

    Args:
        base_url: Page whose navigation links are inspected.
        tabs: Collection of link texts to look for (exact match after
            stripping surrounding whitespace).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set of absolute URLs on the same host as ``base_url``. The set is
        empty if the page could not be fetched or nothing matched.
    """
    try:
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch the sidebar links from {base_url}: {e}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the sidebar or navigation menu
    sidebar = soup.find('div', {'id': 'sidebar'})  # Adjust if the sidebar structure is different
    if sidebar:
        # Search for anchors anywhere inside the sidebar. Iterating the tag
        # itself would only yield its direct children, which may be text
        # nodes or wrapper elements without an href.
        anchors = sidebar.find_all('a', href=True)
    else:
        print("Sidebar not found. Trying another approach to locate links.")
        anchors = soup.find_all('a', href=True)

    base_netloc = urlparse(base_url).netloc
    links = set()
    for a_tag in anchors:
        link_text = a_tag.get_text().strip()
        if link_text not in tabs:
            continue
        link = urljoin(base_url, a_tag['href'])
        # Ignore links that point to a different host
        if urlparse(link).netloc == base_netloc:
            links.add(link)
            print(f"Found link for {link_text}: {link}")
    return links

# Function to scrape content from each tab page
def scrape_specific_tabs(base_url, tabs):
    """Return the text of every tab page linked from ``base_url``.

    The links are discovered with ``get_specific_tab_links`` and each one is
    downloaded with ``fetch_page_content``; pages that yield no text are
    left out of the result.
    """
    seen_urls = set()
    page_texts = (
        fetch_page_content(tab_url, seen_urls)
        for tab_url in get_specific_tab_links(base_url, tabs)
    )
    return [page_text for page_text in page_texts if page_text]

# Main function to combine data from website
def main():
    """Scrape the selected lecture tabs and store their combined text.

    The texts returned by ``scrape_specific_tabs`` are joined with newlines
    and stored in the module-level variable ``website_texts``.
    """
    global website_texts

    # Link texts of the tabs to scrape
    lecture_tabs = [
        'Introduction', 'Capabilities', 'Harms I', 'Harms II', 'Data',
        'Security', 'Legality', 'Modeling', 'Training', 'Parallelism',
        'Scaling laws', 'Selective architectures', 'Adaptation', 'Environmental impact'
    ]

    # Page from which the tab links are discovered
    start_url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

    website_texts = "\n".join(scrape_specific_tabs(start_url, lecture_tabs))


# Entry point: run the scrape when executed as the main module
if __name__ == "__main__":
    main()


In [None]:
# Inspect the combined website text that main() stored in the global variable
website_texts

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_github_milestone_table(url, timeout=30):
    """Return the "Milestone Papers" table from a GitHub page as plain text.

    Args:
        url: Address of the GitHub page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first table following the ``<h2>`` whose text contains
        "Milestone Papers", rendered with ``DataFrame.to_string(index=False)``.
        Returns ``""`` if the page cannot be fetched, the heading or table is
        missing, or the table cannot be parsed.
    """
    from io import StringIO

    try:
        # Read the GitHub page
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch GitHub table: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the "Milestone Papers" section
    milestone_section = next(
        (h2 for h2 in soup.find_all('h2') if 'Milestone Papers' in h2.get_text()),
        None,
    )
    if milestone_section is None:
        print("No 'Milestone Papers' section found on the GitHub page.")
        return ""

    # Find the next table after the "Milestone Papers" section
    table = milestone_section.find_next('table')
    if table is None:
        print("No table found under the 'Milestone Papers' section.")
        return ""

    try:
        # Wrap the markup in a file-like object: passing literal HTML strings
        # to read_html is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(str(table)))[0]
    except (ValueError, ImportError) as e:
        # ValueError: no parsable table; ImportError: no HTML parser backend installed
        print(f"Failed to parse the 'Milestone Papers' table: {e}")
        return ""

    return df.to_string(index=False)


# Page to scrape; the function locates the table by the "Milestone Papers" heading text
github_url = 'https://github.com/Hannibal046/Awesome-LLM#milestone-papers'
# Plain-text rendering of the table, or "" if it could not be extracted
github_table_text = extract_github_milestone_table(github_url)

In [None]:
# Inspect the table text returned by extract_github_milestone_table()
github_table_text

# Preprocessing text

In [28]:
# Function to preprocess text
def preprocess_text(text):
    """Lowercase ``text`` and flatten it onto a single line.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased text with every run of line breaks replaced by one
        space.
    """
    # Lowercase the text
    text = text.lower()

    # Replace line breaks with a space rather than deleting them; otherwise
    # the last word of one line fuses with the first word of the next.
    text = re.sub(r'[\r\n]+', ' ', text)

    return text

# Preprocess the website texts and GitHub table text
preprocessed_website_texts = preprocess_text(website_texts)
preprocessed_github_table_text = preprocess_text(github_table_text)

# Combine the preprocessed texts into a single variable. A space separates the
# two sources so the last word of the website text does not fuse with the
# first word of the table; empty sources are skipped.
preprocessed_text = ' '.join(
    part
    for part in (preprocessed_website_texts, preprocessed_github_table_text)
    if part
)



In [None]:
# Inspect the combined, preprocessed text
preprocessed_text

In [None]:
# Check the type of the combined text (built by string concatenation above)
type(preprocessed_text)

# Working on question answering

In [None]:
from google.colab import userdata

# Confirm the secret is available without echoing its value: a bare
# `userdata.get(...)` expression would display the API key in the cell output,
# where it is saved together with the notebook.
print("GENAI_API_KEY configured:", bool(userdata.get('GENAI_API_KEY')))

In [None]:
def create_pdf(text, filename, folderpath):
    """Write ``text`` to ``<folderpath>/<filename>.pdf`` on letter-size pages.

    The text is split on newlines, each resulting line is word-wrapped to the
    printable width, and a new page is started whenever the bottom margin is
    reached. Errors are reported with a printed message instead of being
    raised.

    Args:
        text: The text to render.
        filename: Output file name without the ``.pdf`` extension.
        folderpath: Directory for the PDF; created if it does not exist.
    """
    from reportlab.lib.utils import simpleSplit

    font_name = "Helvetica"
    font_size = 12
    margin = 50       # Margin on every side of the page, in points
    line_height = 14  # Approximate line height

    try:
        # Define the path where the PDF file will be saved
        os.makedirs(folderpath, exist_ok=True)
        filepath = os.path.join(folderpath, f'{filename}.pdf')

        # Create a canvas and set up the font
        c = canvas.Canvas(filepath, pagesize=letter)
        c.setFont(font_name, font_size)

        width, height = letter
        max_line_width = width - 2 * margin
        y_offset = height - margin  # Starting y-coordinate

        for paragraph in text.split('\n'):
            # Wrap to the printable width; drawString does not wrap, so a long
            # line would otherwise run off the right edge of the page. An
            # empty wrap result is kept as one blank line.
            wrapped = simpleSplit(paragraph, font_name, font_size, max_line_width) or ['']
            for line in wrapped:
                # Start a new page once the next line would fall below the
                # bottom margin.
                if y_offset < margin:
                    c.showPage()
                    c.setFont(font_name, font_size)  # Font is reset on each new page
                    y_offset = height - margin

                c.drawString(margin, y_offset, line)
                y_offset -= line_height

        c.save()
        print(f"PDF created successfully: {filepath}")
    except Exception as e:
        print(f"Error creating PDF: {e}")

# HuggingFaceEmbedding is used below but is not among the imports at the top
# of the notebook.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Folder path where documents are located
folderpath = '/content/sample_data'

# Create a PDF from the preprocessed text
create_pdf(preprocessed_text, 'MyData', folderpath)

# Initialize genai with your API key
genai_api_key = userdata.get('GENAI_API_KEY')
genai.configure(api_key=genai_api_key)

# Choose a model from GenAI appropriate for your use case
model_name = 'gemini-1.5-flash'
model = genai.GenerativeModel(model_name)

# The query engine needs a LlamaIndex LLM object; the raw genai model above
# does not implement that interface, so wrap the same model with the
# LlamaIndex Gemini integration.
llm = Gemini(model=f"models/{model_name}", api_key=genai_api_key)

# Initialize ChromaDB client
chroma_client = chromadb.EphemeralClient()

# Define collection name
collectionname = 'quickstart'

# Check if collection exists and delete if it does
collection_exists = any(collection.name == collectionname for collection in chroma_client.list_collections())
if collection_exists:
    print(f"Collection {collectionname} already exists. Resetting.")
    chroma_client.delete_collection(name=collectionname)

# Create new collection
chroma_collection = chroma_client.create_collection(collectionname)

# Initialize HuggingFace embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

# Load documents from a directory
loader = SimpleDirectoryReader(input_dir=folderpath, required_exts=[".pdf"])
documents = loader.load_data()

# Set up vector store and storage context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build index from documents
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model
)

# Create query engine
query_engine = index.as_query_engine(llm=llm)

# Query loop: runs until the user types "exit"/"quit" or input is closed
while True:
    try:
        question = input("Enter your Question (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Without this, a closed input stream would raise on every iteration
        # and the loop would never end.
        print()
        break

    if not question:  # Check if the question is empty
        print("Error: Question cannot be empty.")
        continue
    if question.lower() in {'exit', 'quit'}:
        break

    try:
        response = query_engine.query(question)
        print(response)
    except Exception as e:
        # Keep the session alive if a single query fails
        print(f"Error processing question: {e}")
