In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
pip install "vllm[cuda118]"

In [4]:
pip install pymupdf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
[0mSuccessfully installed pymupdf-1.25.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
from urllib.parse import urljoin

import fitz  # PyMuPDF (installed above as `pymupdf`); used by extract_text_pymupdf
import pandas as pd
import pdfplumber
def extract_text_pymupdf(pdf_path):
    """Extract the text of every page of a structured (non-scanned) PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        str: The text of all pages, in page order, joined with newlines.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction fails part-way through (the handle was previously leaked).
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

In [3]:
def extract_tables_pdf(pdf_path):
    """Extract every table in a PDF and convert each one into a DataFrame.

    Each page is scanned with pdfplumber's ``extract_tables()`` so that pages
    holding several tables contribute all of them; ``extract_table()`` only
    returns the largest table on a page. The first row of each table is used
    as the column headers and the remaining rows as data.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        list[pandas.DataFrame]: One DataFrame per table, in page order.
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for rows in page.extract_tables():
                if not rows:
                    # Nothing usable was detected for this table.
                    continue
                header, *body = rows  # First row as headers
                tables.append(pd.DataFrame(body, columns=header))
    return tables

In [4]:
def extract_images_pdf(pdf_path, save_folder="images"):
    """Render every page of a PDF to a PNG file and return the saved paths.

    Note that this rasterizes whole pages (at 150 DPI) rather than pulling
    out the individual images embedded in the PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.
        save_folder: Directory that receives the PNG files; created if missing.

    Returns:
        list[str]: Paths of the written files, named ``pdf_page_<n>.png`` with
        ``n`` starting at 1, in page order.
    """
    os.makedirs(save_folder, exist_ok=True)
    images = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Use pdfplumber's rendering for a rasterized image of the full page
            page_image = page.to_image(resolution=150)
            img_filename = os.path.join(save_folder, f"pdf_page_{page_number}.png")
            # `original` is an image object, not raw bytes, so it has to be
            # encoded through its own save(); writing it to a binary file
            # handle raised TypeError.
            page_image.original.save(img_filename, format="PNG")
            images.append(img_filename)
    return images

In [5]:
import requests
from bs4 import BeautifulSoup

def extract_text_webpage(url, timeout=30):
    """Extract the paragraph text of an article webpage.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the notebook indefinitely.

    Returns:
        str: The stripped text of every ``<p>`` element, in document order,
        joined with newlines.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is not mistaken for article content.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))


In [6]:
def extract_tables_webpage(url, timeout=30):
    """Extract the HTML tables of a webpage and convert them into DataFrames.

    The first non-empty row of each ``<table>`` supplies the column headers.
    Rows of uneven width (for example caused by ``colspan``) are tolerated:
    short rows are padded with ``None`` and, when a data row is wider than
    the header, extra ``column_<n>`` headers are generated so no cell is lost.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[pandas.DataFrame]: One DataFrame per table that has at least one
        non-empty row, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    tables = []
    for table in soup.find_all("table"):
        rows = [
            [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
            for row in table.find_all("tr")
        ]
        # A <tr> without any <td>/<th> carries no data and would otherwise
        # become an empty header row.
        rows = [cells for cells in rows if cells]
        if not rows:
            continue
        header, *body = rows  # First row as headers
        width = max(len(cells) for cells in rows)
        # pandas raises ValueError when a data row is wider than the header,
        # so bring every row (and the header) to a common width first.
        header = header + [f"column_{n + 1}" for n in range(len(header), width)]
        body = [cells + [None] * (width - len(cells)) for cells in body]
        tables.append(pd.DataFrame(body, columns=header))
    return tables

In [7]:
def extract_images_webpage(url, save_folder="images", timeout=30):
    """Download the images referenced by a webpage and save them locally.

    Every ``<img src=...>`` is resolved against ``url``, so relative and
    protocol-relative sources are downloaded too. Sources that do not resolve
    to an http(s) address (for example ``data:`` URIs) are skipped. Individual
    download failures are reported with ``print`` and do not abort the run.

    Args:
        url: Address of the page to download.
        save_folder: Directory that receives the files; created if missing.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        list[str]: Paths of the saved files. Files are named
        ``image_<n>.jpg`` where ``n`` is the 1-based position of the ``<img>``
        tag in the page; the ``.jpg`` suffix is used regardless of the actual
        image format.

    Raises:
        requests.HTTPError: If the page itself answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    os.makedirs(save_folder, exist_ok=True)
    images = []
    for i, img in enumerate(soup.find_all("img")):
        src = img.get("src")
        if not src:
            continue
        # Resolve "/path", "../path" and "//host/path" against the page URL.
        img_url = urljoin(url, src)
        if not img_url.startswith(("http://", "https://")):
            continue
        try:
            img_response = requests.get(img_url, timeout=timeout)
            # Do not store an HTML error page under an image file name.
            img_response.raise_for_status()
            img_filename = os.path.join(save_folder, f"image_{i+1}.jpg")
            with open(img_filename, "wb") as f:
                f.write(img_response.content)
            images.append(img_filename)
        except Exception as e:
            # Best effort: one broken image must not stop the remaining ones.
            print(f"Failed to download {img_url}: {e}")
    return images

In [None]:
def extract_text(source):
    """Return the text of a PDF file or of a webpage.

    The ``.pdf`` suffix is checked first, so any source ending in ``.pdf``
    (including a URL) is handed to the PDF extractor; otherwise a source
    starting with ``http`` is treated as a webpage.

    Raises:
        ValueError: If ``source`` is neither a ``.pdf`` path nor an http URL.
    """
    is_pdf = source.endswith(".pdf")
    if is_pdf:
        # PDF sources go through PyMuPDF.
        return extract_text_pymupdf(source)

    if not source.startswith("http"):
        raise ValueError("Unsupported file type. Provide a PDF or URL.")

    # Anything left is a web address.
    return extract_text_webpage(source)


In [None]:
def extract_tables(source):
    """Return the tables of a PDF file or of a webpage as DataFrames.

    The ``.pdf`` suffix is checked first, so any source ending in ``.pdf``
    (including a URL) is handed to the PDF extractor; otherwise a source
    starting with ``http`` is treated as a webpage.

    Raises:
        ValueError: If ``source`` is neither a ``.pdf`` path nor an http URL.
    """
    is_pdf = source.endswith(".pdf")
    if is_pdf:
        # PDF sources go through pdfplumber.
        return extract_tables_pdf(source)

    if not source.startswith("http"):
        raise ValueError("Unsupported file type. Provide a PDF or URL.")

    # Anything left is a web address.
    return extract_tables_webpage(source)

In [None]:
def extract_images(source, save_folder="images"):
    """Save the images of a PDF file or of a webpage and return their paths.

    The ``.pdf`` suffix is checked first, so any source ending in ``.pdf``
    (including a URL) is handed to the PDF extractor; otherwise a source
    starting with ``http`` is treated as a webpage. ``save_folder`` is passed
    through unchanged to the chosen extractor.

    Raises:
        ValueError: If ``source`` is neither a ``.pdf`` path nor an http URL.
    """
    is_pdf = source.endswith(".pdf")
    if is_pdf:
        # PDF sources are handled by the PDF image extractor.
        return extract_images_pdf(source, save_folder)

    if not source.startswith("http"):
        raise ValueError("Unsupported file type. Provide a PDF or URL.")

    # Anything left is a web address.
    return extract_images_webpage(source, save_folder)

In [None]:
from transformers import pipeline

def extract_all(sources, save_folder="images"):
    """Extract text, tables, and images from multiple PDFs and web pages.

    Images of each source are written to their own subfolder
    ``<save_folder>/source_<n>`` (``n`` is the 1-based position of the source).
    The extractors name files only by page/tag position, so a shared folder
    would let a later source overwrite the files of an earlier one.

    Args:
        sources: Iterable of PDF paths and/or http URLs. A single string is
            accepted and treated as one source.
        save_folder: Root directory for the extracted image files.

    Returns:
        tuple: ``(texts, tables, images, source_references)`` where ``texts``
        has one entry per source, ``tables`` and ``images`` are flattened
        across all sources, and ``source_references`` lists the sources in
        processing order.

    Raises:
        ValueError: Propagated from the extractors for an unsupported source.
    """
    if isinstance(sources, str):
        # Iterating a bare string would process it character by character.
        sources = [sources]

    combined_text = []
    combined_tables = []
    combined_images = []
    source_references = []  # Store source details

    for position, source in enumerate(sources, start=1):
        source_folder = os.path.join(save_folder, f"source_{position}")

        combined_text.append(extract_text(source))
        combined_tables.extend(extract_tables(source))
        combined_images.extend(extract_images(source, source_folder))
        source_references.append(source)

    return combined_text, combined_tables, combined_images, source_references

In [None]:
from transformers import pipeline

def generate_lesson_from_extracted_data(
    sources,
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device="cuda",
    max_new_tokens=1024,
):
    """Extract data from multiple sources and generate a lesson with an LLM.

    The extracted text, Markdown tables, source list and image placeholders
    are assembled into one prompt that is passed to a Hugging Face
    ``text-generation`` pipeline. The pipeline (and therefore the model) is
    created on every call.

    Args:
        sources: PDF paths and/or URLs, forwarded to ``extract_all``.
        model_name: Model identifier handed to ``pipeline``.
        device: Device handed to ``pipeline``.
        max_new_tokens: Upper bound on the number of generated tokens. This
            replaces the former ``max_length=2000``, which also counted the
            prompt tokens and so left little or no room for the lesson when
            the prompt was long.

    Returns:
        str: The generated lesson only; the prompt is not echoed back.
    """

    # Step 1: Extract content
    all_texts, all_tables, all_images, all_sources = extract_all(sources)

    # Step 2: Convert tables to Markdown
    tables_markdown = (
        "\n\n".join(df.to_markdown(index=False) for df in all_tables)
        if all_tables
        else "No tables extracted."
    )

    # Step 3: Prepare combined text (truncated if needed)
    # The cut is made on characters, not tokens.
    combined_text = "\n\n".join(all_texts)[:4000]  # Truncate to avoid token limits

    # Step 4: Prepare image markdown placeholders
    image_placeholders = (
        "\n".join(f"![image]({img})" for img in all_images)
        if all_images
        else "No images extracted."
    )

    # Step 5: Construct the prompt
    prompt = f"""
You are an educational content generator. Your task is to create a structured lesson using the extracted text, tables, and images provided below.

### Extracted Text:
{combined_text}

### Extracted Tables (Markdown Format):
{tables_markdown}

### Source Webpages & Documents:
{', '.join(all_sources)}

### Extracted Images:
Use the following image files at appropriate points in your lesson. Include each one using Markdown syntax like: `![image](imageX.png)`.

{image_placeholders}

---

### 📝 Instructions:
- Integrate the **Markdown tables** into the lesson where they add value.
- **Include each image** by referencing it at the most relevant point in the content (using the Markdown format).
- Write a clear, engaging lesson with sections, subheadings, and explanations.
- Briefly describe the images where included, to help learners understand their relevance.

Begin generating the lesson below:
"""

    # Step 6: Load the AI model (adjust model name/device via the parameters)
    generator = pipeline("text-generation", model=model_name, device=device)

    # Step 7: Generate response
    response = generator(
        prompt,
        max_new_tokens=max_new_tokens,  # budget for the output alone
        do_sample=True,
        temperature=0.7,
        return_full_text=False,  # return the lesson without the prompt prefix
    )

    return response[0]["generated_text"]


In [None]:
##### Smoke test: run a single prompt through the generative model with vLLM to gauge its capabilities

from vllm import LLM, SamplingParams

# Create the vLLM engine for the chat model used elsewhere in this notebook.
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")  # Make sure you have access to this model
# Sampling settings for this test run: temperature 0.7, output capped via max_tokens=500.
sampling_params = SamplingParams(temperature=0.7, max_tokens=500)

prompt = "Explain machine learning to a high school student."
# generate() is given a list of prompts; only this one prompt is passed.
outputs = llm.generate([prompt], sampling_params)

# Show the text of the first completion returned for the first (and only) prompt.
print(outputs[0].outputs[0].text)  # Print AI-generated response