In [None]:
# ======================================
#   FARM-BOT URL → PDF → CLEAN → EMBED → QDRANT PIPELINE (STRICT INPUT ONLY)
# ======================================

!pip install sentence-transformers qdrant-client pymupdf tqdm

import requests
import uuid
import fitz
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models

# ---------------------------
# CONFIG
# ---------------------------
# Qdrant connection settings. Both values are placeholders that must be
# replaced before running; note that QDRANT_URL is passed as the client's
# `url`, so it needs the address of the Qdrant instance, not a key.
QDRANT_API_KEY="your key here"
QDRANT_URL="your key here"
# Name of the collection that receives the embedded chunks.
COLLECTION = "farmbot_knowledge"
# Vector size used when the collection is created; presumably the output
# dimension of the embedding model loaded below (verify if the model changes).
EMBEDDING_DIM = 384

# ---------------------------
# USER INPUT (NO DEFAULTS!)
# ---------------------------
print("Paste your PDF links below (one per line).")
print("Nothing will run unless you paste links.\n")

raw_input_links = input("Links:\n").strip()

# input() returns a single line of text, so a multi-line paste may reach us
# with its line breaks turned into spaces. Splitting on any whitespace accepts
# newline-, space- and tab-separated links alike; URLs never contain
# whitespace, so this is a superset of the previous newline-only split.
# An empty answer simply produces an empty list.
links = raw_input_links.split()

if not links:
    print("\n‚ùå No links provided. Nothing to process.")

print(f"\nüìå Total URLs to process: {len(links)}")
for link in links:
    print(" ‚Üí", link)

# Stop the cell here so the (slow) model load below never runs without input.
if not links:
    raise SystemExit("‚ö†Ô∏è Exiting. No URLs to process.")


# ---------------------------
# MODEL & QDRANT CLIENT
# ---------------------------
# Sentence-embedding model (forced onto the CPU) and the Qdrant client shared
# by the rest of this cell.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Create the collection only when it is genuinely missing. The previous bare
# `except:` treated *any* failure (bad API key, unreachable host, even a
# KeyboardInterrupt) as "collection not found" and then tried to create it;
# an explicit existence check lets real connection errors surface as-is.
if not qdrant.collection_exists(COLLECTION):
    qdrant.create_collection(
        collection_name=COLLECTION,
        vectors_config=models.VectorParams(size=EMBEDDING_DIM, distance=models.Distance.COSINE),
    )
    print("Created Qdrant collection:", COLLECTION)


# ---------------------------
# HELPERS
# ---------------------------

def download_url(url):
    """Download the PDF at *url* into /content and return the saved path.

    Args:
        url: Address of the file to fetch.

    Returns:
        Path of the written file (a random UUID name ending in ``.pdf``), or
        ``None`` when the request fails, the server answers with an HTTP
        error status, or the file cannot be written. Failures are reported
        on stdout rather than raised.
    """
    try:
        r = requests.get(url, timeout=30)
        # Without this check a 404/500 error page would be written to disk as
        # a ".pdf" and only fail later, during text extraction.
        r.raise_for_status()
        path = f"/content/{uuid.uuid4()}.pdf"
        with open(path, "wb") as f:
            f.write(r.content)
        return path
    except Exception as e:
        # Deliberate best-effort boundary: one bad URL must not stop the run.
        print(f"[ERROR] Download failed for {url}: {e}")
        return None


def pdf_to_text(path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages concatenated in page order, or ``""`` when the
        file cannot be opened or read (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse; previously the handle was never closed.
        with fitz.open(path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"[ERROR] Text extraction failed for {path}: {e}")
        return ""


def clean_text(text: str):
    """Normalise text pulled out of a PDF.

    The substitutions run in order: whitespace runs are collapsed to a single
    space, "Page X of Y" markers are dropped, and text matched by the
    copyright pattern is dropped. The result is stripped of leading and
    trailing whitespace.
    """
    substitutions = (
        (r'\s+', ' '),
        (r'Page \d+ of \d+', ''),
        (r'¬©.*?\d+', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def chunk_text(text, max_words=800):
    """Lazily split *text* into consecutive pieces of at most *max_words* words.

    Words are whitespace-delimited; each yielded chunk is its words re-joined
    with single spaces. The last chunk may be shorter than *max_words*.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_words)
    yield from (" ".join(tokens[s:s + max_words]) for s in starts)


# ---------------------------
# MAIN PIPELINE
# ---------------------------
# Running totals reported by the summary at the end of the cell.
total_chunks = 0
failed_urls = []

for url in tqdm(links, desc="Processing URLs"):

    print(f"\n=== üåê Processing URL: {url} ===")

    # 1. Download
    pdf_path = download_url(url)
    if not pdf_path:
        failed_urls.append(url)
        continue

    # 2. Extract + clean + chunk
    raw = pdf_to_text(pdf_path)
    cleaned = clean_text(raw)
    chunks = list(chunk_text(cleaned))

    # Checking the chunk list (rather than only the raw text) also covers
    # documents whose text is non-empty but cleans down to nothing; those
    # would otherwise be encoded and sent to Qdrant as an empty upsert.
    if not chunks:
        print("‚ö†Ô∏è No text extracted. Skipping.")
        failed_urls.append(url)
        continue

    print(f"‚Üí Extracted {len(chunks)} chunks.")

    # 3. Batch embed
    embeddings = model.encode(chunks, batch_size=16, convert_to_numpy=True)

    # 4. Batch upload: one point per chunk, tagged with its source URL and
    #    its position within the document.
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=vec.tolist(),
            payload={
                "source": url,
                "chunk_id": i,
                "text": chunk,
            },
        )
        for i, (vec, chunk) in enumerate(zip(embeddings, chunks))
    ]

    qdrant.upsert(collection_name=COLLECTION, points=points)
    print(f"‚úî Uploaded {len(points)} chunks ‚Üí {COLLECTION}")

    total_chunks += len(points)


# ---------------------------
# SUMMARY
# ---------------------------
# Final report: overall counters first, then the individual failures (if any).
summary_lines = [
    "\nüéâ DONE ‚Äî URL Pipeline Complete!",
    f"üß© Total chunks uploaded: {total_chunks}",
    f"üåê Total URLs processed: {len(links)}",
    f"‚ùå Failed URLs: {len(failed_urls)}",
]
print("\n".join(summary_lines))

if failed_urls:
    print("\nFailed list:")
    print("\n".join(f" - {bad_url}" for bad_url in failed_urls))


In [None]:
from qdrant_client import QdrantClient

# Sanity check: connect to the Qdrant instance (placeholders must be replaced
# with the real URL and API key) and show what it currently holds.
qdrant = QdrantClient(url="your key here", api_key="your key here")

# Every collection known to the instance.
collections_overview = qdrant.get_collections()
print(collections_overview)

# Number of points stored in the knowledge collection.
knowledge_count = qdrant.count(collection_name="farmbot_knowledge")
print(knowledge_count)


In [None]:
# ======================================
#   FARM-BOT ZIP → PDF → CLEAN → EMBED → QDRANT PIPELINE
# ======================================

!pip install sentence-transformers qdrant-client pymupdf tqdm

import zipfile
import os
import re
import uuid
import fitz
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models

# ---------------------------
# CONFIG
# ---------------------------
# Archive of PDFs to ingest and the directory it is unpacked into.
ZIP_PATH = "/content/drive/MyDrive/Colab Notebooks/publication.zip"
EXTRACT_DIR = "/content/publications_extracted"

# Qdrant connection settings. Both values are placeholders that must be
# replaced before running; QDRANT_URL is passed as the client's `url`, so it
# needs the address of the Qdrant instance, not a key.
QDRANT_API_KEY="Your key here"
QDRANT_URL="Your key here"
# Name of the collection that receives the embedded chunks.
COLLECTION = "farmbot_knowledge"
# Vector size used when the collection is created; presumably the output
# dimension of the embedding model loaded below (verify if the model changes).
EMBEDDING_DIM = 384

# ---------------------------
# LOAD MODEL & QDRANT
# ---------------------------
# Sentence-embedding model (forced onto the CPU) and the Qdrant client shared
# by the rest of this cell.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Create the collection only when it is genuinely missing. The previous bare
# `except:` treated *any* failure (bad API key, unreachable host, even a
# KeyboardInterrupt) as "collection not found" and then tried to create it;
# an explicit existence check lets real connection errors surface as-is.
if not qdrant.collection_exists(COLLECTION):
    qdrant.create_collection(
        collection_name=COLLECTION,
        vectors_config=models.VectorParams(size=EMBEDDING_DIM, distance=models.Distance.COSINE),
    )
    print("Created Qdrant collection:", COLLECTION)

# ---------------------------
# HELPERS
# ---------------------------

def clean_text(text: str):
    """Tidy up text extracted from a PDF.

    Collapses whitespace runs to single spaces, removes "Page X of Y"
    markers and text matched by the copyright pattern, then strips the ends.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    without_pages = re.sub(r'Page \d+ of \d+', '', collapsed)
    without_copyright = re.sub(r'¬©.*?\d+', '', without_pages)
    return without_copyright.strip()


def chunk_text(text, max_words=800):
    """Generate successive chunks of *text*, each holding up to *max_words* words.

    The text is split on whitespace and every chunk is re-joined with single
    spaces; only the final chunk can contain fewer than *max_words* words.
    """
    tokens = text.split()
    total = len(tokens)
    for begin in range(0, total, max_words):
        end = begin + max_words
        yield " ".join(tokens[begin:end])


def pdf_to_text(path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages concatenated in page order, or ``""`` when the
        file cannot be opened or read (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse; previously the handle was never closed.
        with fitz.open(path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"[ERROR] Could not extract: {path} ‚Äì {e}")
        return ""


# ---------------------------
# 1) EXTRACT ZIP
# ---------------------------
print(f"üì¶ Extracting zip: {ZIP_PATH}")

# Unpack the whole archive; any failure is reported and then re-raised so the
# cell stops instead of continuing with a missing or partial extract.
try:
    with zipfile.ZipFile(ZIP_PATH, mode='r') as archive:
        archive.extractall(path=EXTRACT_DIR)
except Exception as err:
    print(f"‚ùå ZIP extraction failed: {err}")
    raise
else:
    print("‚úÖ ZIP extracted successfully!")

# ---------------------------
# 2) SCAN PDFs
# ---------------------------
# Walk the extracted tree and collect the path of every file whose name ends
# in ".pdf" (case-insensitive), in os.walk order.
pdf_files = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(EXTRACT_DIR)
    for filename in filenames
    if filename.lower().endswith(".pdf")
]

print(f"üîé Found {len(pdf_files)} PDF files.\n")

# ---------------------------
# MAIN PIPELINE
# ---------------------------
# Running totals reported by the summary at the end of the cell.
total_chunks = 0
failed_pdfs = []

for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):

    print(f"\n=== üìÑ Processing PDF: {pdf_path} ===")

    # Extract + clean + chunk
    raw = pdf_to_text(pdf_path)
    cleaned = clean_text(raw)
    chunks = list(chunk_text(cleaned))

    # Checking the chunk list (rather than only the raw text) also covers
    # documents whose text is non-empty but cleans down to nothing; those
    # would otherwise be encoded and sent to Qdrant as an empty upsert.
    if not chunks:
        print("‚ö†Ô∏è No text extracted. Skipping.")
        failed_pdfs.append(pdf_path)
        continue

    print(f"‚Üí Extracted {len(chunks)} text chunks.")

    # Batch embed
    embeddings = model.encode(chunks, batch_size=16, convert_to_numpy=True)

    # Batch upload: one point per chunk, tagged with its source file and its
    # position within the document.
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=vec.tolist(),
            payload={
                "source": pdf_path,
                "chunk_id": i,
                "text": chunk,
            },
        )
        for i, (vec, chunk) in enumerate(zip(embeddings, chunks))
    ]

    qdrant.upsert(collection_name=COLLECTION, points=points)
    print(f"‚úî Uploaded {len(points)} chunks to Qdrant.")

    total_chunks += len(points)

# Final report: overall counters first, then the individual failures (if any).
report_lines = [
    "\nüéâ ALL DONE!",
    f"üìö Total PDFs processed: {len(pdf_files)}",
    f"üß© Total chunks uploaded: {total_chunks}",
    f"‚ùå Failed PDFs: {len(failed_pdfs)}",
]
print("\n".join(report_lines))

if failed_pdfs:
    print("Failed list:")
    print("\n".join(f" - {bad_pdf}" for bad_pdf in failed_pdfs))
