after initial success with scrapping articles:

In [7]:
# --------------------- 0. Imports --------------------- #
import os
import uuid
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct

# --------------------- 1. Configuration --------------------- #
load_dotenv()

hf_token = os.getenv("hf_token")
qdrant_token = os.getenv("qdrant_token")

if not hf_token or not qdrant_token:
    raise ValueError("Missing tokens in .env file. Make sure 'hf_token' and 'qdrant_token' are set.")

EMBEDDING_MODEL = "intfloat/multilingual-e5-large"
COLLECTION_NAME = "news_articles"  # Reuse this consistently

# --------------------- 2. Clients --------------------- #
embedding_client = InferenceClient(api_key=hf_token)

vector_db_client = QdrantClient(
    url="https://cf58605f-3b88-494f-9b09-dcc67ca3478b.europe-west3-0.gcp.cloud.qdrant.io:6333",
    api_key=qdrant_token
)

# --------------------- 3. Qdrant Collection Setup --------------------- #
def recreate_collection():
    try:
        vector_db_client.delete_collection(collection_name=COLLECTION_NAME)
        vector_db_client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(
                size=1024,
                distance=Distance.COSINE
            )
        )
        print(f"[✅] Collection '{COLLECTION_NAME}' recreated.")
    except Exception as e:
        print(f"[❌] Failed to recreate collection: {e}")

# --------------------- 4. Article Embedding & Upload --------------------- #
def chunk_text(text, max_tokens=500):
    words = text.split()
    return [" ".join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]

def upload_articles(csv_path="tech_trends_news.csv"):
    df = pd.read_csv(csv_path)

    for _, row in tqdm(df.iterrows(), total=len(df), desc="🔄 Embedding Articles"):
        title = row.get("title", "")
        url = row.get("url", "")
        text = row.get("text", "")

        if not isinstance(text, str) or not text.strip():
            continue

        for i, chunk in enumerate(chunk_text(text)):
            try:
                embedding = embedding_client.feature_extraction(
                    chunk,
                    model=EMBEDDING_MODEL
                )

                if len(embedding) != 1024:
                    print(f"⚠️ Skipped '{title}' due to unexpected embedding size.")
                    continue

                point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{title}-{i}"))

                vector_db_client.upsert(
                    collection_name=COLLECTION_NAME,
                    points=[PointStruct(
                        id=point_id,
                        vector=embedding,
                        payload={
                            "title": title,
                            "url": url,
                            "chunk": chunk
                        }
                    )]
                )
            except Exception as e:
                print(f"[ERROR] Chunk {i} of '{title}' failed: {e}")

# --------------------- 5. Run End-to-End --------------------- #
if __name__ == "__main__":
    recreate_collection()
    upload_articles("tech_trends_news.csv")


[✅] Collection 'news_articles' recreated.


🔄 Embedding Articles: 100%|██████████| 59/59 [01:59<00:00,  2.02s/it]


In [8]:
def search_articles(query: str, top_k=5):
    embedding = embedding_client.feature_extraction(
        query,
        model=EMBEDDING_MODEL  # Use the same embedding model variable you defined earlier
    )

    results = vector_db_client.search(
        collection_name=COLLECTION_NAME,  # Use your global COLLECTION_NAME, e.g. "news_articles"
        query_vector=embedding,
        limit=top_k,
        with_payload=True  # This ensures you get the title, url, chunk in results
    )

    return results

# Example usage:
results = search_articles("How to prepare student for AI trends in the labour market")

for hit in results:
    print(f"🔎 {hit.payload.get('title', 'No title')}")
    print(f"📎 URL: {hit.payload.get('url', 'No URL')}")
    print(f"📝 Snippet: {hit.payload.get('chunk', '')[:300]}...\n")


  results = vector_db_client.search(


🔎 Preparing Employees for AI in the Workplace: A Guide to Seamless Integration
📎 URL: https://profiletree.com/ai-in-the-workplace/
📝 Snippet: Preparing Employees for AI in the Workplace: A Guide to Seamless Integration Updated on: Updated by: AI in the Workplace – As artificial intelligence (AI) becomes more integrated into business operations, preparing employees for an AI-enhanced workplace is imperative. AI is not just a future conside...

🔎 Preparing Employees for AI in the Workplace: A Guide to Seamless Integration
📎 URL: https://profiletree.com/ai-in-the-workplace/
📝 Snippet: allowing staff to focus on more complex and creative work. AI can also assist in data analysis, enabling employees to make informed decisions swiftly and accurately. We’ve found that when AI handles the more mundane aspects of work, it frees our team to concentrate on strategy and innovation. The be...

🔎 Artificial intelligence in education, AI tutors features, advantages and disadvantages
📎 URL: https://ww

In [None]:
# Example usage:
results = search_articles("How to prepare student for AI trends in the labour market")

for hit in results:
    print(f"🔎 {hit.payload.get('title', 'No title')}")
    print(f"📎 URL: {hit.payload.get('url', 'No URL')}")
    print(f"📝 Snippet: {hit.payload.get('chunk', '')[:300]}...\n")
