In [1]:
import pandas as pd
import uuid
import torch
import clip
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
from tqdm import tqdm

In [2]:
# Load the CLIP "ViT-B/32" checkpoint via clip.load, binding the two returned
# objects to `model` and `preprocess`.
# NOTE(review): the same checkpoint is loaded again in a later cell with an
# explicit `device=` argument, so this call looks redundant — confirm before removing.
model, preprocess = clip.load("ViT-B/32")

In [3]:
# Pull settings from a local .env file into the process environment
load_dotenv()

# Qdrant endpoint and API key; either is None when the variable is not set
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_KEY = os.environ.get("QDRANT_KEY")

# Client used by the ingestion cells below
client = QdrantClient(api_key=QDRANT_KEY, url=QDRANT_URL)

In [4]:
# Connectivity check: print the collections this client can see.
print(client.get_collections())

collections=[CollectionDescription(name='GNOSIS')]


In [5]:
# Location of the CSV to ingest. Set the GNOSIS_NEWS_CSV environment variable
# (for example in the .env file) to point at another copy of the dataset; when it
# is unset or empty the original local path is used.
NEWS_CSV_PATH = (
    os.getenv("GNOSIS_NEWS_CSV")
    or "D:/STUDY/PROJECTS/GNOSIS/Resources/big covid/NewsRealCOVID-19_5.csv"
)

df = pd.read_csv(NEWS_CSV_PATH)
# df.head()

# NOTE(review): `points` is not referenced by the ingestion cells visible in this
# notebook (they use `points_buffer`); kept in case another cell relies on it.
points = [] # store points to be uploaded in batch

In [6]:
# Run CLIP on the GPU when PyTorch reports one, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model, preprocess = clip.load("ViT-B/32", device=device)
# model.eval()

In [7]:
def chunk_text(text, max_words=200):
    """Regroup the whitespace-separated words of ``text`` into chunks.

    Args:
        text: String to split; any run of whitespace counts as one separator.
        max_words: Maximum number of words per chunk.

    Returns:
        A list of strings, each holding up to ``max_words`` words joined by a
        single space. The list is empty when ``text`` contains no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(" ".join(window))
    return pieces

In [8]:
# Thresholds used by the ingestion loop
BATCH_SIZE, UPLOAD_BATCH = 64, 1000

# Points that have been embedded but not yet upserted
points_buffer = list()

# Chunk texts and their metadata dicts waiting to be embedded (kept in step)
text_batch, meta_batch = [], []

In [9]:
for row_idx, row in tqdm(df.iterrows(), total=len(df)):

    # Read every column used below as a string; missing cells become ""
    title, newstitle, content, news_url, fact_check_url, article_type = (
        "" if pd.isna(row[column]) else str(row[column])
        for column in (
            "title",
            "newstitle",
            "content",
            "news_url",
            "fact_check_url",
            "type",
        )
    )

    # Rows without any article body are skipped entirely
    if not content.strip():
        continue

    doc_id = f"health_{row_idx}"

    # -------------------------
    # Document text: both titles followed by the article body
    # -------------------------
    full_text = f"""
    Title: {title}
    Alternate Title: {newstitle}

    Article:
    {content}
    """.strip()

    # -------------------------
    # Split into 200-word chunks and queue each one with its metadata
    # -------------------------
    chunks = chunk_text(full_text, max_words=200)

    text_batch.extend(chunks)
    meta_batch.extend(
        {
            "doc_id": doc_id,
            "chunk_id": chunk_id,
            "title": title,
            "newstitle": newstitle,
            "news_url": news_url,
            "fact_check_url": fact_check_url,
            "type": article_type,
            "label": "real",  # or derive if dataset has labels
        }
        for chunk_id in range(len(chunks))
    )

    # -------------------------
    # Nothing more to do for this row until enough chunks are queued
    # -------------------------
    if len(text_batch) < BATCH_SIZE:
        continue

    tokens = clip.tokenize(text_batch, truncate=True).to(device)

    with torch.no_grad():
        encoded = model.encode_text(tokens)
        vecs = encoded.cpu().numpy()

    # One point per queued chunk: random UUID, embedding, payload
    points_buffer.extend(
        {
            "id": str(uuid.uuid4()),
            "vector": embedding.tolist(),
            "payload": {
                "modality": "text",
                "dataset": "HealthLine_News",
                "chunk_text": chunk_body,
                **chunk_meta,
            },
        }
        for embedding, chunk_meta, chunk_body in zip(vecs, meta_batch, text_batch)
    )

    text_batch = []
    meta_batch = []

    # Send the accumulated points once the upload threshold is reached
    if len(points_buffer) >= UPLOAD_BATCH:
        client.upsert(collection_name="GNOSIS", points=points_buffer)
        points_buffer = []


100%|██████████| 1590/1590 [00:05<00:00, 289.23it/s]


In [10]:
# Flush leftovers
# Embed whatever chunks are still queued after the ingestion loop, then upload
# every point that has not been sent yet. The payload uses the same dataset tag
# ("HealthLine_News") and text key ("chunk_text") as the main loop, so all points
# from this CSV share one schema.
if text_batch:
    tokens = clip.tokenize(text_batch, truncate=True).to(device)
    with torch.no_grad():
        vecs = model.encode_text(tokens).cpu().numpy()

    for vec, meta, text_ in zip(vecs, meta_batch, text_batch):
        points_buffer.append({
            "id": str(uuid.uuid4()),
            "vector": vec.tolist(),
            "payload": {
                "modality": "text",
                "dataset": "HealthLine_News",
                "chunk_text": text_,
                **meta,
            },
        })

    # Clear the queues so re-running this cell does not embed the same chunks again
    text_batch = []
    meta_batch = []

if points_buffer:
    client.upsert(collection_name="GNOSIS", points=points_buffer)
    # Clear after a successful upload; each point gets a fresh random id, so a
    # second upsert of the same buffer contents would create duplicates
    points_buffer = []


In [12]:
def normalize_payload(payload: dict, score: float):
    """Map a stored payload onto a fixed result schema.

    The returned dict always has the keys ``label``, ``title``, ``text``,
    ``date``, ``url``, ``image_url``, ``video_url`` and ``score``, in that order.

    Args:
        payload: Payload of a search hit. Anything that is not a dict is treated
            as having no fields.
        score: Similarity score of the hit; converted with ``float()``.

    Returns:
        The normalized dict. A field that is missing or falsy in ``payload``
        becomes ``""``. ``text`` prefers ``chunk_text`` over ``text`` and ``url``
        prefers ``news_url`` over ``url``.
    """
    score_value = float(score)
    fields = payload if isinstance(payload, dict) else {}

    def first_truthy(*keys):
        # First truthy value among the given payload keys, else ""
        for key in keys:
            candidate = fields.get(key)
            if candidate:
                return candidate
        return ""

    return {
        "label": first_truthy("label"),
        "title": first_truthy("title"),
        "text": first_truthy("chunk_text", "text"),
        "date": first_truthy("date"),
        "url": first_truthy("news_url", "url"),
        "image_url": first_truthy("image_url"),
        "video_url": first_truthy("video_url"),
        "score": score_value,
    }


In [15]:
# Rebuild the Qdrant client for querying, this time with timeout=60
client = QdrantClient(timeout=60, api_key=QDRANT_KEY, url=QDRANT_URL)

# Init CLIP on the GPU when available, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

QUERY_TEXT = "can remdesivir treat people with covid-19"

# Embed the query text with the CLIP text encoder
tokens = clip.tokenize([QUERY_TEXT], truncate=True).to(device)

with torch.no_grad():
    query_matrix = model.encode_text(tokens).cpu().numpy()
vec = query_matrix[0]

# Fetch the 5 nearest points, payload included
result = client.query_points(
    collection_name="GNOSIS",
    query=vec.tolist(),
    limit=5,
    with_payload=True,
)
hits = result.points

# One bucket per recognised label
summary = {
    "fake": {"count": 0, "items": []},
    "real": {"count": 0, "items": []},
}

# Normalize each hit and file it under its label; other labels are ignored
for point in hits:
    normalized = normalize_payload(point.payload, point.score)
    bucket = summary.get((normalized["label"] or "").lower().strip())
    if bucket is None:
        continue
    bucket["items"].append(normalized)
    bucket["count"] += 1

# Majority vote between the two buckets; equal counts give "uncertain"
fake_count = summary["fake"]["count"]
real_count = summary["real"]["count"]
if fake_count == real_count:
    final_verdict = "uncertain"   # tie case
elif fake_count > real_count:
    final_verdict = "fake"
else:
    final_verdict = "real"

# Final output: the verdict followed by the count and items of each bucket
final_output = {"final_verdict": final_verdict}
for bucket_name in ("fake", "real"):
    final_output[bucket_name] = {
        "count": summary[bucket_name]["count"],
        "items": summary[bucket_name]["items"],
    }


# -------------------------------
# Print final result
# -------------------------------

# print("\n" + "="*80)
# print("QUERY:", QUERY_TEXT)
# print("="*80)

# print("\nFINAL VERDICT:", final_output["final_verdict"].upper())

# print("\nFAKE COUNT:", final_output["fake"]["count"])
# for i, item in enumerate(final_output["fake"]["items"], 1):
#     print(f"\n[FAKE EXAMPLE {i}]")
#     print("Title:", item["title"])
#     print("URL:", item["url"])
#     print("Text:", item["text"][:300])
#     print("Score:", item["score"])


# print("\nREAL COUNT:", final_output["real"]["count"])
# for i, item in enumerate(final_output["real"]["items"], 1):
#     print(f"\n[REAL EXAMPLE {i}]")
#     print("Title:", item["title"])
#     print("URL:", item["url"])
#     print("Text:", item["text"][:300])
#     print("Score:", item["score"])


# If you want the raw dict:
# print("\n\nFINAL OUTPUT DICT:\n")
# print(final_output)

In [16]:
import json

# Pretty-print the verdict dict built in the previous cell. ensure_ascii=False
# keeps non-ASCII characters (e.g. curly quotes in titles) readable instead of
# emitting \uXXXX escapes.
print(json.dumps(final_output, indent=2, ensure_ascii=False))

{
  "final_verdict": "real",
  "fake": {
    "count": 1,
    "items": [
      {
        "label": "fake",
        "title": "US Launches Groundbreaking Cuban-Developed Cancer Treatment",
        "text": "offer the clinical trial for the treatment, according to the statement. ...",
        "date": "",
        "url": "",
        "image_url": "",
        "video_url": "",
        "score": 0.8383697
      }
    ]
  },
  "real": {
    "count": 4,
    "items": [
      {
        "label": "real",
        "title": "Here’s What You Should Do If You Have a ‘Mild’ Case of COVID-19",
        "text": "Title: Here’s What You Should Do If You Have a ‘Mild’ Case of COVID-19 Alternate Title: \"Here’s What You Should Do If You Have a ‘Mild’ Case of COVID-19\" Article: experts say treating yourself at home for a mild case of covid-19 is similar to how you would treat yourself if you have the flu. getty images common symptoms for mild cases of covid-19 include sore throat coughing and fever. there s no shortc