Install packages

In [1]:
!pip install -q sentence-transformers faiss-cpu pandas

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[?25h

Imports & config

In [2]:
import pandas as pd
import numpy as np
import faiss
import pickle

from sentence_transformers import SentenceTransformer

# --- Input data ---------------------------------------------------------------
# Metadata CSV to index (absolute path on the Colab filesystem).
CSV_PATH = "/content/all_sources_metadata_2020-03-13.csv"
# Upper bound on the number of articles sampled into the index.
N_DOCS = 3_000

# --- Embedding model ----------------------------------------------------------
# Model identifier handed to SentenceTransformer; the model is loaded right here
# so every later cell can reuse the same `embed_model` instance.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Load CSV & filter rows

In [3]:
# Load the metadata CSV, keep only usable rows, then sample at most N_DOCS of them.
df = pd.read_csv(CSV_PATH)

print("Columns:", df.columns.tolist())
print("Total rows:", len(df))

# Keep rows with a non-empty abstract (drops NaN first, then whitespace-only text).
df = df.dropna(subset=["abstract"])
df = df[df["abstract"].astype(str).str.strip() != ""]
# Track which filters really ran so the summary line below cannot overstate them.
applied_filters = ["abstract"]

# Optional: keep only rows that claim to have full text.
# Comparing the upper-cased string form accepts boolean True as well as "True"/"TRUE".
if "has_full_text" in df.columns:
    df = df[df["has_full_text"].astype(str).str.upper() == "TRUE"]
    applied_filters.append("full_text")

# Optional: keep only CC-BY-family licenses (you can skip this if you want).
# The separator is optional so "cc-by", "CC BY" and "cc_by" all match.
# NOTE(review): a hyphen-only match left very few rows in an earlier run; confirm
# the spellings actually present with df["license"].value_counts().
if "license" in df.columns:
    license_text = df["license"].astype(str)
    df = df[license_text.str.contains(r"cc[-_\s]?by", case=False, na=False)]
    applied_filters.append("cc-by license")

print("Rows with " + " + ".join(applied_filters) + ":", len(df))

# Sample a manageable subset; the fixed random_state keeps the sample reproducible.
if len(df) > N_DOCS:
    df_small = df.sample(n=N_DOCS, random_state=42)
else:
    df_small = df

print("Using", len(df_small), "articles for the index")

Columns: ['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract', 'publish_time', 'authors', 'journal', 'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text']
Total rows: 29500
Rows with abstract + full_text + cc-by license: 104
Using 104 articles for the index


Build Docs

In [4]:
def _clean_field(value):
    """Return `value` as stripped text, or "" when the value is missing.

    Missing cells arrive as NaN, and ``str(NaN)`` is the literal text "nan".
    Without this step that placeholder would be stored in the metadata and,
    being a non-empty string, would also defeat any ``or "default"`` fallback.
    A stray "nan" string (any letter case) is treated as missing as well.
    """
    if value is None or pd.isna(value):
        return ""
    text = str(value).strip()
    return "" if text.lower() == "nan" else text


# One record per article: the text to embed plus the metadata shown with results.
docs = []

for _, row in df_small.iterrows():
    title = _clean_field(row.get("title", ""))
    abstract = _clean_field(row.get("abstract", ""))
    doi = _clean_field(row.get("doi", ""))
    journal = _clean_field(row.get("journal", ""))
    publish_time = _clean_field(row.get("publish_time", ""))
    source_x = _clean_field(row.get("source_x", ""))

    # Concatenate title + abstract as the text we'll embed, skipping empty parts.
    text_parts = [part for part in (title, abstract) if part]

    full_text = ". ".join(text_parts).strip()
    if not full_text:
        # Nothing to embed for this row.
        continue

    docs.append(
        {
            # Position in `docs`; assigned before the append, so ids are 0..len-1.
            "id": len(docs),
            "text": full_text,
            "title": title,
            "doi": doi,
            "journal": journal,
            "publish_time": publish_time,
            # Falls back to "CORD-19" when the source column is missing or empty.
            "source": source_x or "CORD-19",
        }
    )

len(docs)

104

Embed & build FAISS index

In [5]:
# Embed every document's text and load the vectors into an inner-product index.
texts = [d["text"] for d in docs]

# Fail early with a readable message; with no texts the steps below would
# otherwise break on the shape of an empty embedding array.
if not texts:
    raise ValueError("No documents to embed: `docs` is empty.")

print("Embedding", len(texts), "documents...")
embeddings = embed_model.encode(
    texts,
    convert_to_numpy=True,
    batch_size=64,
    show_progress_bar=True,
)

# Cast to float32 before handing the vectors to FAISS, then L2-normalise them
# in place: on unit-length vectors the inner product equals cosine similarity.
embeddings = embeddings.astype("float32")
faiss.normalize_L2(embeddings)

dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Vectors were added in `docs` order, so the counts must agree for index
# positions to be usable as positions in `docs`.
assert index.ntotal == len(docs), "index size does not match number of docs"

print("Index size:", index.ntotal)

Embedding 104 documents...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Index size: 104


Save the FAISS index and document records

In [6]:
# Output file names, relative to the current working directory.
INDEX_PATH = "faiss_index.bin"
DOCS_PATH = "docs.pkl"

# Write the vector index to disk.
faiss.write_index(index, INDEX_PATH)

# Pickle the list of document dicts next to the index.
with open(DOCS_PATH, "wb") as docs_file:
    pickle.dump(docs, docs_file)

print("Saved:", INDEX_PATH, DOCS_PATH)

Saved: faiss_index.bin docs.pkl


In [7]:
# Send both saved artifacts to the browser as downloads (requires Google Colab).
from google.colab import files

for artifact_path in (INDEX_PATH, DOCS_PATH):
    files.download(artifact_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>