In [None]:
import pandas as pd

In [None]:
# pdf = pd.read_csv(f"labelled_newscatcher_dataset.csv", sep=";")
# Load the labelled news dataset; the first CSV column becomes the row index.
# (Plain string literal: the previous f-string had no placeholders.)
pdf = pd.read_csv("labelled_newscatcher_coloured.csv", index_col=0)
# pdf

In [None]:

# Expose the row index as a regular "id" column, then render the frame.
pdf = pdf.assign(id=pdf.index)
display(pdf)


In [None]:

from sentence_transformers import InputExample


In [None]:

# Work on the first 1000 rows only (or every row if the frame is shorter).
pdf_subset = pdf.iloc[:1000]

In [None]:

def example_create_fn(doc1: str) -> InputExample:
    """
    Wrap a single piece of text in a sentence_transformers ``InputExample``.

    Only ``texts`` is populated (as a one-element list); no guid or label is
    passed to ``InputExample``.

    Args:
        doc1: The text to wrap. The caller in this notebook passes one row's
            "title" value (expected to be a string), not a whole Series.

    Returns:
        An ``InputExample`` constructed with ``texts=[doc1]``.
    """
    return InputExample(texts=[doc1])

In [None]:

# Wrap every title in an InputExample. Iterating the "title" column directly
# avoids building a Series per row via DataFrame.apply(axis=1), and still
# yields a plain list when the subset is empty.
faiss_train_examples = [example_create_fn(title) for title in pdf_subset["title"]]

In [None]:
# faiss_train_examples

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the "all-MiniLM-L6-v2" sentence encoder. No cache_folder is passed, so
# the library's default cache location applies (the
# cache_folder=DA.paths.datasets override is left disabled).
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# model.save("cache")

In [None]:
# Encode every title of the subset; the result holds one embedding per title.
faiss_title_embedding = model.encode(pdf_subset["title"].tolist())

In [None]:
# (number of encoded titles, embedding dimension)
faiss_title_embedding.shape


In [None]:

import numpy as np
import faiss

In [None]:
# Index the subset by "id" while keeping "id" available as a column too.
pdf_to_index = pdf_subset.set_index("id", drop=False)

In [None]:
# 1-D array of row IDs handed to FAISS alongside the embeddings.
# FAISS IDs are 64-bit integers; request int64 explicitly because the
# platform-dependent "int" dtype can be 32-bit (e.g. Windows with NumPy < 2).
id_index = pdf_to_index["id"].to_numpy(dtype=np.int64)

In [None]:
# Take a separate C-ordered copy so the raw embeddings stay untouched when the
# copy is modified later.
content_encoded_normalized = np.copy(faiss_title_embedding, order="C")

In [None]:
# L2-normalize each embedding row in place (the return value is not used), so
# that inner products between rows behave as cosine similarities.
faiss.normalize_L2(content_encoded_normalized)

In [None]:
# NOTE(review): exploratory listing of the faiss module's attributes; nothing
# later in the visible notebook uses this output — candidate for removal.
dir(faiss)

In [None]:
# index_flat = faiss.IndexFlatIP(len(faiss_title_embedding[0]))
# index_flat.add_with_ids(content_encoded_normalized, id_index)

In [None]:
# IndexIDMap wraps another index and translates search results to caller-supplied IDs:
# https://faiss.ai/cpp_api/file/IndexIDMap_8h.html#_CPPv4I0EN5faiss18IndexIDMapTemplateE
# The wrapped IndexFlatIP is the index that stores the vectors and scores them by inner product.
embedding_dim = faiss_title_embedding.shape[1]
index_content = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_dim))

In [None]:
# Add the normalized title embeddings to the index, keyed by the dataframe "id"
# values, so that search results can be looked up in pdf_to_index.
index_content.add_with_ids(content_encoded_normalized, id_index)

In [None]:
def search_content(query, pdf_to_index, k=3):
    """
    Return the rows of ``pdf_to_index`` whose indexed embeddings best match ``query``.

    The query is encoded with the notebook-level ``model``, L2-normalized, and
    searched against the notebook-level ``index_content`` FAISS index.

    Args:
        query: Text to search for.
        pdf_to_index: DataFrame indexed by the same IDs that were added to
            ``index_content``.
        k: Maximum number of results to return.

    Returns:
        A new DataFrame with the matching rows in the order returned by the
        index, plus a "similarities" column holding each row's score. Fewer
        than ``k`` rows are returned when the index holds fewer than ``k``
        vectors.
    """
    query_vector = model.encode([query])
    faiss.normalize_L2(query_vector)

    # search() returns (scores, ids), each with one row per query vector.
    scores, found_ids = index_content.search(query_vector, k)

    # FAISS pads missing results with ID -1 when it cannot find k neighbours;
    # drop those so the .loc lookup below does not raise KeyError.
    hits = [
        (doc_id, score)
        for doc_id, score in zip(found_ids[0].tolist(), scores[0].tolist())
        if doc_id != -1
    ]
    hit_ids = [doc_id for doc_id, _ in hits]
    hit_scores = [score for _, score in hits]

    # assign() builds a new frame instead of writing a column onto a .loc slice.
    return pdf_to_index.loc[hit_ids].assign(similarities=hit_scores)

In [None]:
# Render the top matches for the query "animal" as a rich table.
display(search_content(query="animal", pdf_to_index=pdf_to_index))

In [None]:
# Run the same title search for several example queries and print the matching
# titles. One loop replaces four copy-pasted cells, and iterating the "title"
# column directly avoids shadowing the builtin `id` and a .loc lookup per row.
for query in ("animal", "money", "videogame", "bombs"):
    result = search_content(query, pdf_to_index)
    # Header line so the output of each query stays distinguishable.
    print(f"--- {query} ---")
    for title in result["title"]:
        print(title)

In [None]:
# pdf.to_csv("labelled_newscatcher_coloured.csv")