In [1]:
import json
import pandas as pd


Load the file

In [2]:
# Input snapshot (one JSON record per line) and the cap on how many
# matching papers are collected from it.
file_path = 'arxiv-metadata-oai-snapshot.json'
limit = 10_000  # stop after 10,000 matching papers


Targeting specific categories

In [3]:
# Category tags a paper must carry (at least one of them) to be kept.
target_categories = [
    'cs.AI',
    'cs.LG',
    'cs.CL',
    'cs.CV',
]


Combining title and abstract

Checking the target category

In [4]:
# Stream the JSON-lines snapshot and keep only the papers tagged with at least
# one target category, stopping as soon as `limit` papers have been collected.
wanted_categories = set(target_categories)  # set gives O(1) membership tests
data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # letting json.loads raise on them.
            continue

        paper = json.loads(line)
        # `categories` is split on whitespace below; `or ''` also covers a
        # record whose value is an explicit null, which .get()'s default
        # argument would not catch.
        categories = paper.get('categories') or ''

        # Skip papers that share no tag with the target categories.
        if wanted_categories.isdisjoint(categories.split()):
            continue
        data.append(paper)

        # The list only grows on append, so this is the only place the
        # limit can be reached.
        if len(data) >= limit:
            break


Building DataFrame

In [5]:
# One row per collected paper; columns come from the keys of the JSON records.
df = pd.DataFrame(data=data)


In [6]:
# Build the text to embed: "<title>. <abstract>" for every paper.
# A missing title or abstract propagates as a missing `content` value.
title_col = df['title']
abstract_col = df['abstract']
df['content'] = title_col + '. ' + abstract_col


Drop rows with missing or empty content

In [7]:
# Keep only rows whose `content` is present and not just whitespace.
content_present = df['content'].notna()
content_nonblank = df['content'].str.strip() != ''
df = df[content_present & content_nonblank]


Final document list

In [8]:
# Plain Python list of the combined title/abstract strings, in DataFrame order.
documents = df['content'].to_list()

# Report the corpus size and preview the first document.
print(f"Loaded {len(documents)} documents")
print(documents[0])


Loaded 10000 documents
Intelligent location of simultaneously active acoustic emission sources:
  Part I.   The intelligent acoustic emission locator is described in Part I, while Part
II discusses blind source separation, time delay estimation and location of two
simultaneously active continuous acoustic emission sources.
  The location of acoustic emission on complicated aircraft frame structures is
a difficult problem of non-destructive testing. This article describes an
intelligent acoustic emission source locator. The intelligent locator comprises
a sensor antenna and a general regression neural network, which solves the
location problem based on learning from examples. Locator performance was
tested on different test specimens. Tests have shown that the accuracy of
location depends on sound velocity and attenuation in the specimen, the
dimensions of the tested area, and the properties of stored data. The location
accuracy achieved by the intelligent locator is comparable to that 

Training

In [9]:
from sentence_transformers import InputExample
import random


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Accumulator for the labelled text pairs used for fine-tuning;
# later cells append to it.
train_examples = list()


In [None]:
#Building positive Examples
for doc in documents:
    train_examples.append(InputExample(texts=[doc, doc], label=1.0))  # Positive pair


In [12]:
# Building negative examples: pairs of different documents labelled 0.0.
# A dedicated, seeded generator makes the sampled pairs reproducible across
# kernel restarts without touching the global `random` state.
negative_rng = random.Random(42)
num_negative_draws = len(documents) // 2  # half as many draws as documents

for _ in range(num_negative_draws):
    # sample() picks two distinct positions, so a document is never paired
    # with itself and no draw is wasted on that collision.
    doc1, doc2 = negative_rng.sample(documents, 2)
    if doc1 != doc2:  # still skip distinct rows that carry identical text
        train_examples.append(InputExample(texts=[doc1, doc2], label=0.0))  # Negative pair


Fine-tune the model

In [13]:
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader


In [14]:
# Load a base pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [15]:
# Prepare DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)


In [16]:
# Define Loss
train_loss = losses.CosineSimilarityLoss(model)


In [17]:
# Fine-tune
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    output_path='./fine_tuned_model'
)




Step,Training Loss
500,0.0044


Building FAISS

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle


In [None]:
# Reload the fine-tuned model from the directory written by the training step.
fine_tuned_model_dir = './fine_tuned_model'
model = SentenceTransformer(fine_tuned_model_dir)


Embeddings

In [20]:
# Encode every document with the fine-tuned model, on CPU, 32 texts per batch,
# showing a progress bar while it runs.
embeddings = model.encode(
    documents,
    batch_size=32,
    show_progress_bar=True,
    device='cpu',
)


Batches: 100%|██████████| 313/313 [05:20<00:00,  1.02s/it]


In [None]:
# Convert embeddings to numpy array
embeddings = np.array(embeddings)


In [None]:
# Creating FAISS index
# The index is sized from the second axis of the embedding matrix (one row per
# document, one column per embedding dimension) and then filled with every
# document vector, so row i of the index corresponds to documents[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # flat index using L2 distance, per its name
index.add(embeddings)


In [None]:
# Persist the FAISS index and the document list it was built from, so both
# can be reloaded together later.
faiss.write_index(index, "faiss_index.bin")

with open('documents.pkl', 'wb') as docs_file:
    pickle.dump(documents, docs_file)

print("FAISS database built and saved!")


FAISS database built and saved!


LangChain connection

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.in_memory import InMemoryDocstore
import pickle
import faiss


In [25]:
# Load the document list saved next to the FAISS index.
# pickle.load can execute arbitrary code: only open files this notebook wrote.
with open('documents.pkl', 'rb') as docs_file:
    documents = pickle.load(docs_file)


In [None]:
# Read the saved FAISS index back from disk.
index_path = 'faiss_index.bin'
index = faiss.read_index(index_path)


In [27]:
# Load embedding model (fine-tuned one)
# Wraps the model saved under ./fine_tuned_model in LangChain's
# HuggingFaceEmbeddings class.
# NOTE(review): this rebinds `embeddings`, which earlier cells use for the
# encoded document matrix; from here on the name refers to this wrapper.
embeddings = HuggingFaceEmbeddings(model_name="./fine_tuned_model")


  embeddings = HuggingFaceEmbeddings(model_name="./fine_tuned_model")


In [None]:
# Create LangChain FAISS store
from langchain.docstore.document import Document

# LangChain's FAISS wrapper resolves a hit in two steps: FAISS row position ->
# docstore id (by subscripting `index_to_docstore_id`, so it must be a dict,
# not a callable), then docstore id -> entry in the docstore (which must be a
# `Document`, not a bare string). Row i of the index was built from
# documents[i], so position i maps to id str(i).
docstore_ids = [str(position) for position in range(len(documents))]

vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(
        {
            doc_id: Document(page_content=text)
            for doc_id, text in zip(docstore_ids, documents)
        }
    ),
    index_to_docstore_id=dict(enumerate(docstore_ids)),
)
