In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.0/27.0 MB 79.0 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


In [3]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import faiss
import numpy as np

In [4]:
# Restore the tokenizer and the BERT encoder from the same saved directory,
# so both come from one embedding-model checkpoint.
EMBEDDING_MODEL_DIR = 'bert_embedding_model'
bert_tokenizer = BertTokenizer.from_pretrained(EMBEDDING_MODEL_DIR)
bert_model = BertModel.from_pretrained(EMBEDDING_MODEL_DIR)

In [6]:
# Load the Ubuntu manual.
# The encoding is given explicitly so the text read does not depend on the
# platform's default locale encoding.
with open('ubuntu_manual.txt', 'r', encoding='utf-8') as file:
    ubuntu_manual = file.read()

# Split the manual into smaller sections at blank lines.
# Empty / whitespace-only pieces (produced by runs of 3+ newlines or by
# leading/trailing blank lines) are dropped: they carry no content to embed,
# and an empty string written to CSV is read back by pandas as NaN.
sections = [
    section.strip()
    for section in ubuntu_manual.split('\n\n')
    if section.strip()
]

In [7]:
# Function to embed text
def embed_text(text):
    """Embed text with the loaded BERT model using mask-aware mean pooling.

    The input is tokenized (truncated to at most 512 tokens), run through
    ``bert_model`` without gradient tracking, and the final hidden states are
    averaged over the token axis. Only positions marked as real tokens by the
    tokenizer's attention mask contribute to the average, so padding added
    when a list of texts is passed does not dilute the result. For a single
    string there is no padding and the result matches a plain mean over
    tokens (up to float rounding).

    Args:
        text: A string, or a list of strings, accepted by ``bert_tokenizer``.

    Returns:
        A numpy array of pooled embeddings with all size-1 dimensions removed
        by ``squeeze()``: 1-D for a single input, one row per text for a
        batch of several texts.
    """
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp guards against division by zero if a row had no unmasked tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze().numpy()

In [8]:
# Embed every manual section, in order, with one embed_text call per section
embeddings = list(map(embed_text, sections))

# Stack the per-section vectors row-wise into a single numpy array
embedding_matrix = np.vstack(embeddings)


In [9]:
# Build the FAISS index: an L2-distance flat index whose dimensionality is
# the width (second axis) of the embedding matrix, then add every row to it.
embedding_dim = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)

In [10]:
# Persist the FAISS index to disk
faiss.write_index(index, "faiss_manual_index.bin")

# Persist the section texts alongside it, one per row in a 'section' column
# and in the same order as `sections`, without writing the DataFrame index.
sections_frame = pd.DataFrame({'section': sections})
sections_frame.to_csv("manual_sections.csv", index=False)