In [1]:
!pip install openai faiss-cpu tiktoken


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
from openai import OpenAI
import faiss
import numpy as np
import tiktoken
import re
import os
import json
from google.colab import drive
from google.colab import userdata


In [3]:
# Mount Google Drive at /content/drive so the notebook can read and write files there.
drive.mount('/content/drive')
# Read the OpenAI key from Colab's userdata store instead of hard-coding it in the notebook.
API_KEY = userdata.get('MY_OPENAI_KEY')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Single OpenAI client shared by the cells below (used for the embedding requests).
client = OpenAI(api_key=API_KEY)

In [5]:
# Drive folder that holds this notebook's artifacts; the FAISS index and the
# chunk metadata JSON are written here further down.
save_path = "/content/drive/MyDrive/RAG Recordings/"
# exist_ok=True makes re-running this cell harmless when the folder already exists.
os.makedirs(save_path, exist_ok=True)

In [23]:
def chunk_text(text, max_tokens=400, model_name="text-embedding-3-small"):
    """Split a multi-session transcript into token-bounded chunks tagged by session.

    Sessions are introduced by markers of the form ``--- <session id> ---``.
    Each session body is split into sentences (after ``.``, ``!`` or ``?``
    followed by one or more spaces) and the sentences are packed greedily, in
    order, into chunks whose summed per-sentence token count stays within
    ``max_tokens``.

    Parameters
    ----------
    text : str
        Full transcript containing zero or more ``--- <session id> ---`` markers.
    max_tokens : int, optional
        Token budget per chunk, measured with the tokenizer of ``model_name``.
    model_name : str, optional
        Model name passed to ``tiktoken.encoding_for_model`` to pick the tokenizer.

    Returns
    -------
    list of dict
        One ``{"session_id": str, "text": str}`` dict per chunk, in document order.

    Notes
    -----
    * Text that appears before the first marker is ignored.
    * Sessions whose body is empty produce no chunk.
    * Sentences are never split: a single sentence longer than ``max_tokens``
      becomes a chunk of its own that exceeds the budget.
    * Token counts are summed per sentence; the spaces used to join sentences
      are not counted, so the budget is approximate.
    """
    enc = tiktoken.encoding_for_model(model_name)

    # The capturing group is essential: it makes re.split keep each marker's
    # label, so the result alternates
    #   [preamble, id_1, body_1, id_2, body_2, ...]
    # Without it the labels are dropped and bodies get mistaken for ids.
    sessions = re.split(r'---\s*(.*?)\s*---', text)

    chunks_with_meta = []

    # Odd positions hold session ids, the following even position the body.
    for i in range(1, len(sessions) - 1, 2):
        session_id = sessions[i].strip()
        session_text = sessions[i + 1].strip()
        if not session_text:
            # Nothing to embed for a marker with no content after it.
            continue

        sentences = re.split(r'(?<=[.!?]) +', session_text)

        current_chunk, current_len = [], 0

        for sentence in sentences:
            tokens = len(enc.encode(sentence))
            if current_len + tokens > max_tokens:
                # Only flush when something has been collected; otherwise an
                # over-long first sentence would emit an empty-text chunk.
                if current_chunk:
                    chunks_with_meta.append({
                        "session_id": session_id,
                        "text": " ".join(current_chunk)
                    })
                current_chunk, current_len = [sentence], tokens
            else:
                current_chunk.append(sentence)
                current_len += tokens

        # Flush the trailing partial chunk of this session.
        if current_chunk:
            chunks_with_meta.append({
                "session_id": session_id,
                "text": " ".join(current_chunk)
            })

    return chunks_with_meta

In [31]:
# The combined transcript lives in the same Drive folder as the artifacts, so
# derive its path from save_path instead of repeating the directory literal.
text_path = os.path.join(save_path, "all_transcriptions.txt")
with open(text_path, "r", encoding="utf-8") as f:
    text = f.read()

chunks = chunk_text(text)

# Fail here with a readable message rather than later inside the embedding/index cells.
assert chunks, f"No chunks were produced from {text_path}; check the '--- <session> ---' markers"


In [35]:
def get_embeddings(chunks, model="text-embedding-3-small", batch_size=100):
    """Embed the ``"text"`` of every chunk and return the vectors as a float32 array.

    Texts are sent to the embeddings endpoint in batches instead of one request
    per chunk, which cuts the number of network round-trips.

    Parameters
    ----------
    chunks : iterable of dict
        Items with a ``"text"`` key (as produced by ``chunk_text``).
    model : str, optional
        Embedding model name passed to the API.
    batch_size : int, optional
        Number of texts sent per request. Keep it within the API's per-request
        input limits.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one row per chunk, in the same order as ``chunks``.
        For an empty ``chunks`` the result is an empty 1-D array.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = [chunk["text"] for chunk in chunks]

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model=model,
            input=texts[start:start + batch_size]
        )
        # Each returned item carries the position of its input within the
        # request; sort on it so rows stay aligned with `chunks`.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    return np.array(embeddings, dtype="float32")

# Embed every chunk via the OpenAI API (network requests); row i of the result
# corresponds to chunks[i].
embeddings = get_embeddings(chunks)


In [36]:
# The index and the JSON are looked up by position (row i <-> chunks[i]), so
# refuse to build or persist anything if the two are not aligned. This also
# turns the opaque IndexError on shape[1] for empty input into a clear message.
assert embeddings.ndim == 2 and embeddings.shape[0] == len(chunks), (
    f"expected one embedding row per chunk, got shape {embeddings.shape} "
    f"for {len(chunks)} chunks"
)

dimension = embeddings.shape[1]
# Flat L2 index: stores the vectors as-is and compares by L2 distance.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

faiss_path = os.path.join(save_path, "recordings_index.faiss")
chunks_path = os.path.join(save_path, "recordings_chunks.json")

faiss.write_index(index, faiss_path)

# ensure_ascii=False keeps non-ASCII transcript text readable in the JSON file.
with open(chunks_path, "w", encoding="utf-8") as f:
    json.dump(chunks, f, ensure_ascii=False, indent=2)
