In [1]:
import nltk
import pandas as pd
import openpyxl
import os
from abc import ABC, abstractmethod
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Marker output: reaching this line means every import above succeeded.
print("Libraries imported.")

Libraries imported.


Different embedding models (LLMs) from HuggingFace are used together with the FAISS vector store. Once a model has embedded N randomly selected samples (drawn with a fixed seed), the resulting vector store is saved locally to data/vector_stores/ and loaded for future use. Samples only need to be embedded once for each LLM x N combination.

Outputs for each query are saved to data/output/ in an Excel file named after the retrieval system (class name, embedding model name and sample size, followed by the suffix `_results.xlsx`).

QUERIES - queries we are interested in

NUMBER OF ARTICLES - represents how many "top results" we want to output to Excel

SAMPLE SIZE - the size of the random sample of blogs that serves as our database

In [2]:
# Experiment settings. The keys are looked up verbatim by later cells.
config = {
    # Natural-language queries to run against every retrieval system.
    'QUERIES': [
        "What are people saying about relationships?",
        "How do bloggers feel about technology advancements?",
        "Tell me something about Brasil",
        "car mechanics price"
    ],
    # How many top-ranked chunks to keep per query.
    'NUMBER OF ARTICLES': 3,
    # Number of rows randomly sampled from the full DataFrame.
    'SAMPLE SIZE': 25000
}

# All artefacts live under ./data, relative to the notebook's working directory.
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'data')
vector_stores_dir, embeddings_dir, sampled_data_dir, output_dir = (
    os.path.join(data_dir, folder)
    for folder in ('vector_stores', 'embeddings', 'sampled_data', 'output')
)

# Create the folders on first run; exist_ok keeps re-runs harmless.
for directory in (vector_stores_dir, embeddings_dir, sampled_data_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

print("Data directories set up.")

# The source corpus is expected at data/blogtext.csv.
blog_csv_path = os.path.join(data_dir, 'blogtext.csv')
df = pd.read_csv(blog_csv_path)

print("DataFrame loaded.")

# Fetch the NLTK 'punkt' package (a no-op when it is already up to date).
nltk.download('punkt')

print("NLTK prepared.")

Data directories set up.
DataFrame loaded.
NLTK prepared.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Antisha\anaconda3\envs\vm_task_env\nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!


In [3]:
class DocumentRetrievalSystem(ABC):
    """Template for an embedding-based retrieval pipeline over a DataFrame.

    The pipeline is: sample rows -> wrap them in ``Document`` objects -> split
    them into chunks -> embed the chunks -> build (or load) a vector store ->
    answer queries with a similarity search and log the hits to Excel.

    Subclasses choose the concrete components by implementing the four
    abstract ``get_*`` hooks.

    The working directories (``data_dir``, ``vector_stores_dir``,
    ``embeddings_dir``, ``sampled_data_dir``, ``output_dir``) are read from the
    module-level variables of the same names when an instance is constructed.
    """

    # Columns copied from each sampled row into the Document metadata.
    METADATA_COLUMNS = ('id', 'gender', 'age', 'topic', 'sign', 'date')

    def __init__(self, df, sample_size=100000, random_state=42):
        """Store the inputs and derive the names used for on-disk artefacts.

        Args:
            df: Source DataFrame. ``prepare_data`` reads its ``text`` column
                and the columns listed in ``METADATA_COLUMNS``.
            sample_size: Number of rows to sample in ``prepare_data``.
            random_state: Seed passed to ``DataFrame.sample``.
        """
        self.df = df
        self.sample_size = sample_size
        self.random_state = random_state
        self.df_sampled = None
        self.doc_list = None
        self.split_docs = None
        self.embedding_model = None
        self.vector_store = None
        self.vector_store_path = None

        self.class_name = self.__class__.__name__
        # The hook is called during construction, so subclasses must be able
        # to answer it without any further setup.
        self.embedding_model_name = self.get_embedding_model_name()
        # Identifies one (class, model, sample size) combination; used for the
        # vector store folder, the sampled-data CSV and the results workbook.
        self.vector_store_name = f"{self.class_name}_{self.embedding_model_name}_{self.sample_size}"

        self.data_dir = data_dir
        self.vector_stores_dir = vector_stores_dir
        self.embeddings_dir = embeddings_dir
        self.sampled_data_dir = sampled_data_dir
        self.output_dir = output_dir

    @abstractmethod
    def get_embedding_model_name(self):
        """Return the embedding model's short name (used in file names and logs)."""
        pass

    @abstractmethod
    def get_embedding_model(self):
        """Return an embedding model object exposing ``embed_documents(texts)``."""
        pass

    @abstractmethod
    def get_text_splitter(self):
        """Return a splitter object exposing ``split_documents(docs)``."""
        pass

    @abstractmethod
    def get_vector_store_class(self):
        """Return the vector store class.

        ``create_vector_store`` calls ``load_local`` and ``from_embeddings`` on
        the class and ``save_local`` on the resulting instance, so the class
        must offer that (FAISS-style) interface.
        """
        pass

    def prepare_data(self):
        """Sample the DataFrame, persist the sample, and build chunked documents.

        Populates ``df_sampled``, ``doc_list`` and ``split_docs`` and writes the
        sampled rows to ``<sampled_data_dir>/<vector_store_name>_sampled_data.csv``.
        """
        print(f"Sampling {self.sample_size} rows from the DataFrame.")
        self.df_sampled = self.df.sample(n=self.sample_size, random_state=self.random_state)

        sampled_data_path = os.path.join(self.sampled_data_dir, f"{self.vector_store_name}_sampled_data.csv")
        self.df_sampled.to_csv(sampled_data_path, index=False)
        print(f"Sampled DataFrame saved as '{sampled_data_path}'.")

        # The CSV above keeps the raw values; only the in-memory copy is
        # coerced to str so every page_content is a string.
        self.df_sampled['text'] = self.df_sampled['text'].astype(str)

        # One pass over plain dict records instead of row-by-row iterrows().
        metadata_columns = list(self.METADATA_COLUMNS)
        self.doc_list = [
            Document(
                page_content=record['text'],
                metadata={column: record[column] for column in metadata_columns}
            )
            for record in self.df_sampled.to_dict('records')
        ]
        print(f"Created {len(self.doc_list)} Document objects.")

        text_splitter = self.get_text_splitter()
        self.split_docs = text_splitter.split_documents(self.doc_list)
        print(f"Split documents into {len(self.split_docs)} chunks.")

    def initialize_embeddings(self):
        """Create the embedding model once; later calls only report that it exists."""
        if self.embedding_model is None:
            print("Initializing embedding model.")
            self.embedding_model = self.get_embedding_model()
            print(f"Initialized embedding model '{self.embedding_model_name}'.")
        else:
            print("Embedding model already initialized.")

    def create_vector_store(self, batch_size=1000):
        """Load the vector store from disk, or embed the chunks and build it.

        If ``<vector_stores_dir>/<vector_store_name>`` exists it is loaded.
        Otherwise every chunk in ``split_docs`` is embedded exactly once, in
        batches, the store is built from those precomputed vectors and saved
        to that path.

        Args:
            batch_size: Number of chunks passed to ``embed_documents`` per call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1, or a new store has
                to be built but there are no chunks to embed.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        self.vector_store_path = os.path.join(self.vector_stores_dir, self.vector_store_name)

        # Both branches need the model (loading needs it to embed queries).
        if self.embedding_model is None:
            self.initialize_embeddings()

        store_cls = self.get_vector_store_class()

        if os.path.exists(self.vector_store_path):
            print(f"Loading existing vector store from '{self.vector_store_path}'.")
            # SECURITY: allow_dangerous_deserialization=True permits unpickling
            # the saved store; only point this at stores created locally by
            # this notebook, never at files from an untrusted source.
            self.vector_store = store_cls.load_local(
                self.vector_store_path, embeddings=self.embedding_model, allow_dangerous_deserialization=True)
            print("Vector store loaded.")
            return

        print("Creating new vector store from embeddings.")

        if self.split_docs is None:
            self.prepare_data()
        if len(self.split_docs) == 0:
            raise ValueError("No document chunks to embed; cannot create a vector store.")

        os.makedirs(self.embeddings_dir, exist_ok=True)

        all_embeddings = []
        all_texts = []
        all_metadatas = []

        total_chunks = len(self.split_docs)
        total_batches = (total_chunks + batch_size - 1) // batch_size  # ceiling division

        print("Starting embedding process...")
        for batch_number, start in enumerate(range(0, total_chunks, batch_size), start=1):
            batch_docs = self.split_docs[start:start + batch_size]
            batch_texts = [doc.page_content for doc in batch_docs]
            all_embeddings.extend(self.embedding_model.embed_documents(batch_texts))
            all_texts.extend(batch_texts)
            all_metadatas.extend(doc.metadata for doc in batch_docs)
            print(f"Processed batch {batch_number}/{total_batches}")

        print(f"Completed embedding of {len(all_embeddings)} documents.")

        # Build the index from the vectors computed above. Using from_texts
        # here would embed the whole corpus a second time.
        self.vector_store = store_cls.from_embeddings(
            text_embeddings=list(zip(all_texts, all_embeddings)),
            embedding=self.embedding_model,
            metadatas=all_metadatas
        )
        print("Vector store created from embeddings.")
        self.vector_store.save_local(self.vector_store_path)
        print(f"Vector store saved to '{self.vector_store_path}'.")

    def get_top_n_documents(self, query, n):
        """Return the ``n`` best-matching chunks for ``query`` and log them to Excel.

        Args:
            query: Query text passed to the store's similarity search.
            n: Number of hits requested (passed as ``k``).

        Returns:
            A list of dicts with keys ``query``, ``content``, ``metadata`` and
            ``score``, in the order returned by the store. The list is empty
            when the vector store has not been created yet.
            NOTE(review): ``score`` is whatever the store reports; for a default
            FAISS index this is presumably a distance (lower = closer) - confirm.
        """
        if self.vector_store is None:
            print("Vector store not initialized. Please run create_vector_store() first.")
            return []

        docs_and_scores = self.vector_store.similarity_search_with_score(query, k=n)

        results = [
            {
                'query': query,
                'content': doc.page_content,
                'metadata': doc.metadata,
                'score': score
            }
            for doc, score in docs_and_scores
        ]

        # Nothing to log for an empty hit list; writing it could create a
        # workbook without a header row.
        if results:
            self._save_results(results)

        return results

    def _save_results(self, results):
        """Append ``results`` to ``<output_dir>/<vector_store_name>_results.xlsx``."""
        df_results = pd.DataFrame(results)

        output_file_name = f"{self.vector_store_name}_results.xlsx"
        output_file_path = os.path.join(self.output_dir, output_file_name)

        if os.path.exists(output_file_path):
            # Append mode is only supported by the openpyxl engine, so select
            # it explicitly instead of relying on pandas' default writer.
            with pd.ExcelWriter(output_file_path, engine='openpyxl', mode='a',
                                if_sheet_exists='overlay') as writer:
                sheet = writer.sheets.get('Sheet1')
                # max_row is the 1-based index of the last used row, which is
                # the 0-based index of the first free row.
                startrow = sheet.max_row if sheet is not None else 0
                # Only emit the header when the sheet is being created here.
                df_results.to_excel(writer, index=False, header=(sheet is None), startrow=startrow)
        else:
            df_results.to_excel(output_file_path, index=False)

        print(f"Results saved to '{output_file_path}'.")

    def process_query(self, query, n):
        """Run ``query``, then print the best hit (or a no-results notice)."""
        results = self.get_top_n_documents(query, n)
        if results:
            top_result = results[0]
            print(f"\nTop result from {self.class_name} for query '{query}':")
            print("Content:", top_result['content'])
            print("Metadata:", top_result['metadata'])
            print("Embedding Model (LLM):", self.embedding_model_name)
            print("-" * 80)
        else:
            print(f"No results found for query '{query}' using {self.class_name}.")



In [4]:
class BlogRetrievalSystemMiniLMFAISS(DocumentRetrievalSystem):
    """Retrieval variant: all-MiniLM-L6-v2 embeddings, recursive character splitting, FAISS."""

    def get_embedding_model_name(self):
        """Return the short model name used in artefact names and log lines."""
        model_label = 'all-MiniLM-L6-v2'
        return model_label

    def get_embedding_model(self):
        """Build the HuggingFace embedding wrapper for this variant's model."""
        embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
        return embedder

    def get_text_splitter(self):
        """Build the splitter (chunk_size=1000, chunk_overlap=200)."""
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return splitter

    def get_vector_store_class(self):
        """Return the vector store class; FAISS is imported at the top of the notebook."""
        return FAISS

class BlogRetrievalSystemParaphraseFAISS(DocumentRetrievalSystem):
    """Retrieval variant: paraphrase-MiniLM-L6-v2 embeddings, recursive character splitting, FAISS."""

    def get_embedding_model_name(self):
        """Return the short model name used in artefact names and log lines."""
        model_label = 'paraphrase-MiniLM-L6-v2'
        return model_label

    def get_embedding_model(self):
        """Build the HuggingFace embedding wrapper for this variant's model."""
        embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')
        return embedder

    def get_text_splitter(self):
        """Build the splitter (chunk_size=1000, chunk_overlap=200)."""
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return splitter

    def get_vector_store_class(self):
        """Return the vector store class; FAISS is imported at the top of the notebook."""
        return FAISS


class BlogRetrievalSystemMPNetFAISS(DocumentRetrievalSystem):
    """Retrieval variant: all-mpnet-base-v2 embeddings, recursive character splitting, FAISS."""

    def get_embedding_model_name(self):
        """Return the short model name used in artefact names and log lines."""
        model_label = 'all-mpnet-base-v2'
        return model_label

    def get_embedding_model(self):
        """Build the HuggingFace embedding wrapper for this variant's model."""
        embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
        return embedder

    def get_text_splitter(self):
        """Build the splitter (chunk_size=1000, chunk_overlap=200)."""
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return splitter

    def get_vector_store_class(self):
        """Return the vector store class; FAISS is imported at the top of the notebook."""
        return FAISS

class BlogRetrievalSystemQAFAISS(DocumentRetrievalSystem):
    """Retrieval variant: multi-qa-mpnet-base-dot-v1 embeddings, recursive character splitting, FAISS."""

    def get_embedding_model_name(self):
        """Return the short model name used in artefact names and log lines."""
        model_label = 'multi-qa-mpnet-base-dot-v1'
        return model_label

    def get_embedding_model(self):
        """Build the HuggingFace embedding wrapper for this variant's model."""
        embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/multi-qa-mpnet-base-dot-v1')
        return embedder

    def get_text_splitter(self):
        """Build the splitter (chunk_size=1000, chunk_overlap=200)."""
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return splitter

    def get_vector_store_class(self):
        """Return the vector store class; FAISS is imported at the top of the notebook."""
        return FAISS

class BlogRetrievalSystemMiniLMFAISS_NLTK(DocumentRetrievalSystem):
    """Retrieval variant: all-MiniLM-L6-v2 embeddings, NLTK-based splitting, FAISS."""

    def get_embedding_model_name(self):
        """Return the short model name used in artefact names and log lines."""
        model_label = 'all-MiniLM-L6-v2'
        return model_label

    def get_embedding_model(self):
        """Build the HuggingFace embedding wrapper for this variant's model."""
        embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
        return embedder

    def get_text_splitter(self):
        """Build the NLTK splitter (chunk_size=1000, chunk_overlap=200)."""
        # Imported here because the notebook's import cell does not bring in
        # NLTKTextSplitter.
        from langchain.text_splitter import NLTKTextSplitter
        splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap=200)
        return splitter

    def get_vector_store_class(self):
        """Return the vector store class; FAISS is imported at the top of the notebook."""
        return FAISS

class BlogRetrievalSystemParaphraseFAISS_NLTK(DocumentRetrievalSystem):
    """Retrieval variant: paraphrase-MiniLM-L6-v2 embeddings, NLTK-based splitting, FAISS."""

    def get_embedding_model_name(self):
        """Return the short model name used in artefact names and log lines."""
        model_label = 'paraphrase-MiniLM-L6-v2'
        return model_label

    def get_embedding_model(self):
        """Build the HuggingFace embedding wrapper for this variant's model."""
        embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')
        return embedder

    def get_text_splitter(self):
        """Build the NLTK splitter (chunk_size=1000, chunk_overlap=200)."""
        # Imported here because the notebook's import cell does not bring in
        # NLTKTextSplitter.
        from langchain.text_splitter import NLTKTextSplitter
        splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap=200)
        return splitter

    def get_vector_store_class(self):
        """Return the vector store class; FAISS is imported at the top of the notebook."""
        return FAISS


In [5]:
# Retrieval variants to compare; uncomment an entry to include it in the run.
subclasses = [
    BlogRetrievalSystemMiniLMFAISS,
    BlogRetrievalSystemParaphraseFAISS,
    # BlogRetrievalSystemMPNetFAISS,
    # BlogRetrievalSystemQAFAISS,
    # BlogRetrievalSystemMiniLMFAISS_NLTK,
    # BlogRetrievalSystemParaphraseFAISS_NLTK,
]

# Stage 1: build each selected system - sample and split the data, set up the
# embedding model, then load or create its vector store.
instances = []
for subclass in subclasses:
    print(f"\nInitializing {subclass.__name__}...")
    blog_system = subclass(df, sample_size=config['SAMPLE SIZE'])
    setup_steps = (
        blog_system.prepare_data,
        blog_system.initialize_embeddings,
        blog_system.create_vector_store,
    )
    for run_step in setup_steps:
        run_step()
    instances.append(blog_system)

# Stage 2: query-major order, so the answers of all systems to the same query
# are printed next to each other.
top_n = config['NUMBER OF ARTICLES']
for query in config['QUERIES']:
    for blog_system in instances:
        blog_system.process_query(query, top_n)



Initializing BlogRetrievalSystemMiniLMFAISS...
Sampling 25000 rows from the DataFrame.
Sampled DataFrame saved as 'C:\Users\Antisha\Documents\vm_task\data\sampled_data\BlogRetrievalSystemMiniLMFAISS_all-MiniLM-L6-v2_25000_sampled_data.csv'.
Created 25000 Document objects.
Split documents into 45903 chunks.
Initializing embedding model.


  return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


Initialized embedding model 'all-MiniLM-L6-v2'.
Loading existing vector store from 'C:\Users\Antisha\Documents\vm_task\data\vector_stores\BlogRetrievalSystemMiniLMFAISS_all-MiniLM-L6-v2_25000'.
Vector store loaded.

Initializing BlogRetrievalSystemParaphraseFAISS...
Sampling 25000 rows from the DataFrame.
Sampled DataFrame saved as 'C:\Users\Antisha\Documents\vm_task\data\sampled_data\BlogRetrievalSystemParaphraseFAISS_paraphrase-MiniLM-L6-v2_25000_sampled_data.csv'.
Created 25000 Document objects.
Split documents into 45903 chunks.
Initializing embedding model.
Initialized embedding model 'paraphrase-MiniLM-L6-v2'.
Loading existing vector store from 'C:\Users\Antisha\Documents\vm_task\data\vector_stores\BlogRetrievalSystemParaphraseFAISS_paraphrase-MiniLM-L6-v2_25000'.
Vector store loaded.
Results saved to 'C:\Users\Antisha\Documents\vm_task\data\output\BlogRetrievalSystemMiniLMFAISS_all-MiniLM-L6-v2_25000_results.xlsx'.

Top result from BlogRetrievalSystemMiniLMFAISS for query 'What 