In [1]:
#!pip install -U transformers rank_bm25 sentence-transformers langchain langchain-experimental langchain-community "unstructured[all-docs]" --quiet

In [2]:
import transformers

transformers.__version__

'4.38.2'

In [3]:
import pandas as pd
from pathlib import Path
from functools import partial
from unstructured.partition.auto import partition
from unstructured.chunking import chunk_by_title
from unstructured.documents.elements import Element, Text
from unstructured.cleaners.core import clean
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import SelfHostedHuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

model_id = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}

model = HuggingFaceEmbeddings(
    model_name=model_id, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

  return self.fget.__get__(instance, owner)()


In [4]:
text_splitter = SemanticChunker(model)

In [5]:
files = dict()

files_path = Path("./data/")

for file in files_path.glob("*.pdf"):
    elements = partition(str(file.resolve()))
    text_elements = chunk_by_title(elements)
    
    chunks = []

    for element in text_elements:
        element.apply(
            partial(
                clean,
                bullets=True,
                extra_whitespace=True,
                dashes=True,
                trailing_punctuation=True
            )
        )
        chunks.append(element)
    files[file.name] = chunks

In [16]:
# After table of contents
files["978-981-15-1967-3.pdf"][117]

<unstructured.documents.elements.CompositeElement at 0x7fe1353fe440>

In [17]:
# Before index
files["978-981-15-1967-3.pdf"][-40]

<unstructured.documents.elements.CompositeElement at 0x7fe134abe5c0>

In [8]:
book = [chunk.text for chunk in files["978-981-15-1967-3.pdf"][117:-39]]
book = " ".join(book)

In [18]:
docs = text_splitter.create_documents([book])
docs = [doc.page_content for doc in docs]

In [10]:
questions = [
    "What is overfitting?",
    "What is underfitting?",
    "How do we test a models generalization error?",
    "Why should training data points not be in the test set?",
    "What is cross validation?",
    "What are commonly used metrics to measure the performance of a model?",
    "What is linear regression?",
    "What are problems of linear models?",
    "What is a decision tree?",
    "What is the McCulloch–Pitts model?",
    "What is a neural network?",
    "How is a neural network optimized?",
    "How does backpropagation work?",
    "What is deep learning?",
    "What is ensemble learning?",
    "What is the goal of ensemble methods?",
    "What is supervised learning?"
    "What is unsupervised learning?",
    "What is the difference between supervised and unsupervised learning?",
    "What is the goal of clustering?",
    "How is k-Nearest Neighbor trained?",
    "Which algorithm can be used to reduce dimensions?",
    "What is semi-supervised learning?",
    "What is reinforcement learning?",
    "What is an Markov Decision Process used for in reinforcement learning",
    "How interacts an reinforcment learning agent with its environment?",
    "What is the goal of an agent in reinforcement learning?",
    "What is the Exploration-Exploitation dilemma?"
]

In [13]:
from model import DecoderModel
from store import VectorStore

db = VectorStore("sentence-transformers/all-MiniLM-L6-v2", hybrid=True)

[nltk_data] Downloading package wordnet to /home/chkei001/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  return self.fget.__get__(instance, owner)()


In [14]:
db.add_documents(docs, batch_size=50)

In [15]:
from tqdm import tqdm
import transformers

transformers.logging.disable_progress_bar()
transformers.logging.set_verbosity_error()

models = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "HuggingFaceH4/zephyr-7b-gemma-v0.1",
    "google/gemma-2b-it",
    #"mistralai/Mixtral-8x7B-Instruct-v0.1" GPU Memory
]

result_rows = []

for model_id in models:
    causal_lm = DecoderModel(model_id, device="cuda")
    with tqdm(total=len(questions)) as pbar:
        for question in questions:
            results = db.search(question, top_n=3)
            contexts = [result["document"] for result in results]
            model_input = "\n\n".join(contexts)
            answer = causal_lm(question, model_input)
            result_rows.append(
                {
                    "model": model_id,
                    "contexts": contexts,
                    "answer": answer,
                    "question": question
                }
            )
            pbar.update(1)
pd.DataFrame(result_rows).to_csv(f"ML_BOOK_RESULTS.csv")

100%|██████████| 27/27 [02:50<00:00,  6.31s/it]
100%|██████████| 27/27 [02:36<00:00,  5.81s/it]
100%|██████████| 27/27 [50:13<00:00, 111.59s/it]
100%|██████████| 27/27 [15:32<00:00, 34.52s/it]
