# Reading The Scala 2 Cookbook

<img src="https://m.media-amazon.com/images/I/91AfDQsL7SL._AC_UF1000,1000_QL80_.jpg" width="300" />


In [8]:
import PyPDF2
from tqdm import tqdm
import os


def divide_chunks(text_body, min_chunk_size=1000, max_chunk_size=2500):
    in_chunks = text_body.split(".\n")
    out_chunks = []
    curr_chunk = ""
    for ic in in_chunks:
        if len(ic) < min_chunk_size:
            if len(ic)+len(curr_chunk) < max_chunk_size:
                curr_chunk = curr_chunk + "\n" + ic
            else:
                out_chunks.append(curr_chunk)
                curr_chunk = ""
    return out_chunks


def clean_chunk(raw_chunk):
    chunk = raw_chunk.replace('-\n','').replace('\n',' ')
    return chunk


body = ""
with open(f"data/pdf/scala_book/Scala_Cookbook.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    for page in tqdm(reader.pages):
        text = page.extract_text()
        body += "\n\n" + text
    chunk_list = [clean_chunk(c) for c in divide_chunks(body)]

100%|██████████| 722/722 [00:08<00:00, 80.54it/s]


# Summarization

In [None]:
import torch
from transformers import pipeline

hf_name = 'pszemraj/led-large-book-summary'

summarizer = pipeline(
    "summarization",
    hf_name,
    device=0 if torch.cuda.is_available() else -1,
)

summaries = []

for sentence in tqdm(chunk_list):
  summaries.append(summarizer(
    sentence,
    max_length=256,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=3,
    repetition_penalty=3.5,
    num_beams=4,
    early_stopping=True,
  )[0]["summary_text"])

# Merging smaller summarized chunks

In [None]:
def merge_chunks(small_chunks, aggregation_cardinality):
  big_chunks = []
  for i in range(0, len(small_chunks), aggregation_cardinality):
    current_merge = ".\n".join(small_chunks[i:i+aggregation_cardinality])
    big_chunks.append(current_merge)
  return big_chunks

big_chunk_list = merge_chunks(summaries, 4)

# Embeddings
##### Multidimensional Vector Representation of Semantic Meanings
<img src="https://corpling.hypotheses.org/files/2018/04/Screen-Shot-2018-04-25-at-13.21.44.png" width="400" />



In [None]:
from sentence_transformers import SentenceTransformer
import itertools

model_name = "sentence-transformers/all-MiniLM-L12-v2"

model = SentenceTransformer(model_name, device='cuda')

embeddings = []
for bc in tqdm(big_chunk_list):
  embeddings.append([float(emb) for emb in model.encode(bc)])

# Database
##### Creating a Vector Database of Embeddings using the open-source ChromaDB
<img src="https://www.mlq.ai/content/images/2023/08/1_admwyPyR6v_IZI0EYE--eA-1.webp" width="250" />


In [14]:
from chromadb import Client
from chromadb.utils import embedding_functions


chroma_client = Client()

default_ef = embedding_functions.DefaultEmbeddingFunction()

collection = chroma_client.get_or_create_collection(name="scala_book_chunks", embedding_function=default_ef)

collection.add(
    documents=big_chunk_list,
    embeddings=embeddings,
    ids=[str(x) for x in range(len(big_chunk_list))]
)

# ChatBot
##### Using the open-source Databricks' Dolly model fine-tuned through the RAG techinique
<img src="https://www.databricks.com/sites/default/files/2023-04/Dolly-logo.png" width="300" />

In [15]:
from transformers import pipeline
import torch
from langchain import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain


def build_qa_chain():

    model_name = "databricks/dolly-v2-3b" # Dolly smallest version (3 billion params)

    instruct_pipeline = pipeline(model=model_name, torch_dtype=torch.bfloat16, trust_remote_code=True,
                                 return_full_text=True, max_new_tokens=4096, top_p=0.95, top_k=50,
                                 device=0) #cuda

    template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

    Instruction:
    You are an expert Scala developer.
    You use a simple language to explain concepts.
    You reply using only short textual descriptions and code expamples.

    {context}

    Question: {question}

    Response:
    """

    prompt = PromptTemplate(input_variables=['context', 'question'], template=template)

    hf_pipe = HuggingFacePipeline(pipeline=instruct_pipeline)

    return load_qa_chain(llm=hf_pipe, chain_type="stuff", prompt=prompt, verbose=True)

In [None]:
# Building the chain will load Dolly and can take several minutes depending on the model size
qa_chain = build_qa_chain()

In [26]:
class Document():
    def __init__(self, content):
        self.page_content = content
        self.metadata = {"metadata": ""}

def get_similar_docs(question):
    results = collection.query(
        query_embeddings=[float(x) for x in model.encode(question)],
        n_results=2
    )
    return results["documents"]

def answer_question(question):
    similar_docs = [Document(x) for x in get_similar_docs(question)]
    result = qa_chain({"input_documents": similar_docs, "question": question})
    return result

In [27]:
import os
os.environ['CURL_CA_BUNDLE'] = ''

question = "Which are the strenghts of the Scala programming language?"

answer = answer_question(question)

print(answer["output_text"])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mBelow is an instruction that describes a task. Write a response that appropriately completes the request.

    Instruction:
    You are an expert Scala developer.
    You use a simple language to explain concepts.
    You reply using only short textual descriptions and code expamples.

    ['In this chapter, the author gives a detailed introduction of the Scala programming language and its many features. He also thanks his sister Melissa who helped him to edit his manuscript into WordPerfect before he uploaded it to Microsoft Word. Finally, he\'d like to take a special shout-out to Martin Oederdy and his engineering team at E FPL for helping him to integrate the Scala REPL into his daily workflow. The first chapter introduces some of the most important Scala features such as scalability, which are useful for both programmers and non-programmers alike. S

In [30]:
question = "What are side effects?"

answer = answer_question(question)

print(answer["output_text"])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mBelow is an instruction that describes a task. Write a response that appropriately completes the request.

    Instruction:
    You are an expert Scala developer.
    You use a simple language to explain concepts.
    You reply using only short textual descriptions and code expamples.

    ['Using STDOUT In this chapter, we cover the basics of executing commands and using STDOUT to execute them in a Scala program. We\'ll start with a problem that sounds simple: What do you do when you run a command into a terminal emulator? The solution is to use !! to execute STDOUT . This allows you to execute any command from a process in another part of the system and get STDOUT , which is the output of the executed command. It\'s similar to the " ! command" shown in Chapter 11, except instead of a single string you can attach multiple strands . STDOUT is the result




[1m> Finished chain.[0m

[1m> Finished chain.[0m

In computer science, a side effect of an operation is the change in the state of the system caused by the operation. It can also refer to an unintended, unintended, undesired, or undesirable result of an action.


In [29]:
question = "How do I transform a sequence of string to uppercase?"

answer = answer_question(question)

print(answer["output_text"])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mBelow is an instruction that describes a task. Write a response that appropriately completes the request.

    Instruction:
    You are an expert Scala developer.
    You use a simple language to explain concepts.
    You reply using only short textual descriptions and code expamples.

    ['In this chapter, Overton explains how to implement a method into a map , which is a mapping of strings into a collection of strings. He discusses the steps for implementing such a method and gives an example of how to use it to process a string one character at a time. In Chapter 1, he describes his technique for finding patterns in strings . To him, the trickiest thing to do is to find a pattern in a string by matching its first two characters with numPattern . He uses the regex pattern on a string to search for matches. Although it\'s more work than he originally 