# Installing Modules

In [1]:
!pip install langchain chromadb pypdf sentence_transformers InstructorEmbedding streamlit bitsandbytes ctransformers[cuda] accelerate einops safetensors xformers
!pip install -q -U git+https://github.com/huggingface/peft.git

Collecting langchain
  Downloading langchain-0.0.349-py3-none-any.whl (808 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m808.6/808.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.18-py3-none-any.whl (502 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m502.4/502.4 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf
  Downloading pypdf-3.17.2-py3-none-any.whl (277 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.9/277.9 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting InstructorEmbedding
  Downloading InstructorEmbedding-1.0.1-py2.py3-none-any.whl (19 kB)
Collecting streamli

In [10]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.llms import LlamaCpp

from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

## Defining the LLM

In [3]:
from langchain.llms import CTransformers


config = {'max_new_tokens': 1024, 'temperature': 0, 'context_length': 1024}
llm = CTransformers(model='TheBloke/Mistral-7B-Instruct-v0.1-GGUF',model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf", config=config, n_ctx=2048)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

mistral-7b-instruct-v0.1.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

## Loading PDF

In [23]:
file_path = "/content/drive/MyDrive/500+ Data Engineering Interview Questions & Answers.pdf"
loader = PyPDFLoader(file_path)

document = loader.load()

### Splitting Text

In [24]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=300)
texts = text_splitter.split_documents(document)

In [25]:
len(texts)

318

### Embedding

In [26]:
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large",
model_kwargs={'device': 'cuda:0'})

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


### Creating DataBase

In [27]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## Here is the nmew embeddings being used
embedding = instructor_embeddings

vectordb = Chroma.from_documents(documents=texts,
embedding=embedding,
persist_directory=persist_directory)

# persiste the db to disk
vectordb.persist()
vectordb = None

In [28]:
# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
embedding_function=embedding)

### Building a Retreiver

In [29]:
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

### Making a Chain

In [30]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=llm,
chain_type="stuff",
retriever=retriever,
return_source_documents=True)

In [36]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
# Split the input text into lines based on newline characters
 lines = text.split('\n')

# Wrap each line individually
 wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

# Join the wrapped lines back together using newline characters
 wrapped_text = '\n'.join(wrapped_lines)

 return wrapped_text


In [37]:
def process_llm_response(llm_response):
 print(wrap_text_preserve_newlines(llm_response['result']))
 print('\n\nSources:')
 for source in llm_response["source_documents"]:
  print(source.metadata['source'])

### Prompting

In [38]:
%%time

# example
query = "Difference between spark 1.6 and 2.x?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Number of tokens (1025) exceeded maximum context length (1024).
Number of tokens (1026) exceeded maximum context length (1024).
Number of tokens (1027) exceeded maximum context length (1024).
Number of tokens (1028) exceeded maximum context length (1024).
Number of tokens (1029) exceeded maximum context length (1024).


 Spark 1.6x has some performance issues which are corrected in Spark 2.x. The main differences between Spark
1.6x and 2.x are:

* SparkSession: In Spark 2.x, the SparkSession is a unified entry point for all Spark operations. It replaces
the previous entry points like SQLContext and HadoopContext.
* Faster analysis: Spark 2.x has improved performance for data processing tasks.
* Added SQL features: Spark 2.x includes new SQL features like window functions, JSON support, and more.
* MLib improvements: Spark 2.x has improved the Machine Learning Library (MLib) with new algorithms and
optimizations.
* New streaming module: Spark 2.x includes a new streaming module for real-time data processing.
* Unified dataset and data frame API's: In Spark 2.x, the dataset and data frame APIs are unified, making it
easier to work withstandar e


Sources:
/content/drive/MyDrive/500+ Data Engineering Interview Questions & Answers.pdf
/content/drive/MyDrive/500+ Data Engineering Interview Questions & Answ