In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
#sample documents
documents = [
    "Anime is great",
    "Where is hell have you been",
    "Anime is the greatest media ever created",
    "Movie is a media used for generation"
]



In [3]:
query = "Best media ever"

In [4]:
import re

def preprocess_text(text):
  """Normalize a string for TF-IDF matching.

  The text is lowercased, then every character that is neither a word
  character (letters, digits, underscore) nor whitespace is removed.
  Returns the cleaned string.
  """
  lowered = text.lower()
  return re.sub(r'[^\w\s]', '', lowered)



In [5]:
# Apply the same normalization to every sample document, preserving order.
preprocessed_doc = list(map(preprocess_text, documents))

In [6]:
preprocessed_doc

['anime is great',
 'where is hell have you been',
 'anime is the greatest media ever created',
 'movie is a media used for generation']

In [7]:
preprocessed_query = preprocess_text(query)

In [8]:
preprocessed_query

'best media ever'

In [9]:
vector = TfidfVectorizer()

In [10]:
X = vector.fit_transform(preprocessed_doc)

In [11]:
X.toarray()

array([[0.5728925 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.72664149, 0.        , 0.        , 0.        ,
        0.37919167, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.43551105, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.43551105, 0.43551105,
        0.22726773, 0.        , 0.        , 0.        , 0.        ,
        0.43551105, 0.43551105],
       [0.33570696, 0.        , 0.42580171, 0.42580171, 0.        ,
        0.        , 0.        , 0.42580171, 0.        , 0.        ,
        0.222201  , 0.33570696, 0.        , 0.42580171, 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.45203489,
        0.45203489, 0.        , 0.        , 0.        , 0.        ,
        0.23589056, 0.3563895 , 0.45203489, 0.        , 0.45203489,
        0.        , 0.        ]])

In [12]:
query_embedding = vector.transform([preprocessed_query])

In [13]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.78528828, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.6191303 , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

TF-IDF is used here instead of a Hugging Face transformer embedding model, so each text is represented by a sparse vector rather than a dense one.


In [14]:
similarities = cosine_similarity(X, query_embedding)

In [15]:
similarities

array([[0.        ],
       [0.        ],
       [0.54222344],
       [0.22065154]])

In [16]:
np.argsort(similarities, axis = 0)

array([[0],
       [1],
       [3],
       [2]])

In [17]:
# Document indices ordered from most to least similar to the query:
# sort ascending along the document axis, reverse, then flatten to 1-D.
ranked_indices = np.flip(np.argsort(similarities, axis=0), axis=0).flatten()

In [18]:
ranked_indices

array([2, 3, 1, 0])

In [19]:
ranked_documents = [documents[i] for i in ranked_indices]

In [20]:
ranked_documents

['Anime is the greatest media ever created',
 'Movie is a media used for generation',
 'Where is hell have you been',
 'Anime is great']

In [21]:
for i, doc in enumerate(ranked_documents):
  print(f"Rank {i} : {doc}")

Rank 0 : Anime is the greatest media ever created
Rank 1 : Movie is a media used for generation
Rank 2 : Where is hell have you been
Rank 3 : Anime is great


In [22]:
docs_path = "/content/Deep Learning Techniques for Time Series Forecasting_ A Comprehensive Guide _ by Huntress Elle _ Medium.pdf"

In [23]:
!pip install pypdf



In [24]:
!pip install langchain_community



In [25]:
from langchain_community.document_loaders import PyPDFLoader

In [26]:
loader = PyPDFLoader(docs_path)

In [27]:
docs = loader.load()

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [29]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap = 30)

In [30]:
chunks = splitter.split_documents(docs)

In [31]:
from langchain.embeddings import HuggingFaceEmbeddings

In [32]:
embeddings = HuggingFaceEmbeddings( model_name = "BAAI/bge-base-en-v1.5")

  embeddings = HuggingFaceEmbeddings( model_name = "BAAI/bge-base-en-v1.5")
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [33]:
!pip install chromadb

Collecting tokenizers<=0.20.3,>=0.13.2 (from chromadb)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.0
    Uninstalling tokenizers-0.21.0:
      Successfully uninstalled tokenizers-0.21.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.47.0 requires tokenizers<0.22,>=0.21, but you have tokenizers 0.20.3 which is incompatible.[0m[31m
[0mSuccessfully installed tokenizers-0.20.3


In [34]:
from langchain.vectorstores import Chroma

In [35]:
vectorstore = Chroma.from_documents(chunks, embeddings)

In [36]:
vectorstore_retriever = vectorstore.as_retriever(search_kwargs = {"k":3})

In [37]:
!pip install rank_bm25



In [38]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [39]:
keyword_retriver = BM25Retriever.from_documents(chunks)

Set how many top-ranked chunks the keyword (BM25) retriever fetches for each query, based on its similarity search.

In [40]:
keyword_retriver.k = 3

In [41]:
retriver = EnsembleRetriever(retrievers = [vectorstore_retriever, keyword_retriver], weights = [0.3,0.7])

In [42]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [43]:
# !pip install bitsandbytes

In [44]:
# !pip install accelerate

In [45]:
!pip install -U transformers accelerate bitsandbytes

Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.23 requires tokenizers<=0.20.3,>=0.13.2, but you have tokenizers 0.21.0 which is incompatible.[0m[31m
[0mSuccessfully installed tokenizers-0.21.0


In [46]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain import HuggingFacePipeline


In [47]:
def load_quantized_model(model_name:str):
  """Load a causal language model with 4-bit bitsandbytes quantization.

  Args:
    model_name: Identifier passed to `AutoModelForCausalLM.from_pretrained`.

  Returns:
    The model loaded with an NF4, double-quantized 4-bit configuration and
    bfloat16 as both the compute dtype and the requested torch dtype.
  """
  quant_cfg = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
  )
  return AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=quant_cfg,
      torch_dtype=torch.bfloat16,
  )




In [48]:
def initialize_tokenizer(model_name:str):
  """Load the tokenizer for `model_name` and force its BOS token id to 1.

  Args:
    model_name: Identifier passed to `AutoTokenizer.from_pretrained`.

  Returns:
    The tokenizer with `bos_token_id` overwritten to 1.
  """
  tok = AutoTokenizer.from_pretrained(model_name)
  # Hard-coded BOS id; presumably matches this model's vocabulary --
  # TODO confirm if `model_name` is ever changed.
  tok.bos_token_id = 1
  return tok

In [49]:
tokenizer = initialize_tokenizer(model_name)

In [50]:
model = load_quantized_model(model_name)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [51]:
# Build the text-generation pipeline around the already-loaded quantized
# model and its tokenizer.
#
# This cell rebinds the name `pipeline` to the resulting pipeline object
# (later cells read it under that name). Calling `pipeline(...)` directly
# therefore only works on the first execution: on a re-run the name would
# refer to the pipeline object instead of the transformers factory. The
# factory is imported under an alias so the cell can be re-executed safely.
from transformers import pipeline as hf_pipeline

pipeline = hf_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,   # upper bound on total sequence length
    do_sample=True,    # sampling is enabled, so outputs vary between runs
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

Device set to use cuda:0


In [52]:
llm = HuggingFacePipeline(pipeline=pipeline)

  llm = HuggingFacePipeline(pipeline=pipeline)


In [53]:
from langchain.chains import RetrievalQA

In [55]:
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore_retriever
)


In [57]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriver
)


In [61]:
response1 = normal_chain.invoke("How can CNN help in timeseries forecasting")

In [62]:
response1


{'query': 'How can CNN help in timeseries forecasting',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nConvolutional Neural Networks (CNNs): While predominantly known for image\nprocessing, CNNs have also been adapted for time series forecasting. They excel at\n\ncan be effectively utilized for accurate and efficient time series forecasting.\nThe Road Ahead for Deep Learning and Time Series Forecasting\n\nThe application of deep learning in time series forecasting has led to\ngroundbreaking advancements across various sectors. Let’s explore some real-world\n\nQuestion: How can CNN help in timeseries forecasting\nHelpful Answer: CNN, commonly known for its application in image processing, has shown promising results in timeseries forecasting as well. CNNs have the ability to extract features from time series data, which can then be used for accurate and effi

In [64]:
print(response1.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Convolutional Neural Networks (CNNs): While predominantly known for image
processing, CNNs have also been adapted for time series forecasting. They excel at

can be effectively utilized for accurate and efficient time series forecasting.
The Road Ahead for Deep Learning and Time Series Forecasting

The application of deep learning in time series forecasting has led to
groundbreaking advancements across various sectors. Let’s explore some real-world

Question: How can CNN help in timeseries forecasting
Helpful Answer: CNN, commonly known for its application in image processing, has shown promising results in timeseries forecasting as well. CNNs have the ability to extract features from time series data, which can then be used for accurate and efficient forecasting. This application of deep learning in time series forecasting 

In [65]:
response2 = hybrid_chain.invoke("How can CNN help in timeseries forecasting")

In [66]:
print(response2.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Overfitting: Deep learning models are prone to overfitting, especially when dealing
with large datasets. Regularization techniques, such as dropout and early stopping,
can help mitigate this issue.

can help mitigate this issue.
Best Practices:
1. Start with Simple Models: Begin with simpler models and gradually move to
more complex ones as needed. This approach helps in understanding the data

useful for analyzing timeseries with complex, layered structures, such as
electroencephalogram (EEG) signal analysis or predicting seismic activities.

Convolutional Neural Networks (CNNs): While predominantly known for image
processing, CNNs have also been adapted for time series forecasting. They excel at

can be effectively utilized for accurate and efficient time series forecasting.
The Road Ahead for Deep Learning and Time Series