In [1]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import requests
from bs4 import BeautifulSoup

In [6]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
with open("google_key.txt", "r") as f:
    api_key=f.read()

In [7]:
from langchain.llms import GooglePalm

In [4]:
llm= OpenAI(temperature =0.9 , max_tokens=500)

ValidationError: 1 validation error for OpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)

In [4]:
urls=["https://www.livemint.com/news/india/uttarkashi-tunnel-collapse-silkyara-ndrf-deploys-team-rescue-mission-41-workers-top-10-updates-pushkar-singh-dhami-11700703183514.html",
     "https://www.indiatvnews.com/sports/cricket/india-s-schedule-after-world-cup-2023-final-loss-india-vs-australia-t20i-series-t20-world-cup-2024-2023-11-20-903594"]
responses=[]

for url in urls:
    response=requests.get(url)
    if response.status_code==200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        responses.append((text, response.url))
        
    else:
        print(f"Failed to retrieve the URL. Url={url} Status_code :{response.status_code}")

In [5]:
len(responses)

2

In [6]:
import string 
punct=string.punctuation
seps=[]
for i in punct:
    seps.append(i)

In [7]:
text_splitter=RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", "?","." ]+seps+[" "],
    chunk_size=1000,
    chunk_overlap=200
)

In [8]:
splitted_tuple=[]
for i in responses:
    splitted_text= text_splitter.split_text(i[0])
    for j in splitted_text:
        splitted_tuple.append((j, i[1]))

In [9]:
data_dict=[{"chunks": content, "url": url} for content, url in splitted_tuple]

In [10]:
data_dict[0]

{'chunks': 'Uttarkashi Tunnel Collapse: NDRF team enters tunnel to save 41 trapped workers with ambulances on standby | 10 updates | Mint',
 'url': 'https://www.livemint.com/news/india/uttarkashi-tunnel-collapse-silkyara-ndrf-deploys-team-rescue-mission-41-workers-top-10-updates-pushkar-singh-dhami-11700703183514.html'}

In [11]:
df = pd.DataFrame(data_dict)

In [12]:
from sentence_transformers import SentenceTransformer

In [13]:
encoder= SentenceTransformer("all-mpnet-base-v2")
encoded_data=encoder.encode(df.chunks)

2023-11-30 11:16:48.262 INFO    sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: all-mpnet-base-v2


Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

2023-11-30 11:17:36.467 INFO    sentence_transformers.SentenceTransformer: Use pytorch device: cpu


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
dim=encoded_data.shape[1]
dim

768

In [18]:
import faiss
index=faiss.IndexFlatL2(dim)
index

2023-11-30 11:23:08.289 INFO    faiss.loader: Loading faiss with AVX2 support.
2023-11-30 11:23:08.290 INFO    faiss.loader: Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2023-11-30 11:23:08.292 INFO    faiss.loader: Loading faiss.
2023-11-30 11:23:08.654 INFO    faiss.loader: Successfully loaded faiss.


<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002C15B4FFDB0> >

In [19]:
index.add(encoded_data)

In [20]:
search_query="what is the number of member of NDRF team?"

vec=encoder.encode(search_query)
vec.shape

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(768,)

In [22]:
import numpy as np

svec=np.array(vec).reshape(1, -1)
svec.shape

(1, 768)

In [23]:
distances, I =index.search(svec , k=2)

In [25]:
I

array([[0, 7]], dtype=int64)

In [27]:
df.loc[I[0]].chunks

0    Uttarkashi Tunnel Collapse: NDRF team enters tunnel to save 41 trapped workers with ambulances o...
7    Back\n\n\n\n\nShare Via\n\n\n\n\n\n\n\n\n \n\n\n\n\nUttarkashi Tunnel Collapse: NDRF team enters...
Name: chunks, dtype: object

In [31]:
index_with_id = faiss.IndexIDMap(index)

RuntimeError: Error in __cdecl faiss::IndexIDMapTemplate<struct faiss::Index>::IndexIDMapTemplate(struct faiss::Index *) at D:\a\faiss-wheels\faiss-wheels\faiss\faiss\IndexIDMap.cpp:32: Error: 'index->ntotal == 0' failed: index must be empty on input

In [32]:
chain=RetrievalQAWithSourcesChain.from_llm(llm=llm , retriever=index)

ValidationError: 1 validation error for RetrievalQAWithSourcesChain
retriever
  value is not a valid dict (type=type_error.dict)