In [1]:
from langchain.llms import GooglePalm

In [2]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import requests
from bs4 import BeautifulSoup

In [3]:
import pandas as pd

In [4]:
with open("google_key.txt", "r") as f:
    api_key=f.read()

In [5]:
llm=GooglePalm(google_api_key=api_key, temperature=0.7)

In [6]:
urls=["https://www.livemint.com/news/india/uttarkashi-tunnel-collapse-silkyara-ndrf-deploys-team-rescue-mission-41-workers-top-10-updates-pushkar-singh-dhami-11700703183514.html",
     "https://timesofindia.indiatimes.com/city/kochi/centre-calls-health-meet-as-covid-rises-in-kerala-10-deaths-this-month/articleshow/106106851.cms"]
responses=[]

for url in urls:
    response=requests.get(url)
    if response.status_code==200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        responses.append((text, response.url))
        
    else:
        print(f"Failed to retrieve the URL. Url={url} Status_code :{response.status_code}")

In [7]:
len(responses)

2

In [8]:
import string 
punct=string.punctuation
seps=[]
for i in punct:
    seps.append(i)

In [9]:
text_splitter=RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", "?","." ]+seps+[" "],
    chunk_size=1000,
    chunk_overlap=200
)

In [10]:
splitted_tuple=[]
for i in responses:
    splitted_text= text_splitter.split_text(i[0])
    for j in splitted_text:
        splitted_tuple.append((j, i[1]))

In [11]:
data_dict=[{"chunks": content, "url": url} for content, url in splitted_tuple]

In [12]:
df = pd.DataFrame(data_dict)

In [13]:
def filter_chunks(text):
    text=text.lower().strip()
    text=text.split()
    words=[]
    for i in text:
        if i not in seps:
            word="".join([j for j in i if j.isalnum()])
            words.append(word)
    sentence=" ".join(words)
    if(len(sentence)>0):
        return sentence  

In [14]:
filter_chunks("add! .")

'add'

In [15]:
df["chunks"]=df["chunks"].apply(filter_chunks)

In [16]:
csv_path='dataset.csv'
if os.path.exists(csv_path):
    os.remove(csv_path)

In [17]:
df.to_csv(csv_path, index=False)

In [18]:
csv_data=pd.read_csv("dataset.csv",encoding='latin1')

In [19]:
csv_data.dropna()

Unnamed: 0,chunks,url
0,uttarkashi tunnel collapse ndrf team enters tu...,https://www.livemint.com/news/india/uttarkashi...
1,explore sign in epaper subscribe thursday 11 j...,https://www.livemint.com/news/india/uttarkashi...
2,top sections news india news world news econom...,https://www.livemint.com/news/india/uttarkashi...
3,a opinion markets stock markets commodity news...,https://www.livemint.com/news/india/uttarkashi...
4,multimedia collections videos webstories photo...,https://www.livemint.com/news/india/uttarkashi...
6,gainers losers top gainers top losers indusind...,https://www.livemint.com/news/india/uttarkashi...
7,back share via uttarkashi tunnel collapse ndrf...,https://www.livemint.com/news/india/uttarkashi...
8,here are the top ten updates on the uttarkashi...,https://www.livemint.com/news/india/uttarkashi...
9,3 the dedicated freight corridor corporation o...,https://www.livemint.com/news/india/uttarkashi...
10,5 after conversing with the workers their fami...,https://www.livemint.com/news/india/uttarkashi...


In [20]:
from langchain.document_loaders.csv_loader import CSVLoader

In [21]:
loader=CSVLoader(file_path=csv_path, source_column='chunks')

In [22]:
data=loader.load()

In [23]:
data

[Document(page_content='chunks: uttarkashi tunnel collapse ndrf team enters tunnel to save 41 trapped workers with ambulances on standby 10 updates mint\nurl: https://www.livemint.com/news/india/uttarkashi-tunnel-collapse-silkyara-ndrf-deploys-team-rescue-mission-41-workers-top-10-updates-pushkar-singh-dhami-11700703183514.html', metadata={'source': 'uttarkashi tunnel collapse ndrf team enters tunnel to save 41 trapped workers with ambulances on standby 10 updates mint', 'row': 0}),
 Document(page_content='chunks: explore sign in epaper subscribe thursday 11 january 2024 stocks mutual funds news home budget2024 news markets premium money mutual fund industry companies technology web stories opinion videos all companies technology markets money mutual funds insurance auto industry personal finance hello user sign in sign out my account my account subscribe my watchlist newsletters notifications my reads for you view less view more data insights market dashboard bullion gold silver fuel 

In [None]:
import faiss
import numpy as np

# Assuming 'data' is your input data and 'embeddings' are the corresponding vectors

# Flatten the data and make sure it's in float32 format
flat_data = np.array(data).astype('float32')
flat_embeddings = np.array(embeddings).astype('float32')

# Instantiate an index
index = faiss.IndexFlatL2(flat_embeddings.shape[1])  # L2 distance index

# Add vectors to the index
index.add(flat_embeddings)

# Now, 'index' is ready for similarity searches

In [29]:
embeddings= OpenAIEmbeddings()

vectors=FAISS.from_documents(data, embeddings)

2024-01-11 11:10:42.280 INFO    openai: error_code=insufficient_quota error_message='You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.' error_param=None error_type=insufficient_quota message='OpenAI API error received' stream_error=False
2024-01-11 11:10:46.991 INFO    openai: error_code=insufficient_quota error_message='You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.' error_param=None error_type=insufficient_quota message='OpenAI API error received' stream_error=False
2024-01-11 11:10:51.497 INFO    openai: error_code=insufficient_quota error_message='You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com

RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [26]:
import pickle

In [28]:
file_path='vector_index.pkl'
if os.path.exists(csv_path):
    os.remove(csv_path)
with open(file_path, "wb") as f:
    pickle.dump(vectors, f)

NameError: name 'vectors' is not defined