In [53]:
from langchain.llms import GooglePalm

In [54]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import requests
from bs4 import BeautifulSoup

In [55]:
import pandas as pd

In [56]:
with open("google_key.txt", "r") as f:
    api_key=f.read()

In [57]:
llm=GooglePalm(google_api_key=api_key, temperature=0.7)

In [58]:
urls=["https://www.livemint.com/news/india/uttarkashi-tunnel-collapse-silkyara-ndrf-deploys-team-rescue-mission-41-workers-top-10-updates-pushkar-singh-dhami-11700703183514.html",
     "https://timesofindia.indiatimes.com/city/kochi/centre-calls-health-meet-as-covid-rises-in-kerala-10-deaths-this-month/articleshow/106106851.cms"]
responses=[]

for url in urls:
    response=requests.get(url)
    if response.status_code==200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        responses.append((text, response.url))
        
    else:
        print(f"Failed to retrieve the URL. Url={url} Status_code :{response.status_code}")

In [59]:
len(responses)

2

In [60]:
import string 
punct=string.punctuation
seps=[]
for i in punct:
    seps.append(i)

In [61]:
text_splitter=RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", "?","." ]+seps+[" "],
    chunk_size=1000,
    chunk_overlap=200
)

In [62]:
splitted_tuple=[]
for i in responses:
    splitted_text= text_splitter.split_text(i[0])
    for j in splitted_text:
        splitted_tuple.append((j, i[1]))

In [63]:
data_dict=[{"chunks": content, "url": url} for content, url in splitted_tuple]

In [64]:
df = pd.DataFrame(data_dict)

In [65]:
def filter_chunks(text):
    text=text.lower().strip()
    text=text.split()
    words=[]
    for i in text:
        if i not in seps:
            word="".join([j for j in i if j.isalnum()])
            words.append(word)
    sentence=" ".join(words)
    if(len(sentence)>0):
        return sentence  

In [66]:
filter_chunks("add! .")

'add'

In [67]:
df["chunks"]=df["chunks"].apply(filter_chunks)

In [68]:
csv_path='dataset.csv'
if os.path.exists(csv_path):
    os.remove(csv_path)

In [69]:
df.to_csv(csv_path, index=False)

In [70]:
csv_data=pd.read_csv("dataset.csv",encoding='latin1')

In [71]:
csv_data.dropna()

Unnamed: 0,chunks,url
0,uttarkashi tunnel collapse ndrf team enters tu...,https://www.livemint.com/news/india/uttarkashi...
1,explore sign in epaper subscribe tuesday 2 jan...,https://www.livemint.com/news/india/uttarkashi...
2,top sections news india news world news econom...,https://www.livemint.com/news/india/uttarkashi...
3,a opinion markets stock markets commodity news...,https://www.livemint.com/news/india/uttarkashi...
4,multimedia collections videos webstories photo...,https://www.livemint.com/news/india/uttarkashi...
6,gainers losers top gainers top losers sun phar...,https://www.livemint.com/news/india/uttarkashi...
7,business news news india uttarkashi tunnel col...,https://www.livemint.com/news/india/uttarkashi...
8,a 15member ndrf team led by a commandant has b...,https://www.livemint.com/news/india/uttarkashi...
9,2 medical professionals have been stationed wi...,https://www.livemint.com/news/india/uttarkashi...
10,3 the dedicated freight corridor corporation o...,https://www.livemint.com/news/india/uttarkashi...


In [72]:
from langchain.document_loaders.csv_loader import CSVLoader

In [73]:
loader=CSVLoader(file_path=csv_path, source_column='chunks')

In [74]:
data=loader.load()

In [75]:
data

[Document(page_content='chunks: uttarkashi tunnel collapse ndrf team enters tunnel to save 41 trapped workers with ambulances on standby 10 updates mint\nurl: https://www.livemint.com/news/india/uttarkashi-tunnel-collapse-silkyara-ndrf-deploys-team-rescue-mission-41-workers-top-10-updates-pushkar-singh-dhami-11700703183514.html', metadata={'source': 'uttarkashi tunnel collapse ndrf team enters tunnel to save 41 trapped workers with ambulances on standby 10 updates mint', 'row': 0}),
 Document(page_content='chunks: explore sign in epaper subscribe tuesday 2 january 2024 stocks mutual funds news home latest news markets premium money mutual fund industry companies technology web stories opinion videos all companies technology markets money mutual funds insurance auto industry personal finance hello user sign in sign out my account my account subscribe my watchlist newsletters notifications my reads for you view less view more data insights market dashboard bullion gold silver fuel petrol