In [1]:
# import necessary libraries
import pandas as pd
# Widen the per-cell text limit so the full sentences stay readable when frames are displayed.
pd.options.display.max_colwidth = 100

In [2]:
# Load the sample dataset into `df` and show its (rows, columns) shape.
# The filename must not end with a space: the space is part of the path, so
# "sample_text.csv " raises FileNotFoundError on POSIX filesystems.
df = pd.read_csv("sample_text.csv")
df.shape

(8, 2)

In [3]:
df

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


# Step 1: Create source embeddings for the `text` column

In [8]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding model; the same `encoder` is reused later to
# embed the search query so both sides live in the same vector space.
encoder = SentenceTransformer('all-mpnet-base-v2')

# Hand `encode` a plain list of strings rather than a pandas Series: a Series
# is indexed by label, so integer indexing inside the encoder only works while
# `df` keeps its default 0..n-1 index. A list keeps row order and always works.
vectors = encoder.encode(df["text"].tolist())


In [9]:
vectors.shape

(8, 768)

In [11]:
# Embedding width: the size of the last axis of the 2-D `vectors` matrix.
dim = vectors.shape[-1]
dim

768

# Step 2: Build a FAISS index for the vectors

In [12]:
import faiss
# Create a FAISS IndexFlatL2 for `dim`-dimensional vectors (dim is the
# embedding width taken from vectors.shape[1]); vectors are added in a later cell.
index = faiss.IndexFlatL2(dim)

In [None]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Step 3: Normalize the source vectors (as we are using L2 distance to measure similarity) and add them to the index

In [13]:
# Add the encoded text vectors to the index exactly as returned by encoder.encode.
# NOTE(review): no explicit normalization call appears in this cell or in any
# earlier one — confirm whether the encoder already returns unit-length vectors.
index.add(vectors)

In [14]:
index

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x00000286EE9FA3A0> >

# Step 4: Encode the search text using the same encoder and normalize the output vector

In [17]:
# Embed the query with the same `encoder` that produced the corpus vectors.
# Wrapping the string in a list gives a 2-D result with one row per query,
# which the shape check below makes visible.
search_query = "What is the best way to learn python?"
vec = encoder.encode([search_query])
vec.shape

(1, 768)

In [18]:
import numpy as np

# Copy the encoded query into an explicit single-row 2-D array
# (1 row, as many columns as needed) for the index search.
svec = np.reshape(np.array(vec), (1, -1))
svec.shape

(1, 768)

# Step 5: Search the FAISS index for the vectors most similar to the query

In [27]:
# Look up the k=2 stored vectors closest to the query vector. `search` returns
# a (distances, ids) pair; each has one row per query and k columns.
# `I` holds the ids of the matching vectors and is used by the cells below.
# NOTE(review): FAISS documents IndexFlatL2 distances as squared L2 — confirm
# before interpreting the magnitudes.
distances, I = index.search(svec, k=2)
distances

array([[1.8346734, 1.864798 ]], dtype=float32)

In [28]:
I

array([[5, 6]], dtype=int64)

In [31]:
I.tolist()

[[5, 6]]

In [32]:
# Ids of the nearest neighbours for the single query (first row of `I`),
# converted to a plain Python list.
row_indices = I[0].tolist()
row_indices

[5, 6]

In [29]:
# FAISS ids are the positions of the vectors in the order they were added,
# i.e. row *positions* of `df`, not index labels — so select with .iloc.
# (.loc only matches while `df` keeps its default 0..n-1 index.)
# FAISS pads with -1 when fewer than k neighbours exist; drop those ids so
# .iloc does not silently return the last row for them.
df.iloc[I[0][I[0] >= 0]]

Unnamed: 0,text,category
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
6,Exciting vacation destinations for your next trip,Travel


In [30]:
search_query

'What is the best way to learn python?'

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader

# Read data.csv through the loader, using the "prompt" column as each
# document's source, then report how many documents were produced.
loader = CSVLoader(file_path="data.csv", source_column="prompt")
data = loader.load()
len(data)

In [None]:
data[0].metadata

In [None]:
from langchain.document_loaders import SeleniumURLLoader

In [None]:
from langchain.document_loaders import UnstructuredURLLoader


In [None]:
# Point a URL loader at two plain-text files hosted on gutenberg.org.
# Only the loader object is built here; this cell does not call .load().
loader = UnstructuredURLLoader(
    urls=[
        "https://www.gutenberg.org/files/1342/1342-0.txt",
        "https://www.gutenberg.org/files/11/11-0.txt",
    ]
)

In [None]:
# Article URLs to load. These must live in ONE list: without the brackets the
# trailing comma turns `urls` into a 1-tuple holding only the first URL, and
# the second URL becomes a separate expression whose value is thrown away.
urls = [
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-549-lakh-7500001.html",
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises--as-tesla-soars-on-ai-optimism-11351111.html",
]
