# WebScraper and Vector Database

## Import Dependencies

In [None]:
import os
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
import PyPDF2
from langchain.schema import Document  # Import the Document class
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Semantic-search demo: embed every row of the CSV's 'text' column, index the
# vectors with FAISS, and print the sentences closest to a query sentence.

# Path to the CSV file
output_file_path = "/data/output.csv"

# Load the CSV file
df = pd.read_csv(output_file_path)

# Ensure the CSV file has a column named 'text'
if 'text' not in df.columns:
    raise ValueError("CSV file must have a 'text' column")

# Extract sentences from the CSV file. pandas reads empty cells as NaN (a
# float), which the sentence encoder cannot tokenize, so drop those rows and
# coerce everything else to str.
sentences = df['text'].dropna().astype(str).tolist()
if not sentences:
    # Without at least one sentence there is no embedding dimension to build
    # the index from, so fail with a clear message instead of an IndexError.
    raise ValueError("CSV file has no non-empty rows in the 'text' column")

# Load a pre-trained sentence embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose any model you prefer

# Generate embeddings for the sentences (FAISS requires float32 input)
embeddings = model.encode(sentences).astype('float32')

# Create a FAISS index sized to the embedding dimension
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance
index.add(embeddings)  # Add embeddings to the index

# Generate embedding for your query sentence
query_sentence = "What is a dialogue dataset?"
query_embedding = model.encode(query_sentence).astype('float32').reshape(1, -1)  # Reshape to 2D array

# Number of nearest neighbors to search for (change the 3 to any number you
# want). Clamp to the number of indexed sentences: when fewer than k vectors
# exist, FAISS pads the result with index -1, and sentences[-1] would then be
# reported as a match by mistake.
k = min(3, len(sentences))

# Search the index
distances, indices = index.search(query_embedding, k)

# Display the results
print("Query Sentence:", query_sentence)
print("Indices of nearest neighbors:", indices)
print("Distances to nearest neighbors:", distances)

# Retrieve and print the most similar sentences (row 0 = the single query)
for rank, (neighbor_idx, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    print(f"Similar Sentence {rank}: {sentences[neighbor_idx]} (Distance: {distance})")

In [None]:
# Sanity check: show the first sentence that was loaded from the CSV.
first_sentence = sentences[0]
print(first_sentence)

## Download PDFs from a single webpage

In [2]:
# Download every PDF linked from a single web page into a local folder.
from urllib.parse import urljoin

# BeautifulSoup is used below but is not imported in the notebook's import
# cell, so bring it into scope here.
from bs4 import BeautifulSoup

# Create a directory for PDFs if it doesn't exist
pdf_dir = 'pdfs'
os.makedirs(pdf_dir, exist_ok=True)

url = 'https://paperswithcode.com/paper/learning-to-memorize-entailment-and-discourse'  # Replace with the target URL
request_timeout = 30  # seconds; requests waits indefinitely when no timeout is given

response = requests.get(url, timeout=request_timeout)
response.raise_for_status()  # Do not try to parse an HTTP error page

soup = BeautifulSoup(response.content, 'html.parser')

# Collect links ending in '.pdf'. urljoin resolves relative and
# protocol-relative hrefs against the page URL (plain concatenation would
# append them to the page's own path). dict.fromkeys drops duplicate links
# while keeping their first-seen order.
pdf_links = list(dict.fromkeys(
    urljoin(url, link['href'])
    for link in soup.find_all('a', href=True)
    if link['href'].endswith('.pdf')
))

downloaded_count = 0
for pdf_link in pdf_links:
    try:
        pdf_response = requests.get(pdf_link, timeout=request_timeout)
        pdf_response.raise_for_status()
    except requests.RequestException as exc:
        # Skip this link rather than saving an error page as a '.pdf' file.
        print(f"Skipping {pdf_link}: {exc}")
        continue

    pdf_name = pdf_link.split('/')[-1]  # Get the file name from the URL

    # Save the PDF in the 'pdfs' directory
    pdf_path = os.path.join(pdf_dir, pdf_name)

    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(pdf_response.content)
    downloaded_count += 1

# Report the number of files actually written, not the number of links found.
print(f"Downloaded {downloaded_count} PDF(s) into the '{pdf_dir}' folder.")

Downloaded 1 PDF(s) into the 'pdfs' folder.


## Text Splitter

In [4]:
def split_into_chunks(documents, chunk_size=1000, chunk_overlap=300):
    """Split documents into smaller, overlapping pieces for processing.

    Args:
        documents: The documents to split; passed unchanged to the
            splitter's ``split_documents`` method.
        chunk_size: Maximum size of each text chunk. Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks, kept for context
            continuity. Defaults to 300.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``,
        i.e. the chunked documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        is_separator_regex=False,  # Disable regex-based splitting
    )
    return splitter.split_documents(documents)

## Extract Text From PDF

In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    The text extracted from each page is followed by a newline so that
    consecutive pages stay separated in the returned string.
    """
    with open(pdf_path, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extract while the file is still open; the reader works on the stream.
        page_texts = [page.extract_text() + "\n" for page in pdf_reader.pages]
    return "".join(page_texts)

## Split text into chunks

In [6]:
# Build one Document per PDF found in the folder, then split them into chunks.

# Specify the path to the folder containing PDF files
pdf_folder_path = "pdfs"  # The folder containing your PDF files

# List to hold all extracted text as Document objects
all_documents = []

# Iterate through all PDF files in the folder. os.listdir returns entries in
# arbitrary order, so sort them to keep document and chunk order reproducible
# from run to run.
for filename in sorted(os.listdir(pdf_folder_path)):
    if not filename.endswith(".pdf"):
        continue  # Ignore anything that is not a PDF

    pdf_path = os.path.join(pdf_folder_path, filename)
    pdf_text = extract_text_from_pdf(pdf_path)

    # Create a Document object, remembering which file the text came from
    document = Document(page_content=pdf_text, metadata={"filename": filename})
    all_documents.append(document)

# Split the documents into chunks
text_chunks = split_into_chunks(all_documents)

# Print the first few chunks
for i, chunk in enumerate(text_chunks[:3]):  # Print first 3 chunks
    print(f"Chunk {i + 1}:\n{chunk}\n")

Chunk 1:
page_content='Learning to Memorize Entailment and Discourse Relations for Persona-Consistent
Dialogues
Ruijun Chen1, Jin Wang1*, Liang-Chih Yu2and Xuejie Zhang1
1School of Information Science and Engineering, Yunnan University, Yunnan, China
2Department of Information Management, Yuan Ze University, Taiwan
chenrj@mail.ynu.edu.cn, wangjin@ynu.edu.cn, lcyu@saturn.yzu.edu.tw, xjzhang@ynu.edu.cn
Abstract
Maintaining engagement and consistency is particularly im-
portant in dialogue systems. Existing works have improved
the performance of dialogue systems by intentionally learn-
ing interlocutor personas with sophisticated network struc-
tures. One issue with this approach is that it requires more
personal corpora with annotations. Additionally, these mod-
els typically perform the next utterance prediction to gener-
ate a response but neglect the discourse coherence in the en-
tire conversation. To address these issues, this study proposes
a method of learning to memorize entailme

## Setting Up Vector Database

In [7]:
def create_embedding_model():
    """Build and return the HuggingFace text-embedding model."""
    # Name of the sentence-transformers checkpoint to load
    embedding_model_name = "sentence-transformers/all-MiniLM-L12-v2"
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    return embedding_model

In [8]:
def setup_vector_database(database_path, collection_name="PapersWithCode"):
    """Connect to the vector database and report whether the collection exists.

    Args:
        database_path: Location of the database, passed to
            ``connections.connect`` as ``uri``. May be a local file path or a
            URI containing ``://``.
        collection_name: Name of the collection to look for. Defaults to
            ``"PapersWithCode"``.

    Returns:
        True if the collection already exists, False otherwise (a notice is
        printed in that case).
    """
    # NOTE(review): `connections` and `utility` are presumably the pymilvus
    # helpers imported elsewhere in the notebook -- confirm.

    # Create the parent directory for the database if it doesn't exist.
    # Skip this when the path has no directory component (os.makedirs("")
    # raises) or when it is a URI such as "http://host:port", which would
    # otherwise create a stray local "http:" directory.
    parent_dir = os.path.dirname(database_path)
    if parent_dir and "://" not in database_path:
        os.makedirs(parent_dir, exist_ok=True)

    # Connect to the Milvus database using the provided path
    connections.connect(
        alias="default",        # Use the default connection alias
        uri=database_path       # Specify the database location
    )

    # Check if the collection exists
    if not utility.has_collection(collection_name):
        print("Collection does not exist. You'll need to create it later.")
        return False
    return True