In [1]:
import os
import re
from collections import deque
from urllib.parse import urldefrag, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

def clean_text(text):
    """
    Normalize whitespace in scraped text while keeping line breaks.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. Collapse each run of horizontal whitespace (anything matched by
         ``\\s`` except a newline) into a single space.
      3. Drop spaces that sit directly next to a newline.
      4. Collapse each run of newlines into exactly one blank line.
      5. Collapse repeated "info@bethelfurniture.com" occurrences (each
         followed by whitespace) into one occurrence plus a newline.

    Returns the cleaned string; an empty input yields an empty string.
    """
    text = text.strip()
    # Only horizontal whitespace is collapsed here. Matching plain `\s+`
    # would also swallow newlines and make the newline rule below a no-op.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Remove the single spaces left hugging a line break.
    text = re.sub(r' *\n *', '\n', text)
    text = re.sub(r'\n+', '\n\n', text)
    text = re.sub(r'(info@bethelfurniture\.com\s+)+', r'info@bethelfurniture.com\n', text)
    return text

def extract_links(soup, base_url):
    """
    Extract all same-site page links from a BeautifulSoup object.

    Parameters:
        soup: parsed document; only ``soup.find_all("a", href=True)`` is used.
        base_url: site root, e.g. "https://bethelfurniture.com".

    Returns:
        A set of absolute URLs, with any ``#fragment`` removed, that start
        with ``base_url`` and have exactly the same scheme and host as it.

    Hrefs beginning with "/" are resolved against ``base_url``. Relative
    hrefs without a leading slash, and non-HTTP schemes such as ``mailto:``,
    never match ``base_url`` and are therefore left out.
    """
    from urllib.parse import urldefrag, urljoin, urlsplit

    root = urlsplit(base_url)
    links = set()
    for anchor in soup.find_all("a", href=True):
        href = anchor['href'].strip()
        if href.startswith('/'):
            # urljoin resolves root-relative paths and also protocol-relative
            # "//host/path" hrefs, which plain concatenation would turn into
            # a bogus same-site URL.
            href = urljoin(base_url, href)
        # "/page" and "/page#section" are the same document to a crawler.
        href, _fragment = urldefrag(href)
        target = urlsplit(href)
        # A bare prefix test would accept look-alike hosts such as
        # "https://bethelfurniture.com.example.org", so the host is compared too.
        same_site = (target.scheme, target.netloc) == (root.scheme, root.netloc)
        if same_site and href.startswith(base_url):
            links.add(href)
    return links

def fetch_and_clean_page(url, headers, timeout=30):
    """
    Fetch a page, parse its content, and clean the text.

    Parameters:
        url: address to download.
        headers: HTTP headers passed to ``requests.get``.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        A ``(text_content, soup)`` tuple. ``soup`` is the parsed document
        after its script/style/video/img elements have been removed;
        ``text_content`` is its text run through ``clean_text``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx response (via ``raise_for_status``).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Remove elements that carry no readable text before extracting it.
    for element in soup(["script", "style", "video", "img"]):
        element.decompose()

    # clean_text strips the ends itself, so no separate .strip() is needed.
    text_content = clean_text(soup.get_text(separator="\n"))

    return text_content, soup

# Function to read WhatsApp chat files and return combined content
def read_whatsapp_chats(chat_files):
    """
    Read each chat export and return them joined into one string.

    Every readable file contributes a section made of a "Content from ..."
    header, a line of 20 "=" characters, a blank line, the file's stripped
    text, and a trailing blank line. Missing files are reported on stdout
    and skipped. Sections are joined with a newline; with no readable files
    the result is an empty string.
    """
    sections = []
    for chat_file in chat_files:
        try:
            with open(chat_file, 'r', encoding='utf-8') as handle:
                body = handle.read().strip()
        except FileNotFoundError:
            print(f"File {chat_file} not found. Skipping...")
            continue
        header = f"Content from {chat_file}:\n" + "="*20 + '\n\n'
        sections.append(header + body + '\n\n')
    return "\n".join(sections)

# Main URL to start scraping
base_url = "https://bethelfurniture.com"

# Set to keep track of visited URLs
visited_urls = set()

# List to gather all content
all_content = []

# Define headers to mimic a web browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}

# FIFO queue of URLs to visit. A deque plus sorted enqueueing gives a
# breadth-first crawl whose order depends only on the links found, so the
# sections of the output file come out in the same order on every run
# (popping from a set visits pages in arbitrary order).
urls_to_visit = deque([base_url])

# Every URL that has ever been enqueued, so each page is queued at most once.
queued_urls = {base_url}

while urls_to_visit:
    current_url = urls_to_visit.popleft()
    visited_urls.add(current_url)

    try:
        page_content, page_soup = fetch_and_clean_page(current_url, headers)
        all_content.append(f"Content from {current_url}:\n" + "="*20 + '\n\n' + page_content + '\n\n')

        # Extract new links and add to the queue
        new_links = extract_links(page_soup, base_url)
        for link in sorted(new_links - queued_urls):
            queued_urls.add(link)
            urls_to_visit.append(link)
    except requests.RequestException as e:
        # A page that cannot be fetched is reported and skipped; the crawl continues.
        print(f"Failed to fetch {current_url}: {e}")

# Combine all the web scraping content
combined_content = "\n".join(all_content)

# WhatsApp chat files
whatsapp_chat_files = ["WhatsApp Chat 1.txt", "WhatsApp Chat 2.txt", "WhatsApp Chat 3.txt", "WhatsApp Chat 4.txt", "WhatsApp Chat 5.txt"]

# Append WhatsApp chats content to the scraped content
whatsapp_content = read_whatsapp_chats(whatsapp_chat_files)
combined_content += "\n" + whatsapp_content

# Define the path to save the combined content
file_path = "recursive_extracted_content.txt"

# Write the combined text to a file
with open(file_path, "w", encoding="utf-8") as file:
    file.write(combined_content)

print(f"Content successfully scraped and saved to {file_path}")


Failed to fetch https://bethelfurniture.com/store/: 404 Client Error: Not Found for url: https://bethelfurniture.com/store/
Content successfully scraped and saved to recursive_extracted_content.txt


## Create and Delete Index

In [1]:
import os

from pinecone import ServerlessSpec
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone

# Initialize the Pinecone client.
# The API key is read from the PINECONE_API_KEY environment variable instead
# of being embedded in the notebook, where it would leak to anyone who can
# read the file or its history. A missing variable raises KeyError here.
# NOTE(review): the key that was previously hard-coded in this cell should be
# treated as compromised and rotated.
client = Pinecone(api_key=os.environ["PINECONE_API_KEY"], environment="us-east-1")

# Define the index name
index_name = "bethel"

# Check if the index already exists, if not, create it
if index_name not in client.list_indexes().names():
    client.create_index(
        index_name,
        # NOTE(review): presumably the output size of the embedding model
        # used with this index — confirm against the model actually loaded.
        dimension=768,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

  from tqdm.autonotebook import tqdm


In [5]:

import os

# Initialize the Pinecone client; the API key comes from the
# PINECONE_API_KEY environment variable rather than the notebook source.
# Relies on `Pinecone` and `index_name` from the index-creation cell.
client = Pinecone(api_key=os.environ["PINECONE_API_KEY"], environment="us-east-1")

# Delete the index only if it exists, so re-running this cell after the
# index is already gone does not raise.
if index_name in client.list_indexes().names():
    client.delete_index(index_name)