In [None]:
# Automatically scrape the given URLs and upload the results to an AWS S3 bucket as JSON files.

In [7]:
import requests
from bs4 import BeautifulSoup
import json
import gradio as gr
import re
import os
import boto3
from botocore.exceptions import NoCredentialsError

# AWS S3 Configuration
# Values are read from the environment so that real credentials never have to
# be written into the notebook. The bucket and region fall back to the original
# placeholder strings; the two keys fall back to None, in which case boto3 uses
# its own standard credential lookup instead of a hardcoded value.
S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME", "your-s3-bucket-name")
S3_REGION_NAME = os.environ.get("AWS_DEFAULT_REGION", "your-region")
S3_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
S3_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

def upload_to_s3(file_path, bucket_name, object_name=None):
    """Upload a local file to an S3 bucket and describe the outcome as text.

    Args:
        file_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        object_name: Key to store the file under; when None the base name of
            ``file_path`` is used.

    Returns:
        A success message containing the ``s3://bucket/key`` location,
        ``"Credentials not available"`` when boto3 raises NoCredentialsError,
        or ``"Error uploading to S3: ..."`` for any other exception.
    """
    try:
        # The client is configured from the module-level S3_* settings.
        client_options = {
            "region_name": S3_REGION_NAME,
            "aws_access_key_id": S3_ACCESS_KEY,
            "aws_secret_access_key": S3_SECRET_KEY,
        }
        client = boto3.client('s3', **client_options)

        key = os.path.basename(file_path) if object_name is None else object_name

        client.upload_file(file_path, bucket_name, key)
        return f"File uploaded to S3: s3://{bucket_name}/{key}"
    except NoCredentialsError:
        return "Credentials not available"
    except Exception as e:
        return f"Error uploading to S3: {e}"

def scrape_website(url, timeout=30):
    """Fetch a web page and return its headed sections as a JSON string.

    The page is parsed with BeautifulSoup's ``html.parser`` and its ``h1``-``h6``
    and ``p`` tags are visited in document order. Consecutive paragraphs are
    joined with single spaces into one ``body``. The last ``h1``/``h2``/``h3``
    seen since the previous record (if any) becomes that record's ``heading``.
    ``h4``-``h6`` tags end the current run of paragraphs but are never used as
    headings, and a heading with no paragraph after it produces no record.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the caller indefinitely.

    Returns:
        A pretty-printed JSON array of objects with an optional ``"heading"``
        key and a ``"body"`` key, or the string ``"Error fetching URL: ..."``
        when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error fetching URL: {e}"

    soup = BeautifulSoup(response.text, "html.parser")

    sections = []
    section = {}      # record under construction; may already hold a "heading"
    paragraphs = []   # paragraph texts seen since the last heading tag

    def close_section():
        # Emit the pending record; every paragraph contributes its text exactly
        # once, so no body contains duplicated content.
        if paragraphs:
            section["body"] = " ".join(paragraphs).strip()
            sections.append(dict(section))
            section.clear()
            paragraphs.clear()

    for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"]):
        text = tag.get_text(strip=True)
        if tag.name == "p":
            paragraphs.append(text)
            continue
        # Any heading level terminates the current run of paragraphs.
        close_section()
        if tag.name in ("h1", "h2", "h3"):
            section["heading"] = text
    close_section()

    return json.dumps(sections, ensure_ascii=False, indent=4)

def sanitize_filename(url):
    """Turn a URL into a flat ``.json`` file name.

    Every character that is not an ASCII letter or digit is replaced by an
    underscore, then the ``.json`` extension is appended.
    """
    stem = re.sub(r'[^a-zA-Z0-9]', '_', url)
    return f"{stem}.json"

def save_and_upload(url):
    """Scrape ``url``, store the JSON under ``scrapped/`` and upload it to S3.

    Args:
        url: Address of the page to scrape.

    Returns:
        A status string: the fetch error reported by ``scrape_website`` when the
        download failed (nothing is written or uploaded in that case), the
        saved path followed by the upload message on success, or
        ``"Error saving or uploading file: ..."`` when writing the file fails.
    """
    content = scrape_website(url)
    # scrape_website reports failures as a plain "Error fetching URL: ..." string
    # rather than raising; successful output is a JSON array and never starts
    # with that prefix. Do not persist or upload the error text as if it were data.
    if content.startswith("Error fetching URL:"):
        return content

    folder_name = "scrapped"
    filename = os.path.join(folder_name, sanitize_filename(url))
    try:
        # Directory creation is inside the try so a failure here is reported
        # as a status message like any other save error.
        os.makedirs(folder_name, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(content)
        upload_message = upload_to_s3(filename, S3_BUCKET_NAME)
        return f"Content saved to {filename}\n{upload_message}"
    except Exception as e:
        return f"Error saving or uploading file: {e}"

def web_scraper_interface(urls):
    """Process a comma-separated list of URLs and return one combined report.

    Each non-empty, whitespace-trimmed entry is handed to ``save_and_upload``;
    the per-URL reports are joined with a blank line between them.
    """
    trimmed = (piece.strip() for piece in urls.split(','))
    reports = [
        f"URL: {target}\n{save_and_upload(target)}"
        for target in trimmed
        if target
    ]
    return "\n\n".join(reports)

# Build the Gradio interface: a URL textbox, a trigger button and a status box.
with gr.Blocks() as ui:
    gr.Markdown("## Web Scraper UI")
    url_input = gr.Textbox(label="Enter URLs (comma-separated)")
    scrape_button = gr.Button("Scrape and Upload to S3")
    output_box = gr.Textbox(label="Status", lines=10)

    # Clicking the button passes the textbox value to web_scraper_interface and
    # shows the returned text in the status box.
    scrape_button.click(web_scraper_interface, inputs=[url_input], outputs=[output_box])

# Start the app when this code runs as the main module.
if __name__ == "__main__":
    ui.launch()


/bin/bash: line 1: Pip: command not found


In [None]:
## Scrape the URL and save the result as a JSON file

In [9]:
import requests
from bs4 import BeautifulSoup
import json
import gradio as gr
import re
import os

def scrape_website(url, timeout=30):
    """Fetch a web page and return its headed sections as a JSON string.

    The page is parsed with BeautifulSoup's ``html.parser`` and its ``h1``-``h6``
    and ``p`` tags are visited in document order. Consecutive paragraphs are
    joined with single spaces into one ``body``. The last ``h1``/``h2``/``h3``
    seen since the previous record (if any) becomes that record's ``heading``.
    ``h4``-``h6`` tags end the current run of paragraphs but are never used as
    headings, and a heading with no paragraph after it produces no record.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the caller indefinitely.

    Returns:
        A pretty-printed JSON array of objects with an optional ``"heading"``
        key and a ``"body"`` key, or the string ``"Error fetching URL: ..."``
        when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error fetching URL: {e}"

    soup = BeautifulSoup(response.text, "html.parser")

    sections = []
    section = {}      # record under construction; may already hold a "heading"
    paragraphs = []   # paragraph texts seen since the last heading tag

    def close_section():
        # Emit the pending record; every paragraph contributes its text exactly
        # once, so no body contains duplicated content.
        if paragraphs:
            section["body"] = " ".join(paragraphs).strip()
            sections.append(dict(section))
            section.clear()
            paragraphs.clear()

    for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"]):
        text = tag.get_text(strip=True)
        if tag.name == "p":
            paragraphs.append(text)
            continue
        # Any heading level terminates the current run of paragraphs.
        close_section()
        if tag.name in ("h1", "h2", "h3"):
            section["heading"] = text
    close_section()

    return json.dumps(sections, ensure_ascii=False, indent=4)

def sanitize_filename(url):
    """Turn a URL into a flat ``.json`` file name.

    Every character that is not an ASCII letter or digit is replaced by an
    underscore, then the ``.json`` extension is appended.
    """
    stem = re.sub(r'[^a-zA-Z0-9]', '_', url)
    return f"{stem}.json"

def save_to_file(content, url):
    """Write ``content`` to a file under ``scrapped/`` named after ``url``.

    The directory is created if it does not exist. Returns
    ``"Content saved to <path>"`` on success or ``"Error saving file: ..."``
    when opening or writing the file raises.
    """
    output_dir = "scrapped"
    os.makedirs(output_dir, exist_ok=True)
    target_path = os.path.join(output_dir, sanitize_filename(url))
    try:
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(content)
    except Exception as e:
        return f"Error saving file: {e}"
    else:
        return f"Content saved to {target_path}"

def web_scraper_interface(urls):
    """Scrape every URL in a comma-separated string and save each result.

    Args:
        urls: Comma-separated URLs; surrounding whitespace and empty entries
            are ignored.

    Returns:
        One ``"URL: ...\\n<status>"`` report per URL, joined by blank lines.
        For a URL that could not be fetched the status is the fetch error and
        no file is written.
    """
    results = []
    for url in urls.split(','):
        url = url.strip()
        if not url:
            continue
        content = scrape_website(url)
        # scrape_website signals failure with an "Error fetching URL: ..." string;
        # successful output is a JSON array and never starts with that prefix.
        # Report the failure instead of saving the message as a .json file.
        if content.startswith("Error fetching URL:"):
            results.append(f"URL: {url}\n{content}")
            continue
        save_message = save_to_file(content, url)
        results.append(f"URL: {url}\n{save_message}")
    return "\n\n".join(results)

# Build the Gradio interface: a URL textbox, a trigger button and a status box.
with gr.Blocks() as ui:
    gr.Markdown("## Web Scraper UI")
    url_input = gr.Textbox(label="Enter URLs (comma-separated)")
    scrape_button = gr.Button("Scrape")
    output_box = gr.Textbox(label="Status", lines=10)

    # Clicking the button passes the textbox value to web_scraper_interface and
    # shows the returned text in the status box.
    scrape_button.click(web_scraper_interface, inputs=[url_input], outputs=[output_box])

# Start the app when this code runs as the main module.
if __name__ == "__main__":
    ui.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e9ffc906eae4874b02.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [8]:
pip install gradio

Collecting gradio
  Downloading gradio-5.29.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import gradio as gr
import re
import os

def scrape_website(url, timeout=30):
    """Fetch a web page and return its headed sections as a JSON string.

    The page is parsed with BeautifulSoup's ``html.parser`` and its ``h1``-``h6``
    and ``p`` tags are visited in document order. Consecutive paragraphs are
    joined with single spaces into one ``body``. The last ``h1``/``h2``/``h3``
    seen since the previous record (if any) becomes that record's ``heading``.
    ``h4``-``h6`` tags end the current run of paragraphs but are never used as
    headings, and a heading with no paragraph after it produces no record.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the caller indefinitely.

    Returns:
        A pretty-printed JSON array of objects with an optional ``"heading"``
        key and a ``"body"`` key, or the string ``"Error fetching URL: ..."``
        when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error fetching URL: {e}"

    soup = BeautifulSoup(response.text, "html.parser")

    sections = []
    section = {}      # record under construction; may already hold a "heading"
    paragraphs = []   # paragraph texts seen since the last heading tag

    def close_section():
        # Emit the pending record; every paragraph contributes its text exactly
        # once, so no body contains duplicated content.
        if paragraphs:
            section["body"] = " ".join(paragraphs).strip()
            sections.append(dict(section))
            section.clear()
            paragraphs.clear()

    for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"]):
        text = tag.get_text(strip=True)
        if tag.name == "p":
            paragraphs.append(text)
            continue
        # Any heading level terminates the current run of paragraphs.
        close_section()
        if tag.name in ("h1", "h2", "h3"):
            section["heading"] = text
    close_section()

    return json.dumps(sections, ensure_ascii=False, indent=4)

def sanitize_filename(url):
    """Turn a URL into a flat ``.json`` file name.

    Every character that is not an ASCII letter or digit is replaced by an
    underscore, then the ``.json`` extension is appended.
    """
    stem = re.sub(r'[^a-zA-Z0-9]', '_', url)
    return f"{stem}.json"

def save_to_file(content, url):
    """Write ``content`` to a file under ``scrapped/`` named after ``url``.

    The directory is created if it does not exist.

    Returns:
        A ``(message, path)`` tuple: ``("Content saved to <path>", <path>)`` on
        success, or ``("Error saving file: ...", None)`` when opening or
        writing the file raises.
    """
    output_dir = "scrapped"
    os.makedirs(output_dir, exist_ok=True)
    target_path = os.path.join(output_dir, sanitize_filename(url))
    try:
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(content)
    except Exception as e:
        return f"Error saving file: {e}", None
    else:
        return f"Content saved to {target_path}", target_path

def download_file(filename):
    """Wrap a file path in a Gradio File component labelled "Download File"."""
    file_component = gr.File(label="Download File", value=filename)
    return file_component

def web_scraper_interface(urls):
    """Scrape every URL in a comma-separated string and save each result.

    Args:
        urls: Comma-separated URLs; surrounding whitespace and empty entries
            are ignored.

    Returns:
        A ``(report, files)`` tuple. ``report`` holds one
        ``"URL: ...\\n<status>"`` entry per URL joined by blank lines, and
        ``files`` is the list of paths that were written successfully. For a
        URL that could not be fetched the status is the fetch error and no
        file is written.
    """
    results = []
    files = []
    for url in urls.split(','):
        url = url.strip()
        if not url:
            continue
        content = scrape_website(url)
        # scrape_website signals failure with an "Error fetching URL: ..." string;
        # successful output is a JSON array and never starts with that prefix.
        # Report the failure instead of saving the message as a .json file.
        if content.startswith("Error fetching URL:"):
            results.append(f"URL: {url}\n{content}")
            continue
        save_message, filename = save_to_file(content, url)
        results.append(f"URL: {url}\n{save_message}")
        if filename:
            # The File output takes file paths as its value, so collect the
            # path itself rather than a new gr.File component per file.
            files.append(filename)
    return "\n\n".join(results), files

# Build the Gradio interface: a URL textbox, a trigger button, a text box for
# the per-URL report and a file component for the saved results.
with gr.Blocks() as ui:
    gr.Markdown("## Web Scraper UI")
    url_input = gr.Textbox(label="Enter URLs (comma-separated)")
    scrape_button = gr.Button("Scrape")
    output_box = gr.Textbox(label="Scraped Content", lines=20)
    # NOTE(review): web_scraper_interface returns a list for this output;
    # confirm whether gr.File needs file_count="multiple" to display it.
    file_outputs = gr.File(label="Download Files")

    # Clicking the button passes the textbox value to web_scraper_interface; its
    # two return values fill the text box and the file component, in that order.
    scrape_button.click(web_scraper_interface, inputs=[url_input], outputs=[output_box, file_outputs])

# Start the app when this code runs as the main module.
if __name__ == "__main__":
    ui.launch()


In [None]:
import requests
from bs4 import BeautifulSoup
import json
import gradio as gr
import re

def scrape_website(url, timeout=30):
    """Fetch a web page and return its headed sections as a JSON string.

    The page is parsed with BeautifulSoup's ``html.parser`` and its ``h1``-``h6``
    and ``p`` tags are visited in document order. Consecutive paragraphs are
    joined with single spaces into one ``body``. The last ``h1``/``h2``/``h3``
    seen since the previous record (if any) becomes that record's ``heading``.
    ``h4``-``h6`` tags end the current run of paragraphs but are never used as
    headings, and a heading with no paragraph after it produces no record.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the caller indefinitely.

    Returns:
        A pretty-printed JSON array of objects with an optional ``"heading"``
        key and a ``"body"`` key, or the string ``"Error fetching URL: ..."``
        when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error fetching URL: {e}"

    soup = BeautifulSoup(response.text, "html.parser")

    sections = []
    section = {}      # record under construction; may already hold a "heading"
    paragraphs = []   # paragraph texts seen since the last heading tag

    def close_section():
        # Emit the pending record; every paragraph contributes its text exactly
        # once, so no body contains duplicated content.
        if paragraphs:
            section["body"] = " ".join(paragraphs).strip()
            sections.append(dict(section))
            section.clear()
            paragraphs.clear()

    for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"]):
        text = tag.get_text(strip=True)
        if tag.name == "p":
            paragraphs.append(text)
            continue
        # Any heading level terminates the current run of paragraphs.
        close_section()
        if tag.name in ("h1", "h2", "h3"):
            section["heading"] = text
    close_section()

    return json.dumps(sections, ensure_ascii=False, indent=4)

def sanitize_filename(url):
    """Turn a URL into a flat ``.json`` file name.

    Every character that is not an ASCII letter or digit is replaced by an
    underscore, then the ``.json`` extension is appended.
    """
    stem = re.sub(r'[^a-zA-Z0-9]', '_', url)
    return f"{stem}.json"

def save_to_file(content, url):
    """Write ``content`` to a file in the working directory named after ``url``.

    Returns:
        A ``(message, path)`` tuple: ``("Content saved to <path>", <path>)`` on
        success, or ``("Error saving file: ...", None)`` when opening or
        writing the file raises.
    """
    target_path = sanitize_filename(url)
    try:
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(content)
    except Exception as e:
        return f"Error saving file: {e}", None
    else:
        return f"Content saved to {target_path}", target_path

def download_file(filename):
    """Wrap a file path in a Gradio File component labelled "Download File"."""
    file_component = gr.File(label="Download File", value=filename)
    return file_component

def web_scraper_interface(urls):
    """Scrape every URL in a comma-separated string and save each result.

    Args:
        urls: Comma-separated URLs; surrounding whitespace and empty entries
            are ignored.

    Returns:
        A ``(report, files)`` tuple. ``report`` holds one
        ``"URL: ...\\n<status>"`` entry per URL joined by blank lines, and
        ``files`` is the list of paths that were written successfully. For a
        URL that could not be fetched the status is the fetch error and no
        file is written.
    """
    results = []
    files = []
    for url in urls.split(','):
        url = url.strip()
        if not url:
            continue
        content = scrape_website(url)
        # scrape_website signals failure with an "Error fetching URL: ..." string;
        # successful output is a JSON array and never starts with that prefix.
        # Report the failure instead of saving the message as a .json file.
        if content.startswith("Error fetching URL:"):
            results.append(f"URL: {url}\n{content}")
            continue
        save_message, filename = save_to_file(content, url)
        results.append(f"URL: {url}\n{save_message}")
        if filename:
            # The File output takes file paths as its value, so collect the
            # path itself rather than a new gr.File component per file.
            files.append(filename)
    return "\n\n".join(results), files

# Build the Gradio interface: a URL textbox, a trigger button, a text box for
# the per-URL report and a file component for the saved results.
with gr.Blocks() as ui:
    gr.Markdown("## Web Scraper UI")
    url_input = gr.Textbox(label="Enter URLs (comma-separated)")
    scrape_button = gr.Button("Scrape")
    output_box = gr.Textbox(label="Scraped Content", lines=20)
    # NOTE(review): web_scraper_interface returns a list for this output;
    # confirm whether gr.File needs file_count="multiple" to display it.
    file_outputs = gr.File(label="Download Files")

    # Clicking the button passes the textbox value to web_scraper_interface; its
    # two return values fill the text box and the file component, in that order.
    scrape_button.click(web_scraper_interface, inputs=[url_input], outputs=[output_box, file_outputs])

# Start the app when this code runs as the main module.
if __name__ == "__main__":
    ui.launch()


In [None]:
import requests
from bs4 import BeautifulSoup
import json
import gradio as gr
import re

def scrape_website(url, timeout=30):
    """Fetch a web page and return its headed sections as a JSON string.

    The page is parsed with BeautifulSoup's ``html.parser`` and its ``h1``-``h6``
    and ``p`` tags are visited in document order. Consecutive paragraphs are
    joined with single spaces into one ``body``. The last ``h1``/``h2``/``h3``
    seen since the previous record (if any) becomes that record's ``heading``.
    ``h4``-``h6`` tags end the current run of paragraphs but are never used as
    headings, and a heading with no paragraph after it produces no record.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the caller indefinitely.

    Returns:
        A pretty-printed JSON array of objects with an optional ``"heading"``
        key and a ``"body"`` key, or the string ``"Error fetching URL: ..."``
        when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error fetching URL: {e}"

    soup = BeautifulSoup(response.text, "html.parser")

    sections = []
    section = {}      # record under construction; may already hold a "heading"
    paragraphs = []   # paragraph texts seen since the last heading tag

    def close_section():
        # Emit the pending record; every paragraph contributes its text exactly
        # once, so no body contains duplicated content.
        if paragraphs:
            section["body"] = " ".join(paragraphs).strip()
            sections.append(dict(section))
            section.clear()
            paragraphs.clear()

    for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"]):
        text = tag.get_text(strip=True)
        if tag.name == "p":
            paragraphs.append(text)
            continue
        # Any heading level terminates the current run of paragraphs.
        close_section()
        if tag.name in ("h1", "h2", "h3"):
            section["heading"] = text
    close_section()

    return json.dumps(sections, ensure_ascii=False, indent=4)

def sanitize_filename(url):
    """Turn a URL into a flat ``.json`` file name.

    Every character that is not an ASCII letter or digit is replaced by an
    underscore, then the ``.json`` extension is appended.
    """
    stem = re.sub(r'[^a-zA-Z0-9]', '_', url)
    return f"{stem}.json"

def save_to_file(content, url):
    """Write ``content`` to a file in the working directory named after ``url``.

    Returns ``"Content saved to <path>"`` on success or
    ``"Error saving file: ..."`` when opening or writing the file raises.
    """
    target_path = sanitize_filename(url)
    try:
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(content)
    except Exception as e:
        return f"Error saving file: {e}"
    else:
        return f"Content saved to {target_path}"

def web_scraper_interface(url):
    """Scrape one URL and save the result, returning both for display.

    Args:
        url: Address of the page to scrape; surrounding whitespace is removed.

    Returns:
        A ``(content, status)`` tuple. ``content`` is whatever
        ``scrape_website`` returned. ``status`` is the message from
        ``save_to_file``, or a note that nothing was saved when the fetch failed.
    """
    url = url.strip()
    content = scrape_website(url)
    # scrape_website signals failure with an "Error fetching URL: ..." string;
    # successful output is a JSON array and never starts with that prefix.
    # Show the error, but do not write it to a .json file.
    if content.startswith("Error fetching URL:"):
        return content, "Not saved: the page could not be fetched"
    return content, save_to_file(content, url)

# Build the Gradio interface: a URL textbox, a trigger button, a text box for
# the scraped content and a text box for the save status.
with gr.Blocks() as ui:
    gr.Markdown("## Web Scraper UI")
    url_input = gr.Textbox(label="Enter URL")
    scrape_button = gr.Button("Scrape")
    output_box = gr.Textbox(label="Scraped Content", lines=20)
    save_status = gr.Textbox(label="Save Status")

    # Clicking the button passes the textbox value to web_scraper_interface; its
    # two return values fill the content box and the status box, in that order.
    scrape_button.click(web_scraper_interface, inputs=[url_input], outputs=[output_box, save_status])

# Start the app when this code runs as the main module.
if __name__ == "__main__":
    ui.launch()
