In [1]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup
import json

In [4]:
# Pages about HIV/AIDS to download, in the order they will be scraped.
urls = [
    # Pages served under a /fr/ path
    "https://www.pasteur.fr/fr/centre-medical/fiches-maladies/sida-vih",
    "https://preventionsida.org/fr/vih/le-vih-cest-quoi/",
    # Remaining pages
    "https://www.hiv.gov/hiv-basics/overview/about-hiv-and-aids/what-are-hiv-and-aids",
    "https://www.cdc.gov/hiv/about/index.html",
    "https://info.health.nz/conditions-treatments/infectious-diseases/hiv-and-aids",
    "https://www.unaids.org/en/frequently-asked-questions-about-hiv-and-aids",
    "https://hivinfo.nih.gov/understanding-hiv/fact-sheets/hiv-and-aids-basics",
    "https://www.cdc.gov/hiv/index.html",
    "https://www.healthline.com/health/hiv-aids",
]

In [5]:
# Function to scrape all information from a webpage
def scrape_website(url, timeout=10):
    """Download a web page and extract its text, links, images and metadata.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``'url'``, ``'title'``, ``'description'``,
        ``'text_content'``, ``'links'`` and ``'images'``, or ``None`` when the
        page could not be fetched or parsed (the error is printed).
    """
    try:
        # Without an explicit timeout, requests can wait indefinitely on an
        # unresponsive server and stall the whole scraping loop.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.content, 'html.parser')

        # All visible text, one text node per line
        text_content = soup.get_text(separator='\n').strip()

        # Raw href/src attribute values, exactly as written in the page
        links = [a['href'] for a in soup.find_all('a', href=True)]
        images = [img['src'] for img in soup.find_all('img', src=True)]

        # Tag.string is None for an empty <title> or one with nested markup,
        # so read the text with get_text() and fall back to the placeholder.
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag else ""
        if not title:
            title = "No title"

        # A <meta name="description"> tag may lack a content attribute;
        # .get() avoids a KeyError that would discard the whole page.
        meta_description = soup.find('meta', attrs={'name': 'description'})
        description = meta_description.get('content') if meta_description else None
        if not description:
            description = "No description"

        return {
            'url': url,
            'title': title,
            'description': description,
            'text_content': text_content,
            'links': links,
            'images': images
        }
    except Exception as e:
        # Deliberately broad: one failing page is reported and skipped so it
        # cannot abort a batch of scrapes.
        print(f"Error scraping {url}: {e}")
        return None

In [6]:
# Convert scraped data to IBM Watson Assistant format
def convert_to_watson_format(scraped_data):
    """Build an IBM Watson Assistant-style workspace dict from scraped pages.

    For the page at 1-based position N this adds:

    * an intent named ``page_N_info`` with three example utterances built
      from the page's URL and title, and
    * a dialog node ``node_N``, conditioned on that intent, whose text
      response contains the page's title followed by its full text.

    The links and images of all pages are de-duplicated into two entities,
    ``link`` and ``image``. Their values are emitted in sorted order so that
    the same input always produces the same output (iterating a set of
    strings directly can yield a different order on every run).

    Args:
        scraped_data: List of page dicts providing the keys ``'url'``,
            ``'title'``, ``'text_content'``, ``'links'`` and ``'images'``.
            It is iterated more than once, so it must not be a one-shot
            iterator.

    Returns:
        A dict with the keys ``'intents'``, ``'entities'`` and
        ``'dialog_nodes'``.
    """
    watson_data = {
        "intents": [],
        "entities": [],
        "dialog_nodes": []
    }

    # One intent and one dialog node per page
    for idx, page in enumerate(scraped_data):
        intent_name = f"page_{idx + 1}_info"
        watson_data["intents"].append({
            "intent": intent_name,
            "examples": [
                { "text": f"What is the content of {page['url']}?" },
                { "text": f"Tell me about {page['title']}." },
                { "text": f"Can you summarize the information on {page['url']}?" }
            ]
        })
        watson_data["dialog_nodes"].append({
            "dialog_node": f"node_{idx + 1}",
            "conditions": f"#{intent_name}",
            "output": {
                "generic": [
                    {
                        "response_type": "text",
                        "values": [
                            { "text": f"Here is the information about {page['title']}: {page['text_content']}" }
                        ],
                        "selection_policy": "sequential"
                    }
                ]
            }
        })

    # Entities: unique links and images across all pages, in a stable order
    all_links = sorted({link for page in scraped_data for link in page['links']})
    all_images = sorted({img for page in scraped_data for img in page['images']})
    watson_data["entities"].append({
        "entity": "link",
        "values": [{"value": link} for link in all_links]
    })
    watson_data["entities"].append({
        "entity": "image",
        "values": [{"value": img} for img in all_images]
    })

    return watson_data

In [7]:
# Fetch every page in turn, keeping only the ones that were scraped successfully
scraped_data = []
for url in urls:
    print(f"Scraping: {url}")
    website_data = scrape_website(url)
    if not website_data:
        # scrape_website returned None after printing the error; skip this page
        continue
    scraped_data.append(website_data)

Scraping: https://www.pasteur.fr/fr/centre-medical/fiches-maladies/sida-vih
Scraping: https://preventionsida.org/fr/vih/le-vih-cest-quoi/
Scraping: https://www.hiv.gov/hiv-basics/overview/about-hiv-and-aids/what-are-hiv-and-aids
Scraping: https://www.cdc.gov/hiv/about/index.html
Scraping: https://info.health.nz/conditions-treatments/infectious-diseases/hiv-and-aids
Scraping: https://www.unaids.org/en/frequently-asked-questions-about-hiv-and-aids
Scraping: https://hivinfo.nih.gov/understanding-hiv/fact-sheets/hiv-and-aids-basics
Scraping: https://www.cdc.gov/hiv/index.html
Scraping: https://www.healthline.com/health/hiv-aids


In [8]:
# Turn the list of scraped pages into the Watson-style intents/entities/dialog_nodes dict
watson_ready_data = convert_to_watson_format(scraped_data=scraped_data)

In [9]:
# Save the Watson-compatible data to a JSON file
output_file = 'hiv-watson_data.json'
# Write UTF-8 explicitly (the default text encoding depends on the platform)
# and keep non-ASCII characters as-is rather than \uXXXX escapes, so that
# scraped text with accented characters stays readable in the saved file.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(watson_ready_data, f, indent=4, ensure_ascii=False)

print(f"Scraping complete. Watson-compatible data saved to {output_file}")

Scraping complete. Watson-compatible data saved to hiv-watson_data.json
