In [1]:
!pip install requests beautifulsoup4



In [2]:
import os
import requests
from bs4 import BeautifulSoup
import re
import shutil

In [3]:
# Output folder left over from a previous run; it is wiped so this run starts clean
folder_path = "/kaggle/working/scraped_text_data"

try:
    shutil.rmtree(folder_path)
except FileNotFoundError:
    # Nothing to clean up on a fresh session
    print("Folder not found, skipping deletion.")
except Exception as err:
    # Any other failure is reported without stopping the notebook
    print(f"Error deleting folder: {err}")
else:
    print(f"Deleted folder: {folder_path}")

Folder not found, skipping deletion.


In [4]:


# Topic name -> list of site URLs to scrape for that topic.
# Each topic name later becomes the stem of the text file written for it.
categories = dict(
    technology_news=[
        "https://techcrunch.com",
        "https://www.theverge.com",
        "https://www.wired.com",
    ],
    ai_machine_learning=[
        "https://www.ainews.com",
        "https://openai.com/blog",
        "https://www.technologyreview.com",
    ],
    cybersecurity=[
        "https://krebsonsecurity.com",
        "https://www.darkreading.com",
        "https://www.csoonline.com",
    ],
    space_astronomy=[
        "https://www.nasa.gov/news",
        "https://www.space.com",
        "https://www.esa.int",
    ],
    blockchain=[
        "https://www.coindesk.com",
        "https://cointelegraph.com",
        "https://bitcoinmagazine.com",
    ],
    finance_economy=[
        "https://www.bloomberg.com",
        "https://www.forbes.com/money",
        "https://www.wsj.com",
    ],
    stock_market=[
        "https://finance.yahoo.com",
        "https://www.marketwatch.com",
        "https://www.cnbc.com/markets/",
    ],
    health_medicine=[
        "https://www.webmd.com",
        "https://www.mayoclinic.org",
        "https://www.nih.gov",
    ],
    fitness_nutrition=[
        "https://www.healthline.com",
        "https://www.livestrong.com",
        "https://www.verywellfit.com",
    ],
    climate_environment=[
        "https://www.nationalgeographic.com",
        "https://www.climate.gov",
        "https://www.theguardian.com/environment",
    ],
    education_elearning=[
        "https://www.edsurge.com",
        "https://blog.coursera.org",
        "https://www.edweek.org",
    ],
    psychology_mental_health=[
        "https://www.psychologytoday.com",
        "https://www.mindful.org",
        "https://www.verywellmind.com",
    ],
    travel_tourism=[
        "https://www.lonelyplanet.com",
        "https://www.tripadvisor.com",
        "https://www.travelandleisure.com",
    ],
    automobiles_evs=[
        "https://www.motortrend.com",
        "https://www.caranddriver.com",
        "https://electrek.co",
    ],
    sports_news=[
        "https://www.espn.com",
        "https://www.bbc.com/sport",
        "https://www.si.com",
    ],
    entertainment_movies=[
        "https://www.hollywoodreporter.com",
        "https://www.imdb.com/news",
        "https://editorial.rottentomatoes.com",
    ],
    gaming_esports=[
        "https://www.ign.com",
        "https://www.pcgamer.com",
        "https://kotaku.com",
    ],
    history_culture=[
        "https://www.history.com",
        "https://www.smithsonianmag.com",
        "https://www.bbc.co.uk/history",
    ],
    ai_ethics_bias=[
        "https://ainowinstitute.org",
        "https://thegradient.pub",
        "https://montrealethics.ai",
    ],
    startups_entrepreneurship=[
        "https://www.ycombinator.com/blog",
        "https://www.entrepreneur.com",
        "https://www.inc.com",
    ],
)


# Function to clean text
def clean_text(text):
    """Normalise scraped text into a single tidy line.

    Characters other than word characters, whitespace and the punctuation
    marks ``, . ! ?`` are removed, every run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is trimmed.

    Args:
        text (str): Raw text extracted from a page.

    Returns:
        str: The cleaned text.
    """
    # Remove special characters first: deleting a symbol that sits between
    # two spaces (e.g. "a - b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s,.!?]', '', text)
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs
    return text.strip()

# Function to scrape data
def scrape_text(url, timeout=10):
    """Download a page and return its title, date and paragraph text.

    Args:
        url (str): Address of the page to fetch.
        timeout (float | tuple): Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a single stalled site would
            block the whole scraping run indefinitely.

    Returns:
        str: A block with "Title:", "Date:" and "Content:" lines followed by
        a blank line on success, or an "Error scraping <url>: <reason>"
        line followed by a blank line if any step fails. The function never
        raises; failures are reported in the returned text.
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract title; fall back when <title> is missing or blank
        title = soup.title.text.strip() if soup.title else ""
        title = title or "No Title"

        # Extract content
        paragraphs = soup.find_all("p")
        content = " ".join([para.get_text() for para in paragraphs])

        # Extract date (if available). `.get` is used because a matching
        # <meta> tag is not guaranteed to carry a `content` attribute; a
        # missing attribute should not discard the rest of the page.
        date_meta = soup.find("meta", {"name": "date"}) or soup.find("meta", {"property": "article:published_time"})
        date = (date_meta.get("content") if date_meta else None) or "No Date Available"

        # Clean content
        cleaned_content = clean_text(content)

        return f"Title: {title}\nDate: {date}\nContent: {cleaned_content}\n\n"

    except Exception as e:
        # Deliberately broad: one bad site must not abort the whole run
        return f"Error scraping {url}: {str(e)}\n\n"

# Function to save scraped data into text files
def save_to_file(category, text_data, output_dir="scraped_text_data"):
    """Write one category's scraped text to ``<output_dir>/<category>.txt``.

    Args:
        category (str): Category name; used as the file name stem.
        text_data (str): Text to store. An existing file with the same name
            is overwritten.
        output_dir (str): Folder that receives the file; it is created when
            missing. Defaults to "scraped_text_data", resolved against the
            current working directory.
    """
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"{category}.txt")

    # UTF-8 is set explicitly so non-ASCII page text is written the same way
    # regardless of the platform's default encoding
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)

# Visit every category, gather the text of each of its sites in order,
# then write the combined text to that category's file
for category, urls in categories.items():
    print(f"Scraping category: {category}...")
    collected = []

    for url in urls:
        print(f"  -> Scraping {url}")
        collected.append(scrape_text(url))

    all_text = "".join(collected)
    save_to_file(category, all_text)

print("Scraping completed! All files saved in 'scraped_text_data' folder.")


Scraping category: technology_news...
  -> Scraping https://techcrunch.com
  -> Scraping https://www.theverge.com
  -> Scraping https://www.wired.com
Scraping category: ai_machine_learning...
  -> Scraping https://www.ainews.com
  -> Scraping https://openai.com/blog
  -> Scraping https://www.technologyreview.com
Scraping category: cybersecurity...
  -> Scraping https://krebsonsecurity.com
  -> Scraping https://www.darkreading.com
  -> Scraping https://www.csoonline.com
Scraping category: space_astronomy...
  -> Scraping https://www.nasa.gov/news
  -> Scraping https://www.space.com
  -> Scraping https://www.esa.int
Scraping category: blockchain...
  -> Scraping https://www.coindesk.com
  -> Scraping https://cointelegraph.com
  -> Scraping https://bitcoinmagazine.com
Scraping category: finance_economy...
  -> Scraping https://www.bloomberg.com
  -> Scraping https://www.forbes.com/money
  -> Scraping https://www.wsj.com
Scraping category: stock_market...
  -> Scraping https://finance.yaho

In [5]:
import shutil
import os

# Folder holding the scraped text files that get bundled for download
dataset_dir = "/kaggle/working/scraped_text_data"

# Create the folder when it is absent so make_archive has a root to read
os.makedirs(dataset_dir, exist_ok=True)

# Left as the final bare expression so the cell displays the archive path
shutil.make_archive(
    base_name="/kaggle/working/text_dataset_export",
    format="zip",
    root_dir=dataset_dir,
)

'/kaggle/working/text_dataset_export.zip'