In [1]:
!pip install requests beautifulsoup4



In [2]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the seed article once. html_content stays "" if the request fails.
s = "https://waitbutwhy.com/2013/10/why-procrastinators-procrastinate.html"
headers = {'User-Agent': 'PersonalBlogSearchCrawler/1.0'}
html_content = ""

try:
    response = requests.get(s, headers=headers, timeout=10)
    # Convert 4xx/5xx responses into exceptions so the handler below sees them.
    response.raise_for_status()
    html_content = response.text
except requests.exceptions.RequestException as fetch_error:
    print(f"An error occurred: {fetch_error}")
else:
    print("Successfully fetched the webpage!")
    print(f"Content length: {len(html_content)} characters")

Successfully fetched the webpage!
Content length: 142420 characters


In [4]:
from urllib.parse import urljoin

# Resolve every link on the fetched page to an absolute URL and de-duplicate.
base_url = "https://waitbutwhy.com/2013/10/why-procrastinators-procrastinate.html"

# Extract the anchors here instead of relying on an `all_links` variable left
# over from an earlier kernel session. html_content is set by the fetch cell;
# if that fetch failed it is "" and this simply yields no links.
soup = BeautifulSoup(html_content, 'html.parser')
all_links = soup.find_all('a')

cleaned_links = set()
for link in all_links:
    href = link.get('href')

    # Skip anchors without a target, non-web schemes, and same-page fragments.
    if not href or href.startswith(('mailto:', 'javascript:', '#')):
        continue

    cleaned_links.add(urljoin(base_url, href))

print(f"Found {len(cleaned_links)} unique, cleaned links.")

# Sort before slicing: set iteration order is arbitrary, so an unsorted sample
# would change from run to run.
for link in sorted(cleaned_links)[:10]:
    print(link)

In [None]:
import time
from collections import deque

# Breadth-first crawl from one seed page, limited to max_pages fetch attempts.
s = "https://waitbutwhy.com/2013/10/why-procrastinators-procrastinate.html"
max_pages = 50

# visited_urls records every URL that has ever been enqueued, so no URL is
# queued a second time.
visited_urls = {s}
queue = deque([s])
crawled_count = 0
headers = {'User-Agent': 'PersonalBlogSearchCrawler/1.0'}

while queue and crawled_count < max_pages:
    current_url = queue.popleft()
    crawled_count += 1
    print(f"[{crawled_count}/{max_pages}] Crawling: {current_url}")

    try:
        response = requests.get(current_url, headers=headers, timeout=10)
        response.raise_for_status()
        html_content = response.text
    except requests.exceptions.RequestException as fetch_error:
        # The attempt has already been counted; move on to the next queued URL.
        print(f"   -> Failed to fetch {current_url}: {fetch_error}")
        continue

    soup = BeautifulSoup(html_content, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue
        if href.startswith(('mailto:', 'javascript:', '#')):
            continue

        absolute_link = urljoin(current_url, href)
        if absolute_link in visited_urls:
            continue
        visited_urls.add(absolute_link)
        queue.append(absolute_link)

    # One-second pause after each successfully processed page.
    time.sleep(1)

print("\nCrawling finished.")
print(f"Total unique pages found and added to queue: {len(visited_urls)}")

[1/50] Crawling: https://waitbutwhy.com/2013/10/why-procrastinators-procrastinate.html
[2/50] Crawling: https://waitbutwhy.com
[3/50] Crawling: https://waitbutwhy.com/homepage
[4/50] Crawling: https://waitbutwhy.com/
[5/50] Crawling: https://waitbutwhy.com/wait-but-who
[6/50] Crawling: https://waitbutwhy.com/faq
[7/50] Crawling: https://waitbutwhy.com/contact
[8/50] Crawling: https://waitbutwhy.com/archive
[9/50] Crawling: https://waitbutwhy.com/minis
[10/50] Crawling: https://waitbutwhy.com/the-shed
[11/50] Crawling: https://waitbutwhy.com/table
[12/50] Crawling: http://shop.waitbutwhy.com/
[13/50] Crawling: http://store.waitbutwhy.com
[14/50] Crawling: https://store.waitbutwhy.com/collections/new-releases
[15/50] Crawling: http://store.waitbutwhy.com/collections/posters
[16/50] Crawling: https://store.waitbutwhy.com/collections/phone-cases
[17/50] Crawling: https://store.waitbutwhy.com/collections/cards-and-wrapping-paper
[18/50] Crawling: http://store.waitbutwhy.com/collections/plus

In [None]:
import time
import warnings
from collections import deque
from urllib.parse import urlparse
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

def is_valid_for_crawling(url):
    """
    Checks if a URL is likely to be a webpage we want to process.

    A URL is rejected when any of the following holds:
      * its scheme is not http/https (mailto:, javascript:, tel:, ...), because
        such links cannot be fetched and would only waste a crawl slot;
      * it has no host;
      * its path ends with a static-asset extension (images, PDF, zip, CSS, JS);
      * its host is one of the excluded domains or a subdomain of one;
      * its path contains a share/login/profile/search pattern.

    Args:
        url (str): Absolute URL to check.

    Returns:
        bool: True if the URL should be queued for crawling, False otherwise
        (including when the URL cannot be parsed).
    """
    skipped_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.pdf', '.zip', '.css', '.js')
    junk_domains = ('twitter.com', 'facebook.com', 'pinterest.com', 'instagram.com', 'googleusercontent.com', 'gum.co')
    skipped_path_patterns = ('/share', '/login', '/profile', '/search')

    try:
        parsed_url = urlparse(url)

        if parsed_url.scheme.lower() not in ('http', 'https'):
            return False

        # hostname excludes any port/userinfo and is None when the URL has no host.
        domain = (parsed_url.hostname or '').lower()
        if not domain:
            return False

        path = parsed_url.path.lower()
        if path.endswith(skipped_extensions):
            return False

        # Match the domain itself or a true subdomain. A plain substring test
        # would also reject unrelated hosts that merely contain the text.
        if any(domain == junk or domain.endswith('.' + junk) for junk in junk_domains):
            return False

        if any(pattern in path for pattern in skipped_path_patterns):
            return False

        return True

    except (ValueError, AttributeError):
        return False

# Breadth-first crawl that only enqueues links accepted by is_valid_for_crawling.
s = "https://waitbutwhy.com/2013/10/why-procrastinators-procrastinate.html"
max_pages = 50
crawled_count = 0
visited_urls = {s}
queue = deque([s])
headers = {'User-Agent': 'PersonalBlogSearchCrawler/1.0'}

print("--- Starting Smarter Crawler ---")

while queue and crawled_count < max_pages:
    current_url = queue.popleft()
    crawled_count += 1
    print(f"[{crawled_count}/{max_pages}] Crawling: {current_url}")

    try:
        response = requests.get(current_url, headers=headers, timeout=10)
        response.raise_for_status()
        html_content = response.text
    except requests.exceptions.RequestException as fetch_error:
        print(f"   -> Failed to fetch {current_url}: {fetch_error}")
        continue

    soup = BeautifulSoup(html_content, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue

        absolute_link = urljoin(current_url, href)

        # Guard clauses: drop unwanted URLs first, then ones already enqueued.
        if not is_valid_for_crawling(absolute_link):
            continue
        if absolute_link in visited_urls:
            continue

        visited_urls.add(absolute_link)
        queue.append(absolute_link)

    # One-second pause after each successfully processed page.
    time.sleep(1)

print("\nCrawling finished.")
print(f"Total unique, valid pages found: {len(visited_urls)}")
print("\nSample of pages in queue for next crawl:")
# Whatever is still queued was discovered but not fetched in this run.
for url in list(queue)[:5]:
    print(f"- {url}")

--- Starting Smarter Crawler ---
[1/50] Crawling: https://waitbutwhy.com/2013/10/why-procrastinators-procrastinate.html
[2/50] Crawling: https://waitbutwhy.com
[3/50] Crawling: https://waitbutwhy.com/homepage
[4/50] Crawling: https://waitbutwhy.com/
[5/50] Crawling: https://waitbutwhy.com/wait-but-who
[6/50] Crawling: https://waitbutwhy.com/faq
[7/50] Crawling: https://waitbutwhy.com/contact
[8/50] Crawling: https://waitbutwhy.com/archive
[9/50] Crawling: https://waitbutwhy.com/minis
[10/50] Crawling: https://waitbutwhy.com/the-shed
[11/50] Crawling: https://waitbutwhy.com/table
[12/50] Crawling: http://shop.waitbutwhy.com/
[13/50] Crawling: http://store.waitbutwhy.com
[14/50] Crawling: https://store.waitbutwhy.com/collections/new-releases
[15/50] Crawling: http://store.waitbutwhy.com/collections/posters
[16/50] Crawling: https://store.waitbutwhy.com/collections/phone-cases
[17/50] Crawling: https://store.waitbutwhy.com/collections/cards-and-wrapping-paper
[18/50] Crawling: http://stor

In [None]:
import time
import warnings
import json
from collections import deque
from urllib.parse import urlparse, urlunparse
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

def normalize_url(url):
    """Creates a canonical URL representation to avoid duplicates.

    Args:
        url (str): Absolute URL to normalize.

    Returns:
        tuple[str, str]: ``(visit_key, absolute_url)``.
            visit_key is host + path with the scheme, a leading ``www.``,
            trailing slashes and the params/query/fragment removed. It is the
            value to store in the visited set, so http/https and www/non-www
            variants of a page share one key.
            absolute_url is the URL to fetch: lower-cased scheme and host, the
            same path (``/`` when empty) and no params/query/fragment.
    """
    parsed_url = urlparse(url)

    netloc = parsed_url.netloc.lower()
    path = parsed_url.path.rstrip('/')

    # Strip only a *leading* "www." for the dedupe key. str.replace would also
    # cut "www." out of the middle of a host such as "awww.example.com".
    key_host = netloc[4:] if netloc.startswith('www.') else netloc
    visit_key = key_host + path

    # Keep the real host in the URL that gets fetched: the bare domain is not
    # guaranteed to resolve, or to serve the same site as its www. counterpart.
    absolute_url = urlunparse((
        parsed_url.scheme.lower(),
        netloc,
        path if path else '/',  # Ensure path is at least '/'
        '', '', ''  # Remove params, query, fragment
    ))

    return visit_key, absolute_url


def is_valid_for_crawling(url):
    """
    Checks if a URL is likely to be a webpage we want to process.

    A URL is rejected when its scheme is not http/https, when it has no host,
    when its path ends with a static-asset extension, or when its host is one
    of the excluded domains (or a subdomain of one).

    Args:
        url (str): Absolute URL to check.

    Returns:
        bool: True if the URL should be queued for crawling, False otherwise
        (including when the URL cannot be parsed).
    """
    skipped_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.pdf', '.zip', '.css', '.js')
    junk_domains = ('twitter.com', 'facebook.com', 'pinterest.com', 'instagram.com', 'googleusercontent.com', 'gum.co', 'patreon.com', 'ted.com')

    try:
        parsed_url = urlparse(url)

        # mailto:, javascript:, tel: and similar links cannot be fetched.
        if parsed_url.scheme.lower() not in ('http', 'https'):
            return False

        # hostname excludes any port/userinfo and is None when the URL has no host.
        domain = (parsed_url.hostname or '').lower()
        if not domain:
            return False

        path = parsed_url.path.lower()
        if path.endswith(skipped_extensions):
            return False

        # Match the domain itself or a true subdomain. A substring test would
        # wrongly reject e.g. "united.com" because it contains "ted.com".
        if any(domain == junk or domain.endswith('.' + junk) for junk in junk_domains):
            return False

        return True
    except (ValueError, AttributeError):
        return False

# Breadth-first crawl that writes each fetched page to a JSON Lines file and
# de-duplicates links by their normalized visit key.
s = "https://waitbutwhy.com/2013/10/why-procrastinators-procrastinate.html"
output_filename = "crawled_content.jsonl"
max_pages = 50
crawled_count = 0

queue = deque([s])
start_key, start_abs = normalize_url(s)
visited_urls = {start_key}  # holds normalized visit keys, not raw URLs
headers = {'User-Agent': 'PersonalBlogSearchCrawler/1.0'}

print("--- Starting Content-Saving Crawler ---")

with open(output_filename, 'w') as f:
    while queue and crawled_count < max_pages:
        current_url = queue.popleft()
        crawled_count += 1
        print(f"[{crawled_count}/{max_pages}] Crawling: {current_url}")

        try:
            response = requests.get(current_url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as fetch_error:
            print(f"   -> Failed to fetch {current_url}: {fetch_error}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        if soup.find('title'):
            page_title = soup.find('title').get_text()
        else:
            page_title = 'No Title Found'
        page_text = soup.get_text(separator=' ', strip=True)

        # One JSON object per line: url, title and the page's visible text.
        data = {'url': current_url, 'title': page_title, 'text': page_text}
        f.write(json.dumps(data) + '\n')

        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue

            absolute_link = urljoin(current_url, href)
            normalized_key, clean_absolute_link = normalize_url(absolute_link)

            if not is_valid_for_crawling(clean_absolute_link):
                continue
            if normalized_key in visited_urls:
                continue

            visited_urls.add(normalized_key)
            queue.append(clean_absolute_link)

        # One-second pause after each successfully processed page.
        time.sleep(1)

print("\nCrawling finished.")
print(f"Data for {crawled_count} pages saved to {output_filename}")

--- Starting Content-Saving Crawler ---
[1/50] Crawling: https://waitbutwhy.com/2013/10/why-procrastinators-procrastinate.html
[2/50] Crawling: https://waitbutwhy.com/
[3/50] Crawling: https://waitbutwhy.com/homepage
[4/50] Crawling: https://waitbutwhy.com/wait-but-who
[5/50] Crawling: https://waitbutwhy.com/faq
[6/50] Crawling: https://waitbutwhy.com/contact
[7/50] Crawling: https://waitbutwhy.com/archive
[8/50] Crawling: https://waitbutwhy.com/minis
[9/50] Crawling: https://waitbutwhy.com/the-shed
[10/50] Crawling: https://waitbutwhy.com/table
[11/50] Crawling: http://shop.waitbutwhy.com/
[12/50] Crawling: http://store.waitbutwhy.com/
[13/50] Crawling: https://store.waitbutwhy.com/collections/new-releases
[14/50] Crawling: http://store.waitbutwhy.com/collections/posters
[15/50] Crawling: https://store.waitbutwhy.com/collections/phone-cases
[16/50] Crawling: https://store.waitbutwhy.com/collections/cards-and-wrapping-paper
[17/50] Crawling: http://store.waitbutwhy.com/collections/plus

In [9]:
import json

# Label every crawled page as 'personal_blog' or 'other' using keyword and
# length heuristics, and write the labelled records to a new JSON Lines file.
input_filename = "crawled_content.jsonl"
output_filename = "classified_v1_content.jsonl"

UTILITY_KEYWORDS = ['about', 'contact', 'store', 'shop', 'privacy', 'terms', 'support', 'faq', 'archive', 'categories', 'tags']
ARTICLE_KEYWORDS = ['leave a reply', 'post a comment', 'comments are closed', 'written by']
MIN_WORDS_THRESHOLD = 250

class_counts = {'personal_blog': 0, 'other': 0}
processed_count = 0

print("--- Starting Heuristic Classification ---")

with open(input_filename, 'r') as infile, open(output_filename, 'w') as outfile:
    for line in infile:
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            print(f"Skipping malformed line: {line.strip()}")
            continue

        processed_count += 1

        url_and_title = (data.get('url', '') + ' ' + data.get('title', '')).lower()
        looks_like_utility = any(keyword in url_and_title for keyword in UTILITY_KEYWORDS)

        # The word count only matters for pages not already flagged as utility.
        too_short = False
        if not looks_like_utility:
            too_short = len(data.get('text', '').split()) < MIN_WORDS_THRESHOLD

        has_article_marker = any(
            keyword in data.get('text', '').lower() for keyword in ARTICLE_KEYWORDS
        )

        # An article marker in the text overrides both demotion rules.
        if has_article_marker:
            classification = 'personal_blog'
        elif looks_like_utility or too_short:
            classification = 'other'
        else:
            classification = 'personal_blog'

        data['classification_v1'] = classification
        outfile.write(json.dumps(data) + '\n')
        class_counts[classification] += 1

print("\n--- Classification Finished ---")
print(f"Processed {processed_count} pages.")
print(f"Results saved to {output_filename}")
print("\nClassification breakdown:")
for category, count in class_counts.items():
    print(f"- {category}: {count} pages")

--- Starting Heuristic Classification ---

--- Classification Finished ---
Processed 50 pages.
Results saved to classified_v1_content.jsonl

Classification breakdown:
- personal_blog: 32 pages
- other: 18 pages


In [None]:
import time
import json
from collections import deque
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
import warnings

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)


def run_targeted_crawl(seed_file, output_file, max_pages=200):
    """
    Crawls websites starting from a list of seed URLs in a file.

    Pages are fetched breadth-first. Every successfully fetched page is written
    to output_file as one JSON object per line with the keys 'url', 'title'
    and 'text'. Discovered links are queued only if they are http(s) URLs;
    fragments are stripped so "page" and "page#section" count as one URL.

    Args:
        seed_file (str): The path to the file containing seed URLs (one per line).
        output_file (str): The path to the .jsonl file where results will be saved.
        max_pages (int): The maximum number of pages to crawl in this session.
            Failed fetches count toward this limit.

    Returns:
        None. If the seed file is missing or empty, an error is printed and the
        function returns without creating the output file.
    """
    print("--- Starting new crawl session ---")
    print(f"Seed file: {seed_file}")
    print(f"Output file: {output_file}")

    try:
        with open(seed_file, 'r') as f:
            # dict.fromkeys drops repeated seeds while keeping file order, so a
            # URL listed twice is not crawled twice.
            ss = list(dict.fromkeys(line.strip() for line in f if line.strip()))
        if not ss:
            print("Error: Seed file is empty.")
            return
    except FileNotFoundError:
        print(f"Error: Seed file not found at '{seed_file}'")
        return

    queue = deque(ss)
    visited_urls = set(ss)
    crawled_count = 0  # fetch attempts, bounded by max_pages
    saved_count = 0    # pages actually written to output_file
    headers = {'User-Agent': 'PersonalBlogSearchCrawler/1.0'}

    with open(output_file, 'w') as f:
        while queue and crawled_count < max_pages:
            current_url = queue.popleft()
            crawled_count += 1

            print(f"[{crawled_count}/{max_pages}] Crawling: {current_url}")

            try:
                response = requests.get(current_url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"   -> Failed to fetch {current_url}: {e}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            title_tag = soup.find('title')
            page_title = title_tag.get_text(strip=True) if title_tag is not None else 'No Title'
            page_text = soup.get_text(separator=' ', strip=True)

            data = {'url': current_url, 'title': page_title, 'text': page_text}
            f.write(json.dumps(data) + '\n')
            saved_count += 1

            for link in soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue

                try:
                    # Strip the fragment: it points inside the same document.
                    absolute_link = urljoin(current_url, href).split('#', 1)[0]
                    scheme = urlparse(absolute_link).scheme
                except ValueError:
                    # A malformed href must not abort the whole crawl.
                    continue

                # mailto:, javascript:, tel: etc. cannot be fetched and would
                # only use up the page budget.
                if scheme not in ('http', 'https'):
                    continue

                if absolute_link not in visited_urls:
                    visited_urls.add(absolute_link)
                    queue.append(absolute_link)

            time.sleep(1)

    print("\n--- Crawl session finished. ---")
    # Report pages written, not attempts: failed fetches produce no output line.
    print(f"Data for {saved_count} pages saved to {output_file}")

In [None]:

# Run one crawl per seed list: the corporate list first, then the personal one.
for seed_file, output_file in (
    ('corporate_blog.txt', 'crawled_corporate.jsonl'),
    ('personal_blog.txt', 'crawled_personal.jsonl'),
):
    run_targeted_crawl(seed_file=seed_file, output_file=output_file, max_pages=500)

--- Starting new crawl session ---
Seed file: personal_blog.txt
Output file: crawled_personal.jsonl
[1/500] Crawling: https://manassaloi.com/2018/03/30/how-i-became-pm.html
[2/500] Crawling: https://waitbutwhy.com/
[3/500] Crawling: https://finnscave.com/
[4/500] Crawling: https://www.kalzumeus.com/
[5/500] Crawling: https://medium.com/hackernoon/how-to-get-into-product-management-78c58bd9c8cf
[6/500] Crawling: https://manassaloi.com/
[7/500] Crawling: https://manassaloi.com/posts/
[8/500] Crawling: https://manassaloi.com/bookshelf/
[9/500] Crawling: https://manassaloi.com/booksummaries/
[10/500] Crawling: https://manassaloi.com/links/
[11/500] Crawling: https://manassaloi.com/proofofwork/
[12/500] Crawling: https://manassaloi.com/usesthis/
[13/500] Crawling: https://www.iocl.com/
[14/500] Crawling: https://www.facebook.com/NgoSphere/
[15/500] Crawling: https://www.redbus.in/
   -> Failed to fetch https://www.redbus.in/: HTTPSConnectionPool(host='www.redbus.in', port=443): Read timed o