In [None]:
pip install nltk



In [None]:
import nltk

In [None]:
pip install LSI



In [None]:
from LSI import LSI

ModuleNotFoundError: No module named 'LSI'

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
from urllib.parse import urljoin, urlparse
import time
import os
from IPython.display import display, HTML
from bs4 import XMLParsedAsHTMLWarning
import warnings
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

class WebCrawler:
    """Depth-limited, same-domain web crawler.

    Starting from one URL, the crawler downloads each page, records its
    title, ``<meta>`` tags and outgoing links, and follows links that stay
    on the starting URL's host until ``depth`` levels have been explored.

    Attributes:
        starting_url (str): URL the crawl begins at.
        depth (int): Maximum number of link levels to follow.
        delay (int | float): Seconds to wait before each follow-up request.
        visited_urls (set[str]): Fragment-less URLs already requested.
        data (list[dict]): One record per successfully fetched page with the
            keys ``url``, ``title``, ``metadata`` and ``links``.
        headers (dict): HTTP headers sent with every request.
    """

    def __init__(self, url, depth=1, delay=1):
        """
        Initialize the web crawler

        Args:
            url (str): The starting URL to crawl
            depth (int): How many levels of links to follow (default: 1)
            delay (int | float): Delay between requests in seconds (default: 1)
        """
        self.starting_url = url
        self.depth = depth
        self.delay = delay
        self.visited_urls = set()
        self.data = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    @staticmethod
    def _strip_fragment(url):
        """Return ``url`` without its ``#fragment`` part.

        A fragment only addresses a position inside a document, so
        ``/page`` and ``/page#section`` are the same resource and must share
        one entry in ``visited_urls``.
        """
        if '#' not in url:
            # Leave fragment-less URLs untouched so they are not re-serialized.
            return url
        return urlparse(url)._replace(fragment='').geturl()

    def is_valid_url(self, url):
        """Check whether ``url`` is a crawlable link on the starting URL's host.

        Args:
            url: Candidate URL (expected to be an absolute URL string).

        Returns:
            bool: True only if ``url`` uses http/https and its network
            location equals that of ``starting_url``; False otherwise,
            including when the value cannot be parsed as a URL.
        """
        try:
            parsed_url = urlparse(url)
            starting_parsed = urlparse(self.starting_url)
        except (ValueError, TypeError, AttributeError):
            # urlparse rejects malformed hosts and non-string input.
            return False
        return (
            parsed_url.scheme in ('http', 'https')
            and bool(parsed_url.netloc)
            and parsed_url.netloc == starting_parsed.netloc
        )

    def extract_metadata(self, soup):
        """Collect the page's ``<meta>`` tags into a flat dictionary.

        Args:
            soup: Parsed document (a BeautifulSoup object).

        Returns:
            dict: Maps each tag's ``name`` (or, when it has no name, its
            ``property``) to the tag's ``content`` attribute.
        """
        metadata = {}

        # First pass: key every tag by name, falling back to property.
        for meta in soup.find_all('meta'):
            if meta.get('name'):
                metadata[meta.get('name')] = meta.get('content')
            elif meta.get('property'):
                metadata[meta.get('property')] = meta.get('content')

        # When several description tags exist, the first one in the document wins.
        description = soup.find('meta', attrs={'name': 'description'})
        if description:
            metadata['description'] = description.get('content')

        # Open Graph tags are keyed by property even if they also carry a name.
        for meta in soup.find_all('meta', attrs={'property': re.compile(r'^og:')}):
            metadata[meta.get('property')] = meta.get('content')

        for meta in soup.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
            metadata[meta.get('name')] = meta.get('content')

        return metadata

    def _fetch_page(self, url):
        """Download and parse ``url`` into a page record.

        Args:
            url (str): Absolute URL to request.

        Returns:
            dict | None: Record with the keys ``url``, ``title``,
            ``metadata`` and ``links`` (a list of ``{'text', 'url'}`` dicts
            holding absolute link URLs), or None if the request or the
            parsing failed. Failures are reported on stdout, not raised,
            so one bad page does not abort the whole crawl.
        """
        try:
            print(f"Crawling: {url}")
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # .string is None for an empty <title> or one with nested tags.
            title_tag = soup.title
            title = title_tag.string if title_tag and title_tag.string else "No Title"

            links = [
                {
                    'text': a_tag.text.strip(),
                    'url': urljoin(url, a_tag.get('href'))
                }
                for a_tag in soup.find_all('a', href=True)
            ]

            return {
                'url': url,
                'title': title,
                'metadata': self.extract_metadata(soup),
                'links': links
            }
        except Exception as e:
            print(f"Error crawling {url}: {str(e)}")
            return None

    def crawl(self, url, current_depth=0):
        """
        Crawl the webpage and extract data

        The page is recorded exactly once, whether or not it contains any
        links, and its same-domain links are then followed depth-first.

        Args:
            url (str): URL to crawl
            current_depth (int): Current depth level
        """
        url = self._strip_fragment(url)
        if url in self.visited_urls or current_depth > self.depth:
            return

        self.visited_urls.add(url)

        page_data = self._fetch_page(url)
        if page_data is None:
            return
        self.data.append(page_data)

        if current_depth >= self.depth:
            return

        for link in page_data['links']:
            target = self._strip_fragment(link['url'])
            # Filter before sleeping so the delay is only paid for real requests.
            if target in self.visited_urls or not self.is_valid_url(target):
                continue
            time.sleep(self.delay)
            self.crawl(target, current_depth + 1)

    def start_crawling(self):
        """Start the crawling process from the starting URL

        Returns:
            list[dict]: The accumulated page records (``self.data``).
        """
        self.crawl(self.starting_url)
        return self.data

    def save_as_json(self, filename="crawled_data.json"):
        """Save crawled data as JSON

        Args:
            filename (str): Output path (default: "crawled_data.json")
        """
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.data, f, indent=4, ensure_ascii=False)
        print(f"Data saved to {filename}")

    def save_as_csv(self, filename="crawled_data.csv"):
        """Save crawled data as CSV (flattened structure)

        Each link of a page becomes one row that repeats the page's url,
        title and ``meta_*`` columns; a page without links gets a single row.

        Args:
            filename (str): Output path (default: "crawled_data.csv")
        """
        flattened_data = []

        for page in self.data:
            page_info = {
                'url': page['url'],
                'title': page['title']
            }

            for key, value in page['metadata'].items():
                page_info[f'meta_{key}'] = value

            if not page['links']:
                flattened_data.append(page_info)
            else:
                for link in page['links']:
                    link_info = page_info.copy()
                    link_info['link_text'] = link['text']
                    link_info['link_url'] = link['url']
                    flattened_data.append(link_info)

        df = pd.DataFrame(flattened_data)
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Data saved to {filename}")

    def display_summary(self):
        """Display a summary of the crawled data"""
        num_pages = len(self.data)
        num_links = sum(len(page['links']) for page in self.data)

        print("\nCrawling Summary:")
        print("----------------")
        print(f"Starting URL: {self.starting_url}")
        print(f"Depth: {self.depth}")
        print(f"Pages crawled: {num_pages}")
        print(f"Total links found: {num_links}")
        print(f"Unique URLs visited: {len(self.visited_urls)}")

# Crawl the demo site one link level deep, waiting one second between requests.
url = "https://web-scraping.dev/"
crawler = WebCrawler(url, depth=1, delay=1)
data = crawler.start_crawling()

# Persist the results in both formats, then print the run statistics.
crawler.save_as_json()
crawler.save_as_csv()
crawler.display_summary()

# Show the record of the first crawled page, if anything was collected.
if data:
    first_page = data[0]
    first_page_links = first_page['links']

    print("\nPreview of the first page data:")
    print(f"URL: {first_page['url']}")
    print(f"Title: {first_page['title']}")
    print(f"Metadata: {json.dumps(first_page['metadata'], indent=2)}")
    print(f"Links found: {len(first_page_links)}")
    if first_page_links:
        print(f"First link: {first_page_links[0]}")

Crawling: https://web-scraping.dev/
Crawling: https://web-scraping.dev/docs
Crawling: https://web-scraping.dev/api/graphql
Crawling: https://web-scraping.dev/products
Crawling: https://web-scraping.dev/reviews
Crawling: https://web-scraping.dev/testimonials
Crawling: https://web-scraping.dev/login
Crawling: https://web-scraping.dev/cart
Crawling: https://web-scraping.dev/#scenarios
Crawling: https://web-scraping.dev/product/1
Crawling: https://web-scraping.dev/login?cookies=
Crawling: https://web-scraping.dev/blocked
Crawling: https://web-scraping.dev/credentials
Crawling: https://web-scraping.dev/blocked?persist=
Crawling: https://web-scraping.dev/sitemap.xml
Crawling: https://web-scraping.dev/robots.txt
Data saved to crawled_data.json
Data saved to crawled_data.csv

Crawling Summary:
----------------
Starting URL: https://web-scraping.dev/
Depth: 1
Pages crawled: 12
Total links found: 217
Unique URLs visited: 16

Preview of the first page data:
URL: https://web-scraping.dev/
Title: w