In [2]:
import time
from urllib.parse import urljoin, urldefrag
import requests
from bs4 import BeautifulSoup

class DVWACrawler:
    """Breadth-first crawler for a DVWA site that sits behind a login form.

    The constructor logs in through ``<base_url>/login.php`` using a shared
    ``requests.Session`` and seeds the work queue with the base URL.
    ``crawl()`` then fetches pages in FIFO order, recording their HTML and
    any forms they contain.

    Attributes:
        base_url: Site root without a trailing slash.
        max_pages: Upper bound on the number of URLs marked as visited.
        delay: Seconds to sleep after each successfully fetched page.
        timeout: Per-request timeout in seconds passed to ``requests``.
        session: ``requests.Session`` carrying the login cookies.
        visited: Set of URLs already attempted (successfully or not).
        queue: List of URLs waiting to be fetched, in FIFO order.
        pages: Mapping of crawled URL -> response body text.
        forms: Mapping of crawled URL -> list of form description dicts.
        logged_in: Result of the last ``login()`` call (best-effort guess).
    """

    def __init__(self, base_url, username="admin", password="password",
                 max_pages=20, delay=1, timeout=10):
        """Set up crawler state and log in immediately.

        Args:
            base_url: Root URL of the target site; a trailing slash is removed.
            username: Login name submitted to the login form.
            password: Password submitted to the login form.
            max_pages: Stop once this many URLs have been visited.
            delay: Seconds to wait between page fetches.
            timeout: Seconds to wait for any single HTTP request.
        """
        self.base_url = base_url.rstrip("/")
        self.max_pages = max_pages
        self.delay = delay
        # Must be set before login(), which already issues requests.
        self.timeout = timeout
        self.session = requests.Session()
        self.visited = set()
        self.queue = [self.base_url]
        self.pages = {}
        self.forms = {}
        self.logged_in = False

        self.login(username, password)

    def login(self, username, password):
        """Log into DVWA using session-based authentication.

        Fetches the login page, copies the hidden ``user_token`` field (if
        present) into the credentials payload and posts it back. Cookies are
        kept on ``self.session`` for the subsequent crawl.

        Returns:
            True if the login appears to have succeeded, False otherwise.
            The outcome is also stored in ``self.logged_in``.
        """
        login_url = f"{self.base_url}/login.php"
        resp = self.session.get(login_url, timeout=self.timeout)
        soup = BeautifulSoup(resp.text, "html.parser")

        # A token input without a value attribute must not abort the login.
        token = soup.find("input", {"name": "user_token"})
        token_val = token.get("value", "") if token is not None else ""

        payload = {
            "username": username,
            "password": password,
            "Login": "Login",
            "user_token": token_val,
        }

        resp = self.session.post(login_url, data=payload, timeout=self.timeout)

        # NOTE(review): heuristic - assumes a rejected login is redirected
        # back to login.php while an accepted one lands elsewhere; confirm
        # against the DVWA version being crawled.
        final_path = urldefrag(resp.url)[0].split("?", 1)[0].rstrip("/")
        self.logged_in = not final_path.endswith("login.php")
        if not self.logged_in:
            print(f" Login may have failed for {username!r} at {login_url}")
        return self.logged_in

    def _in_scope(self, url):
        """Return True if *url* is the base URL itself or lies beneath it."""
        base = self.base_url
        # A bare prefix test would also accept e.g. "<base>1/..." or
        # "<base>.evil.example/...", so require a path/query delimiter.
        return url == base or url.startswith((base + "/", base + "?"))

    def extract_links(self, html, page_url):
        """Extract all in-scope links from a page.

        Args:
            html: Page markup.
            page_url: URL the page was served from; relative hrefs are
                resolved against it.

        Returns:
            List of absolute URLs (fragment and trailing slash removed) that
            fall under ``self.base_url``. Duplicates are not removed.
        """
        soup = BeautifulSoup(html, "html.parser")
        links = []
        for a in soup.find_all("a", href=True):
            abs_url = urljoin(page_url, a["href"])
            clean_url = urldefrag(abs_url)[0].rstrip("/")
            if self._in_scope(clean_url):
                links.append(clean_url)
        return links

    @staticmethod
    def _field_value(field):
        """Return the initial value of an input/textarea/select element."""
        if field.name == "textarea":
            return field.get_text()
        if field.name == "select":
            options = field.find_all("option")
            if not options:
                return ""
            # Prefer the pre-selected option, otherwise the first one.
            chosen = next((o for o in options if o.has_attr("selected")), options[0])
            return chosen.get("value", chosen.get_text(strip=True))
        return field.get("value", "")

    def extract_forms(self, html, page_url):
        """Extract form details (method, action, inputs).

        Args:
            html: Page markup.
            page_url: URL the page was served from; relative form actions are
                resolved against it.

        Returns:
            List of dicts with keys ``method`` (lower-cased, default "get"),
            ``action`` (absolute URL) and ``inputs``. Each entry in
            ``inputs`` has ``name``, ``type`` and ``value``. ``<textarea>``
            and ``<select>`` elements are included, with the tag name as
            their ``type``.
        """
        soup = BeautifulSoup(html, "html.parser")
        forms = []
        for form in soup.find_all("form"):
            details = {
                "method": form.get("method", "get").lower(),
                "action": urljoin(page_url, form.get("action", "")),
                "inputs": [],
            }
            for field in form.find_all(["input", "textarea", "select"]):
                if field.name == "input":
                    field_type = field.get("type", "text")
                else:
                    field_type = field.name
                details["inputs"].append({
                    "name": field.get("name"),
                    "type": field_type,
                    "value": self._field_value(field),
                })
            forms.append(details)
        return forms

    def crawl(self):
        """Main crawl loop.

        Pops URLs from the queue until it is empty or ``max_pages`` URLs have
        been visited. Each fetched page is stored in ``self.pages``, its
        forms in ``self.forms``, and its in-scope links are appended to the
        queue. URLs that fail to download are reported, marked visited and
        skipped.

        Returns:
            ``{"pages": self.pages, "forms": self.forms}``.
        """
        while self.queue and len(self.visited) < self.max_pages:
            url = self.queue.pop(0).rstrip("/")
            if url in self.visited:
                continue

            # Mark up front so a failing URL is never retried.
            self.visited.add(url)

            try:
                resp = self.session.get(url, timeout=self.timeout)
            except requests.RequestException as e:
                print(f" Error fetching {url}: {e}")
                continue

            html = resp.text
            self.pages[url] = html

            # Resolve relative references against the URL that actually
            # served the page (after redirects), not the normalised request
            # URL whose trailing slash was stripped.
            page_url = resp.url or url

            forms = self.extract_forms(html, page_url)
            if forms:
                self.forms[url] = forms

            for link in self.extract_links(html, page_url):
                if link not in self.visited and link not in self.queue:
                    self.queue.append(link)

            print(f"✅ Crawled: {url}")

            time.sleep(self.delay)

        return {"pages": self.pages, "forms": self.forms}



if __name__ == "__main__":
    # Demo run: crawl up to ten pages of the site at localhost:8080 and
    # print the crawled URLs followed by the forms found on each of them.
    base_url = "http://localhost:8080"
    crawler = DVWACrawler(
        base_url,
        username="admin",
        password="password",
        max_pages=10,
        delay=1,
    )
    results = crawler.crawl()
    crawled_pages = results["pages"]
    found_forms = results["forms"]

    print("\n=== Pages Crawled ===")
    for page_url in crawled_pages:
        print(page_url)

    print("\n=== Forms Found ===")
    for page_url, page_forms in found_forms.items():
        print(f"\nURL: {page_url}")
        for form_details in page_forms:
            print(form_details)


✅ Crawled: http://localhost:8080
✅ Crawled: http://localhost:8080/instructions.php
✅ Crawled: http://localhost:8080/setup.php
✅ Crawled: http://localhost:8080/vulnerabilities/brute
✅ Crawled: http://localhost:8080/vulnerabilities/exec
✅ Crawled: http://localhost:8080/vulnerabilities/csrf
✅ Crawled: http://localhost:8080/vulnerabilities/fi/?page=include.php
✅ Crawled: http://localhost:8080/vulnerabilities/upload
✅ Crawled: http://localhost:8080/vulnerabilities/captcha
✅ Crawled: http://localhost:8080/vulnerabilities/sqli

=== Pages Crawled ===
http://localhost:8080
http://localhost:8080/instructions.php
http://localhost:8080/setup.php
http://localhost:8080/vulnerabilities/brute
http://localhost:8080/vulnerabilities/exec
http://localhost:8080/vulnerabilities/csrf
http://localhost:8080/vulnerabilities/fi/?page=include.php
http://localhost:8080/vulnerabilities/upload
http://localhost:8080/vulnerabilities/captcha
http://localhost:8080/vulnerabilities/sqli

=== Forms Found ===

URL: http://l