In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
import time
import pandas as pd

In [2]:
def _page_title(soup):
    """Return the stripped text of the page's <title>, or 'No Title'."""
    title_tag = soup.title
    # Tag.string is None when <title> is empty or wraps nested markup, so it
    # must be guarded before calling .strip().
    title_text = title_tag.string if title_tag is not None else None
    return (title_text or '').strip() or 'No Title'


def _same_site_links(soup, page_url, domain, skip_patterns):
    """Yield absolute, fragment-free links on the page that stay on `domain`.

    Links whose URL contains any of `skip_patterns` (plain substring match)
    are left out.
    """
    for anchor in soup.find_all('a', href=True):
        try:
            # Relative hrefs are relative to the page they appear on.
            target = urlparse(urljoin(page_url, anchor['href']))
        except ValueError:
            # Malformed href; ignore this link but keep the rest of the page.
            continue
        if target.netloc != domain:
            continue
        # '#section' variants point at the same document; drop the fragment
        # so the page is only crawled once.
        link = target._replace(fragment='').geturl()
        if any(pattern in link for pattern in skip_patterns):
            continue
        yield link


def crawl_website(base_url, max_pages=100, skip_patterns=None, timeout=10):
    """Breadth-first crawl of the pages reachable from `base_url` on its domain.

    Args:
        base_url: URL to start from; only links whose netloc equals this
            URL's netloc are followed.
        max_pages: stop once this many pages have been collected.
        skip_patterns: optional list of substrings; a discovered link that
            contains any of them is not followed. The start URL itself is
            not filtered.
        timeout: seconds passed to `requests.get` for each request.

    Returns:
        A list of `(title, url)` tuples, one per page that answered with
        HTTP 200, in crawl order. Pages without a usable <title> get the
        title 'No Title'.

    Progress, per-URL errors and the total elapsed time are printed.
    """
    start_time = time.time()
    skip_patterns = skip_patterns or []
    domain = urlparse(base_url).netloc
    # Every URL that has ever been queued. Recording URLs at enqueue time
    # keeps duplicates out of the queue and guarantees that a URL which
    # fails (error or non-200) is requested only once.
    seen = {base_url}
    queue = deque([base_url])
    results = []

    while queue and len(results) < max_pages:
        url = queue.popleft()

        try:
            # requests has no default timeout; without one a stalled server
            # would block the crawl indefinitely.
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            title = _page_title(soup)
            results.append((title, url))

            print(f'[INFO] page# {len(results)} {title}: {url}')

            # response.url is the final URL after redirects, which is the
            # correct base for resolving the page's relative links.
            page_url = response.url or url
            for link in _same_site_links(soup, page_url, domain, skip_patterns):
                if link not in seen:
                    seen.add(link)
                    queue.append(link)
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f'[ERROR] {url} : {e}')
            continue

    end_time = time.time() - start_time
    print(f'[INFO] Finished Script. Time Take {end_time}s')
    return results

In [3]:
# URL fragments to exclude from the crawl. crawl_website matches them as plain
# substrings of each discovered link, so '/ar' also excludes any path that
# merely begins with those characters (e.g. '/archive').
skip_patterns = ['/ar', '/upload', '/video']
results = crawl_website(
    base_url='https://hbtf.com/en',
    max_pages=1000,
    skip_patterns=skip_patterns,
)

[INFO] page# 1 The Housing Bank for Trade and Finance - Housing Bank: https://hbtf.com/en
[INFO] page# 2 Contact Us - Housing Bank: https://hbtf.com/en/contact
[INFO] page# 3 Retail - Housing Bank: https://hbtf.com/en/retail
[INFO] page# 4 Edge - Housing Bank: https://hbtf.com/en/retail/programs/edge
[INFO] page# 5 Corporate Banking Sector - Housing Bank: https://hbtf.com/en/corporate
[INFO] page# 6 Treasury & Investment - Housing Bank: https://hbtf.com/en/treasury-investment
[INFO] page# 7 بنك الإسكان للتجارة والتمويل - بنك الاسكان: https://hbtf.com/?language=en
[INFO] page# 8 About HBTF - Housing Bank: https://hbtf.com/en/the-bank
[INFO] page# 9 Board of Director - Housing Bank: https://hbtf.com/en/board-of-director
[INFO] page# 10 Management - Housing Bank: https://hbtf.com/en/management
[INFO] page# 11 Investor's Center - Housing Bank: https://hbtf.com/en/investor-relations
[INFO] page# 12 Corporate Social Responsibility - Housing Bank: https://hbtf.com/en/communities
[INFO] page# 

In [4]:
# Split the (title, url) pairs returned by the crawl into two parallel lists.
names = [title for title, _ in results]
urls = [link for _, link in results]

In [5]:
# One row per crawled page: its title and its URL.
df = pd.DataFrame(dict(Name=names, URL=urls))

In [6]:
# Write the table to disk without the DataFrame index column.
df.to_csv(path_or_buf='new_script.csv', index=False)

In [7]:
# Echo the crawl output (a list of (title, url) tuples) as this cell's result.
results

[('The Housing Bank for Trade and Finance - Housing Bank',
  'https://hbtf.com/en'),
 ('Contact Us - Housing Bank', 'https://hbtf.com/en/contact'),
 ('Retail - Housing Bank', 'https://hbtf.com/en/retail'),
 ('Edge - Housing Bank', 'https://hbtf.com/en/retail/programs/edge'),
 ('Corporate Banking Sector - Housing Bank', 'https://hbtf.com/en/corporate'),
 ('Treasury & Investment - Housing Bank',
  'https://hbtf.com/en/treasury-investment'),
 ('بنك الإسكان للتجارة والتمويل - بنك الاسكان',
  'https://hbtf.com/?language=en'),
 ('About HBTF - Housing Bank', 'https://hbtf.com/en/the-bank'),
 ('Board of Director - Housing Bank', 'https://hbtf.com/en/board-of-director'),
 ('Management - Housing Bank', 'https://hbtf.com/en/management'),
 ("Investor's Center - Housing Bank",
  'https://hbtf.com/en/investor-relations'),
 ('Corporate Social Responsibility - Housing Bank',
  'https://hbtf.com/en/communities'),
 ('Annual Reports - Housing Bank', 'https://hbtf.com/en/annual-reports'),
 ('Working at HB