In [1]:
from urllib.parse import urlparse

In [2]:
from crawl4ai import AsyncWebCrawler  
import asyncio 
async def extract_urls_crawl(base_url):
    """Crawl ``base_url`` once and collect its labelled internal links.

    Only entries of ``result.links["internal"]`` that have an ``href`` and a
    non-blank ``title`` or ``text`` are kept.

    Args:
        base_url: Address of the page to crawl.

    Returns:
        dict with three parallel lists: ``"links"`` (hrefs), ``"titles"``
        (the stripped title, or the stripped text when the title is blank)
        and ``"base_domains"``.

    Raises:
        RuntimeError: If the crawl result reports ``success`` as false.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=base_url)

    if not result.success:
        # Say which URL failed and why, instead of raising a bare RuntimeError.
        reason = getattr(result, "error_message", None) or "no error message reported"
        raise RuntimeError(f"Crawl of {base_url!r} failed: {reason}")

    valid_links = []
    titles = []
    base_domains = []
    # .get() tolerates a result without an "internal" bucket.
    for link in result.links.get("internal", []):
        # Strip before choosing the label so a whitespace-only title cannot
        # win over a meaningful link text.
        title = (link.get("title") or "").strip()
        text = (link.get("text") or "").strip()
        label = title or text
        href = link.get("href")
        if not label or not href:
            continue
        valid_links.append(href)
        titles.append(label)
        base_domains.append(link.get("base_domain"))

    return {
        "links": valid_links,
        "titles": titles,
        "base_domains": base_domains,
    }

In [34]:
# Crawl the university home page once and collect its labelled internal links.
links_data = await extract_urls_crawl("https://www.sandipuniversity.edu.in/")

[INIT].... → Crawl4AI 0.4.247
[FETCH]... ↓ https://www.sandipuniversity.edu.in/... | Status: True | Time: 0.01s
[COMPLETE] ● https://www.sandipuniversity.edu.in/... | Status: True | Total: 0.02s


In [35]:
import pandas as pd 

# One row per collected link; the columns are the keys of links_data
# (links, titles, base_domains).
df = pd.DataFrame(links_data) 

In [36]:
# Show the collected links table.
df

Unnamed: 0,links,titles,base_domains
0,https://www.sandipuniversity.edu.in/brochure_d...,Download E-Brochures,sandipuniversity.edu.in
1,https://www.sandipuniversity.edu.in/jobs/jobs.php,Careers,sandipuniversity.edu.in
2,https://www.sandipuniversity.edu.in/loan-facil...,Loan Facilities,sandipuniversity.edu.in
3,https://www.sandipuniversity.edu.in/internatio...,International Affairs Cell,sandipuniversity.edu.in
4,https://www.sandipuniversity.edu.in/,Get Connected,sandipuniversity.edu.in
...,...,...,...
88,https://www.sandipuniversity.edu.in/pdf/Ombuds...,Ombuds Person,sandipuniversity.edu.in
89,https://www.sandipuniversity.edu.in/iqac/rti.php,Statutory Declaration Under Section 4(1)(b) of...,sandipuniversity.edu.in
90,https://www.sandipuniversity.edu.in/info@sandi...,info@sandipuniversity.edu.in,sandipuniversity.edu.in
91,https://www.sandipuniversity.edu.in/privacy-po...,Privacy Policy,sandipuniversity.edu.in


In [37]:

# Check for duplicates: count how often each link occurs and keep only the
# links that occur more than once (an empty result means every link is unique).
value_counts = df["links"].value_counts()
duplicated_values = value_counts.loc[value_counts.gt(1)]
duplicated_values

Series([], Name: count, dtype: int64)

In [38]:
# Drop fully identical rows first, then any row that has a missing value.
df = df.drop_duplicates().dropna()
df

Unnamed: 0,links,titles,base_domains
0,https://www.sandipuniversity.edu.in/brochure_d...,Download E-Brochures,sandipuniversity.edu.in
1,https://www.sandipuniversity.edu.in/jobs/jobs.php,Careers,sandipuniversity.edu.in
2,https://www.sandipuniversity.edu.in/loan-facil...,Loan Facilities,sandipuniversity.edu.in
3,https://www.sandipuniversity.edu.in/internatio...,International Affairs Cell,sandipuniversity.edu.in
4,https://www.sandipuniversity.edu.in/,Get Connected,sandipuniversity.edu.in
...,...,...,...
88,https://www.sandipuniversity.edu.in/pdf/Ombuds...,Ombuds Person,sandipuniversity.edu.in
89,https://www.sandipuniversity.edu.in/iqac/rti.php,Statutory Declaration Under Section 4(1)(b) of...,sandipuniversity.edu.in
90,https://www.sandipuniversity.edu.in/info@sandi...,info@sandipuniversity.edu.in,sandipuniversity.edu.in
91,https://www.sandipuniversity.edu.in/privacy-po...,Privacy Policy,sandipuniversity.edu.in


In [51]:
################### Link Filtering ###############
# Path fragments that mark admin, asset, system or utility areas of a site.
_EXCLUDED_PATH_FRAGMENTS = (
    # CMS and admin systems (these rarely contain valuable content)
    '/wp-admin/', '/wp-includes/', '/wp-json/', '/wp-cron/',
    '/wp-login/', '/administrator/', '/admin/', '/cpanel/',
    '/dashboard/', '/manage/', '/control/', '/login/', '/register/',
    '/signup/', '/account/', '/auth/', '/oauth/', '/sso/',

    # Core technical assets (never contain readable content)
    '/css/', '/js/', '/fonts/', '/dist/', '/build/',
    '/static/css/', '/static/js/', '/assets/css/', '/assets/js/',
    '/cdn-cgi/', '/webpack/', '/node_modules/',

    # Media and binary files (typically not useful for knowledge extraction)
    '/images/', '/img/', '/thumbnails/', '/thumbs/', '/icons/',
    '/uploads/images/', '/assets/images/',

    # Temporary and system
    '/temp/', '/tmp/', '/cache/', '/backup/',
    '/logs/', '/status/', '/debug/',

    # eCommerce and shopping (typically low-knowledge content)
    '/cart/', '/basket/', '/checkout/',
    '/payment/', '/transaction/',

    # Tracking and analytics
    '/tracking/', '/analytics/', '/pixel/', '/beacon/',

    # Utility and functionality
    '/search/', '/feed/', '/rss/', '/sitemap/', '/api/',
)

# Social network hosts; matched against the URL's hostname, including subdomains.
_SOCIAL_DOMAINS = frozenset({
    'facebook.com', 'fb.com', 'twitter.com', 'x.com',
    'instagram.com', 'linkedin.com', 'youtube.com',
    'pinterest.com', 'tiktok.com', 'snapchat.com',
    'reddit.com', 'tumblr.com', 'discord.com', 'twitch.tv',
    'medium.com', 'quora.com', 't.me', 'wa.me',
})

# File extensions of non-page resources (a tuple, so str.endswith can take it directly).
_EXCLUDED_EXTENSIONS = (
    # Documents
    '.pdf', '.doc', '.docx', '.txt', '.rtf',
    '.ppt', '.pptx', '.xls', '.xlsx', '.csv',
    '.odt', '.ods', '.odp', '.pages', '.numbers',
    '.key', '.epub', '.mobi',

    # Images
    '.jpg', '.jpeg', '.png', '.gif', '.bmp',
    '.svg', '.webp', '.tiff', '.ico', '.psd',
    '.ai', '.eps',

    # Audio/Video
    '.mp3', '.wav', '.ogg', '.m4a', '.wma',
    '.mp4', '.avi', '.mov', '.wmv', '.flv',
    '.webm', '.mkv', '.m4v',

    # Archives
    '.zip', '.rar', '.7z', '.tar', '.gz',
    '.bz2', '.iso',

    # Web assets
    '.css', '.js', '.jsx', '.ts', '.tsx',
    '.json', '.xml', '.yaml', '.yml',
    '.woff', '.woff2', '.ttf', '.eot',
    '.map', '.min.js', '.min.css',

    # Configuration
    '.conf', '.config', '.ini', '.env',
    '.htaccess', '.htpasswd',
)

# Query-parameter names that indicate pagination, tracking, session or view state.
_EXCLUDED_QUERY_KEYS = frozenset({
    'page', 'sort', 'filter', 'tag',
    'category', 'lang', 'ref', 'source',
    'fbclid', 'gclid', 'sid',
    'session', 'token', 'auth', 'key',
    'id', 'date', 'version', 'v',
    'format', 'view', 'layout', 'type',
    'redirect', 'return', 'callback',
    'query', 'search', 'keywords',
    'limit', 'offset', 'start', 'end',
    'from', 'to', 'dir', 'order',
    'print', 'download', 'preview',
})

# Query-parameter name prefixes that are always excluded (campaign tracking).
_EXCLUDED_QUERY_PREFIXES = ('utm_',)


def is_content_url(url):
    """Return True when ``url`` looks like a content page worth crawling.

    A URL is rejected when any of the following holds:

    * it is not a non-empty string;
    * its hostname is a known social-media domain or a subdomain of one;
    * its path ends with an excluded file extension;
    * its query string has an excluded parameter name or a ``utm_`` parameter;
    * its path contains an excluded fragment such as ``/admin/`` or ``/api/``;
    * its path consists of digits only;
    * its path has three or more segments, or splits into more than four
      ``/``-separated parts.

    Every check works on the parsed URL component it is about (hostname, path,
    query) rather than on the raw URL text, so ``netflix.com`` is not mistaken
    for ``x.com``, ``https://example.ai`` is not mistaken for an ``.ai`` file,
    and ``photo=3`` is not mistaken for ``to=``. A URL without a scheme has no
    hostname, so the social-media check does not apply to it.

    Args:
        url: Absolute URL to test.

    Returns:
        bool: True if the URL passes every filter, otherwise False.
    """
    from urllib.parse import parse_qsl, urlparse

    if not isinstance(url, str) or not url.strip():
        return False

    parsed_url = urlparse(url.strip())
    path = parsed_url.path.lower()
    host = parsed_url.hostname or ''  # already lower-cased by urlparse

    # Exact host or subdomain match only.
    if any(host == domain or host.endswith('.' + domain) for domain in _SOCIAL_DOMAINS):
        return False

    # The extension belongs to the path, not to the host, query or fragment.
    if path.endswith(_EXCLUDED_EXTENSIONS):
        return False

    # Compare whole parameter names instead of substrings of the URL.
    for key, _ in parse_qsl(parsed_url.query, keep_blank_values=True):
        key = key.lower()
        if key in _EXCLUDED_QUERY_KEYS or key.startswith(_EXCLUDED_QUERY_PREFIXES):
            return False

    # Add a trailing slash so a final segment such as "/admin" also matches "/admin/".
    closed_path = path if path.endswith('/') else path + '/'
    if any(fragment in closed_path for fragment in _EXCLUDED_PATH_FRAGMENTS):
        return False

    # Purely numeric paths, e.g. "/2024/".
    if path.strip('/').isdigit():
        return False

    # Depth limits, both kept from the original filter: at most four
    # "/"-separated parts and at most two non-empty segments.
    if len(path.split('/')) > 4:
        return False
    path_segments = [segment for segment in path.split('/') if segment]
    if len(path_segments) >= 3:
        return False

    return True

def filter_df(row):
    """Row-wise filter for ``DataFrame.apply(..., axis=1)``.

    Returns ``row`` unchanged when its ``"links"`` value passes
    ``is_content_url``; otherwise returns ``None``.
    """
    keep = is_content_url(row["links"])
    return row if keep else None
    

In [60]:
# Work on a copy so the filtering steps that follow leave df itself untouched.
duplicate_df = df.copy()  

In [61]:
# Keep only the rows whose link passes is_content_url. A boolean mask always
# yields a DataFrame with the original dtypes, whereas apply(filter_df, axis=1)
# mixes row and None results, so its output shape can depend on what the first
# row returns.
content_mask = duplicate_df["links"].map(is_content_url).astype(bool)
filtered_df = duplicate_df.loc[content_mask].dropna()

In [62]:
# Display with a fresh 0..n-1 index. The result is not assigned, so
# filtered_df itself keeps its original index labels.
filtered_df.reset_index(drop=True)  

Unnamed: 0,links,titles,base_domains
0,https://www.sandipuniversity.edu.in/brochure_d...,Download E-Brochures,sandipuniversity.edu.in
1,https://www.sandipuniversity.edu.in/jobs/jobs.php,Careers,sandipuniversity.edu.in
2,https://www.sandipuniversity.edu.in/loan-facil...,Loan Facilities,sandipuniversity.edu.in
3,https://www.sandipuniversity.edu.in/internatio...,International Affairs Cell,sandipuniversity.edu.in
4,https://www.sandipuniversity.edu.in/,Get Connected,sandipuniversity.edu.in
...,...,...,...
80,https://www.sandipuniversity.edu.in/commitees.php,Committees,sandipuniversity.edu.in
81,https://www.sandipuniversity.edu.in/iqac/rti.php,Statutory Declaration Under Section 4(1)(b) of...,sandipuniversity.edu.in
82,https://www.sandipuniversity.edu.in/info@sandi...,info@sandipuniversity.edu.in,sandipuniversity.edu.in
83,https://www.sandipuniversity.edu.in/privacy-po...,Privacy Policy,sandipuniversity.edu.in


In [63]:
# Trim stray whitespace around every link. One vectorised .str.strip() is
# enough: the per-element isinstance/strip pass that used to precede it gave
# the same final column.
filtered_df["links"] = filtered_df["links"].str.strip()

In [65]:
# Display the whitespace-trimmed table with a fresh 0..n-1 index (not assigned back).
filtered_df.reset_index(drop=True)

Unnamed: 0,links,titles,base_domains
0,https://www.sandipuniversity.edu.in/brochure_d...,Download E-Brochures,sandipuniversity.edu.in
1,https://www.sandipuniversity.edu.in/jobs/jobs.php,Careers,sandipuniversity.edu.in
2,https://www.sandipuniversity.edu.in/loan-facil...,Loan Facilities,sandipuniversity.edu.in
3,https://www.sandipuniversity.edu.in/internatio...,International Affairs Cell,sandipuniversity.edu.in
4,https://www.sandipuniversity.edu.in/,Get Connected,sandipuniversity.edu.in
...,...,...,...
80,https://www.sandipuniversity.edu.in/commitees.php,Committees,sandipuniversity.edu.in
81,https://www.sandipuniversity.edu.in/iqac/rti.php,Statutory Declaration Under Section 4(1)(b) of...,sandipuniversity.edu.in
82,https://www.sandipuniversity.edu.in/info@sandi...,info@sandipuniversity.edu.in,sandipuniversity.edu.in
83,https://www.sandipuniversity.edu.in/privacy-po...,Privacy Policy,sandipuniversity.edu.in


In [None]:
# Crawl one of the pages found above to collect the links one level deeper.
next_links = await extract_urls_crawl("https://www.sandipuniversity.edu.in/commitees.php") 

In [2]:
from crawl4ai import CrawlerRunConfig 
import time 

In [None]:
class RecurrsiveCrawler:
    """Depth-limited crawler that records the labelled internal links it finds.

    Starting at ``base_url``, each page is fetched once, its labelled internal
    links are recorded, and those links are then followed in turn until
    ``max_depth`` is reached.

    Attributes:
        result_df: DataFrame with the columns in ``RESULT_COLUMNS``; filled by
            ``crawl()``.
        crawled_urls: Set of page URLs that have already been fetched.
        elapsed_seconds: Wall-clock duration of the last ``crawl()`` call
            (set by ``crawl()``).
    """

    # Column order of ``result_df``.
    RESULT_COLUMNS = ["link", "base_domain", "title", "parent_link", "depth"]

    def __init__(self , base_url : str , max_depth : int = 3 , max_links : int = 20 , external_links : bool = False , social_media_links : bool = False):
        """Configure the crawler.

        Args:
            base_url: Page at which the crawl starts (depth 0).
            max_depth: Pages at depth ``>= max_depth`` are not fetched.
            max_links: Maximum number of links recorded and followed per
                fetched page; ``None`` means no limit.
            external_links: Whether external links are kept in the crawl
                result (default False, i.e. they are excluded).
            social_media_links: Whether social-media links are kept in the
                crawl result (default False, i.e. they are excluded).
        """
        self.base_url = base_url
        self.max_depth = max_depth
        self.max_links = max_links
        self.result_df = pd.DataFrame(columns=self.RESULT_COLUMNS)
        # The flags say "include", the config options say "exclude", so negate.
        self.crawl_config = CrawlerRunConfig(
            exclude_external_links=not external_links,
            exclude_social_media_links=not social_media_links,
        )
        self.crawled_urls = set()
        # Rows are collected in a list and turned into a DataFrame once,
        # instead of concatenating a frame per row.
        self._rows = []

    async def crawl(self) -> "pd.DataFrame":
        """Run the crawl from ``base_url`` and return the de-duplicated result.

        State from a previous run is discarded, so the method can be called
        again on the same instance.

        Returns:
            ``result_df``: one row per distinct link (first occurrence wins).
        """
        started = time.perf_counter()
        self._rows = []
        self.crawled_urls = set()

        await self.extract_recurrsive(self.base_url , 0 , None)

        self.result_df = (
            pd.DataFrame(self._rows , columns=self.RESULT_COLUMNS)
            .drop_duplicates(subset=["link"])
            .reset_index(drop=True)
        )
        self.elapsed_seconds = time.perf_counter() - started
        return self.result_df

    async def extract_recurrsive(self , url , current_depth , parent_link) :
        """Fetch ``url``, record its links, then follow each of them.

        Args:
            url: Page to fetch.
            current_depth: Depth of ``url`` (``base_url`` is 0).
            parent_link: Page on which ``url`` was found; only used in the
                failure message.

        Each recorded row stores the link, its title and base domain, the page
        it was found on (``parent_link`` column) and the depth of that page
        (``depth`` column).
        """
        if current_depth >= self.max_depth or url in self.crawled_urls :
            return
        # Mark before fetching so cyclic links cannot trigger a second fetch.
        self.crawled_urls.add(url)

        try :
            links = await self._extract_urls(url)
        except Exception as exc :
            # Best effort: one unreachable page must not abort the whole crawl.
            print(f"[RecurrsiveCrawler] failed to crawl {url!r} (linked from {parent_link!r}): {exc!r}")
            return

        found = links["links"][: self.max_links]
        for i , link in enumerate(found) :
            self._rows.append({
                "link" : link ,
                "title" : links["titles"][i] if i < len(links["titles"]) else " " ,
                "base_domain" : links["base_domains"][i] if i < len(links["base_domains"]) else " " ,
                "parent_link" : url ,
                "depth" : current_depth ,
            })

        for link in found :
            await self.extract_recurrsive(link , current_depth + 1 , url)

    async def _extract_urls(self , url ):
        """Fetch ``url`` and return its labelled internal links.

        Returns:
            dict with parallel lists ``"links"``, ``"titles"`` and
            ``"base_domains"``; all three are empty when the crawl result
            reports ``success`` as false.
        """
        async with AsyncWebCrawler() as crawler :
            result = await crawler.arun(
                url = url ,
                config=self.crawl_config
            )

        valid_links = []
        titles = []
        base_domains = []
        if result.success :
            for link in result.links.get("internal" , []) :
                # Strip first so a whitespace-only title cannot win over the text.
                title = (link.get("title") or "").strip()
                text = (link.get("text") or "").strip()
                label = title or text
                href = link.get("href")
                if not label or not href :
                    continue
                valid_links.append(href)
                titles.append(label)
                base_domains.append(link.get("base_domain"))

        return {
            "links" : valid_links ,
            "titles" : titles ,
            "base_domains" : base_domains
        }
                    
                
             
                