In [67]:
from typing import Optional
import re
import tldextract

class NepaliUrlValidator:
    def __init__(self):
        '''
            pre-defined site specific patterns/rules for inentifying Nepali content
            e.g. bbc only contains nepali content under /nepali section

            # format for site_patterns
            {
                <tldextract.extract(url).registered_domain>: [<site_pattern1>, <site_pattern2>, ...],
                ...
            }

            
            # Patterns match using regex

            pattern: 
                (?!...) syntax means NOT operation of reges inside the brackets
                ^ is start of the string
                $ is end of string
                .* matches any character 0 or more times
                \/ is / <forward slash> with escape_character \  <backward slash> in front
                \. is . <dot> with escape_character \  <backward slash> in front
        '''
        # Define patterns for different news sites
        # Format: {domain: [pattern1, pattern2, ...]}
        self.site_patterns = {
            'bbc.co.uk': [
                r'^.*\/nepali\/.*$'
                # r'/nepali(?:/|$)'
            ],
            "ekagaj.com": [r'^(?!.*en\.).*$'],          # avoid https://en.ekagaj.com/
            "himalpress.com": [r'^(?!.*en\.).*$'],  # avoid # avoid https://en.himalpress.com/govt-spent-rs-107-66-billion-on-agriculture-subsidies-in-past-five-years/
            "nepalbahas.com": [r'^(?!.*en\.).*$'],
            "nayapage.com": [r'^(?!.*en\.).*$'],
            "nepalkhabar.com": [r'^(?!.*en\.).*$'],
            "setopati.com": [r'^(?!.*en\.).*$'],
            
            "nepalgunjnews.com": [r'^(?!.*\/english\/).*$'],                      # avoid https://www.nepalgunjnews.com/english/20230868906/
            "bbc.com": [r'^.*\/nepali\/.*$'],                                     # only follow /nepali ()
            
            "deshsanchar.com": [r'^(?!.*\/english\.).*$'],   # avoid https://english.deshsanchar.com/nepal-india-jbf-meeting-emphasis-on-expansion-of-bilateral-trade/
            "aarthiknews.com": [r'^(?!.*\/english\.).*$'],   # avoid https://english.aarthiknews.com/news/detail/17669/
            "corporatenepal.com": [r'^(?!.*\/english\.).*$'], # avoid https://english.merolifestyle.com/?p=2192
            "nepalpage.com": [r'^(?!.*\/english\.).*$'],           # avoid https://english.nepalpage.com/2022/12/like-walking-on-missiles-us-airman-recalls-the-horror-of-the-vietnam-christmas-bombings-50-years-on/
            "lokaantar.com": [r'^(?!.*\/english\.).*$'],           # avoid https://english.lokaantar.com/news/detail/33475/
            "dhangadhikhabar.com": [r'^(?!.*\/english\.).*$'],   # avoid https://english.dhangadhikhabar.com/news/73691
            "khabarhub.com": [r'^(?!.*\/english\.).*$'],               # avoid https://english.khabarhub.com/2025/12/427719/
            "pardafas.com": [r'^(?!.*\/english\.).*$'],
            "makalukhabar.com": [r'^(?!.*\/english\.).*$'],
            "kathmandupati.com": [r'^(?!.*\/english\.).*$'],
            "annapurnapost.com": [r'^(?!.*\/english\.).*$'],
            "madheshvani.com": [r'^(?!.*\/english\.).*$'],
            "nepalwatch.com": [r'^(?!.*\/english\.).*$'],
            "dcnepal.com": [r'^(?!.*\/english\.).*$'],
            "karobardaily.com": [r'^(?!.*\/english\.).*$']
        }
    
    def is_probable_nepali_content_url(self, url: str) -> bool:
        """
        Check if the given URL contains Nepali content based on predefined patterns.
        return True if url is not in predefined patterns
        Args:
            url (str): The URL to check
            
        Returns:
            bool: whether it is a probable Nepali content URL, None otherwise
        """
        try:
            domain = tldextract.extract(url).registered_domain

            # First check if we have patterns for this domain
            if domain in self.site_patterns:
                # print(f' domain: {domain}, path:{path}')
                # Check if URL matches any of the site's Nepali content patterns
                for pattern in self.site_patterns[domain]:
                    if re.search(pattern, url):
                        return True
                    else:
                        return False
            else:
                # print(f'patterns not defined for {domain}')
                # If no patterns are defined, return True (probable nepali content)
                return True
        except Exception as e:
            print(f"Error processing URL {url}: {str(e)}")
            return None
    
    def get_all_sites(self) -> List[str]:
        """
        Get list of all supported sites.
        
        Returns:
            List[str]: List of supported netlocs
        """
        return list(self.site_patterns.keys())

In [87]:
# Create validator instance
validator = NepaliUrlValidator()

# Test some URLs
test_urls = [
    {'https://www.bbc.com/nepali/news-12345':True},
    {'https://www.bbc.com/english/news':False},
    
    {"http://www.bbc.co.uk/nepali/news/2014/09/140924_ebola_death_rates.shtml":True},
    {"http://www.bbc.co.uk/sports/some_sport_news":False},
    # "https://www.bbc.com/nepali/articles/c3e3wpd7n11o",
    

    {'https://www.nepalgunjnews.com/main-news/20250177225/':True},
    {'https://www.nepalgunjnews.com/english/20230868906/':False},

    {'https://www.karobardaily.com/news/277113':True},
    {'https://english.karobardaily.com/2025/01/12/oag-successful-in-68-27-per-cent-cases-in-fiscal-year-2023-24/':False},

    {'https://ekagaj.com/article/government/188701/':True},
    {'https://en.ekagaj.com/articles/nc-leader-dr-shekhar-koirala-delivers-speech-in-british-parliament/':False},
    
]

for data in test_urls:
    url, value = list(data.items())[0]
    # print(f'url:{url}, value:{value}')
    is_valid_nepali_site = validator.is_probable_nepali_content_url(url)
    assert(is_valid_nepali_site == value, "url:{url} is expected:{value} got:{is_valid_nepali_site}")
    if is_valid_nepali_site:
        print(f"✓ {url} is a Nepali content URL from {site}")
    else:
        print(f"✗ {url} is not a Nepali content URL")

# Check supported sites
print("\nSupported sites:")
for site in validator.get_all_sites():
    print(f"- {site}")

✓ https://www.bbc.com/nepali/news-12345 is a Nepali content URL from karobardaily.com
✗ https://www.bbc.com/english/news is not a Nepali content URL
✓ http://www.bbc.co.uk/nepali/news/2014/09/140924_ebola_death_rates.shtml is a Nepali content URL from karobardaily.com
✗ http://www.bbc.co.uk/sports/some_sport_news is not a Nepali content URL
✓ https://www.nepalgunjnews.com/main-news/20250177225/ is a Nepali content URL from karobardaily.com
✗ https://www.nepalgunjnews.com/english/20230868906/ is not a Nepali content URL
✓ https://www.karobardaily.com/news/277113 is a Nepali content URL from karobardaily.com
✗ https://english.karobardaily.com/2025/01/12/oag-successful-in-68-27-per-cent-cases-in-fiscal-year-2023-24/ is not a Nepali content URL
✓ https://ekagaj.com/article/government/188701/ is a Nepali content URL from karobardaily.com
✗ https://en.ekagaj.com/articles/nc-leader-dr-shekhar-koirala-delivers-speech-in-british-parliament/ is not a Nepali content URL

Supported sites:
- bbc.co

  assert(is_valid_nepali_site == value, "url:{url} is expected:{value} got:{is_valid_nepali_site}")


## References
* regex-tutorials [regexone](https://regexone.com/)