# Find the Country Name from a URL

In [2]:
!pip install tldextract
!pip install geoip2
!pip install pycountry
!pip install python-whois

Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting filelock>=3.0.8 (from tldextract)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filelock-3.18.0-py3-none-any.whl (16 kB)
Downloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: filelock, requests-file, tldextract
Successfully installed filelock-3.18.0 requests-file-2.1.0 tldextract-5.3.0
Collecting geoip2
  Downloading geoip2-5.0.1-py3-none-any.whl.metadata (18 kB)
Collecting aiohttp<4.0.0,>=3.6.2 (from geoip2)
  Downloading aiohttp-3.11.18-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting

In [3]:
import pandas as pd
import tldextract
import socket
import geoip2.database
import pycountry
from urllib.parse import urlparse
import re

# 1) Load your GeoLite2 database
# The path is relative to the notebook's working directory, so the .mmdb file
# has to be present there before this cell runs.
GEOIP_DB_PATH = "GeoLite2-Country.mmdb"
# Module-level reader used by country_from_ip(); it is opened once here and is
# not closed anywhere in this cell.
geo_reader   = geoip2.database.Reader(GEOIP_DB_PATH)

def country_from_ip(url):
    """Return the GeoIP country name of the server behind ``url``, or None.

    The registrable domain (``domain`` + public ``suffix`` as split by
    tldextract) is resolved to an IPv4 address and looked up in the
    module-level ``geo_reader``. Sub-domains such as ``www`` are dropped
    before resolution.

    Args:
        url: URL or bare host name.

    Returns:
        ``resp.country.name`` from the GeoIP database, or None when the URL
        has no host part or the DNS / database lookup fails.

    NOTE(review): the name comes from the GeoIP database while the TLD and
    path helpers return pycountry names; presumably the two sources can spell
    the same country differently -- confirm before comparing results.
    """
    ext = tldextract.extract(url)
    # Join only the non-empty parts. Hosts without a public suffix
    # (e.g. "localhost" or an IP literal) would otherwise become "localhost."
    # with a trailing dot, and an empty host would become ".".
    domain = ".".join(part for part in (ext.domain, ext.suffix) if part)
    if not domain:
        return None
    try:
        ip = socket.gethostbyname(domain)
        resp = geo_reader.country(ip)
        return resp.country.name
    except Exception:
        # Deliberately best-effort: an unresolvable host or an address that is
        # missing from the database just means "no GeoIP answer".
        return None

def country_from_tld(url):
    """Map the URL's country-code TLD to a pycountry country name.

    Only the right-most label of the public suffix is considered
    (``co.uk`` -> ``uk``). Labels that are not exactly two characters long
    are ignored, and ``uk`` is translated to the ISO code ``GB``.

    Returns:
        The pycountry ``name`` for the code, or None when the TLD is not a
        two-letter label or pycountry has no entry for it.
    """
    suffix = tldextract.extract(url).suffix
    tld = suffix.lower().rsplit('.', 1)[-1]
    if len(tld) != 2:
        return None

    # The .uk registry does not use the ISO 3166 code (GB).
    alpha_2 = 'GB' if tld == 'uk' else tld.upper()
    match = pycountry.countries.get(alpha_2=alpha_2)
    return match.name if match else None

def country_from_path(url, min_token_len=4):
    """
    Return the first country that is spelled out in the URL path, else None.

    Each path segment is split on '-' and '_' and every resulting token is
    passed to ``pycountry.countries.lookup``, which matches case-insensitively.

    ``lookup`` also resolves ISO alpha-2 / alpha-3 codes, so ordinary slug
    words such as "and", "in" or "to" would be reported as Andorra, India or
    Tonga. Tokens shorter than ``min_token_len`` are therefore skipped; with
    the default of 4 only tokens long enough to be a written-out name are
    looked up.

    Args:
        url: URL whose path component is scanned.
        min_token_len: Minimum token length that is looked up. Pass 1 to
            also accept two- and three-letter codes.

    Returns:
        The pycountry ``name`` of the first matching token, or None.
    """
    path = urlparse(url).path.strip('/')
    for segment in path.split('/'):
        for token in re.split(r'[-_]', segment):
            # Empty tokens come from doubled separators or an empty path.
            if not token or len(token) < min_token_len:
                continue
            try:
                return pycountry.countries.lookup(token).name
            except LookupError:
                continue
    return None

def detect_country(url):
    """Guess the country for ``url`` by trying three signals in order.

    1. GeoIP location of the resolved host (``country_from_ip``)
    2. Country-code TLD (``country_from_tld``)
    3. Country mentioned in the URL path (``country_from_path``)

    The first truthy answer wins; if the first two are falsy, whatever the
    path lookup returns is passed through unchanged.
    """
    # `or` short-circuits, so later (cheaper-to-skip) probes only run when the
    # earlier ones came back empty.
    return (
        country_from_ip(url)
        or country_from_tld(url)
        or country_from_path(url)
    )

# Example: run the detection cascade over a few sample URLs and print the
# resulting url/country table.
df = pd.DataFrame({
    "url": [
        "https://www.lemonde.fr",
        "https://www.bbc.co.uk/news",
        "https://www.nytimes.com",
        "https://example.org",
        "https://www.stabroeknews.com/2025/01/05/news/guyana/hamas-and-israel-edge-towards-ceasefire"
    ]
}).assign(country=lambda frame: frame["url"].map(detect_country))

print(df)


                                                 url        country
0                             https://www.lemonde.fr         France
1                         https://www.bbc.co.uk/news  United States
2                            https://www.nytimes.com  United States
3                                https://example.org  United States
4  https://www.stabroeknews.com/2025/01/05/news/g...         Guyana


In [5]:
import pandas as pd
import tldextract
import socket
import geoip2.database
import pycountry
from urllib.parse import urlparse
import re

# GeoLite2 country database; the path is relative to the working directory.
GEOIP_DB_PATH = "GeoLite2-Country.mmdb"
# Rebinds the module-level reader used by country_from_ip().
# NOTE(review): the earlier cell already opened a Reader on this same path and
# that instance is not closed before being replaced here.
geo_reader = geoip2.database.Reader(GEOIP_DB_PATH)

def country_from_tld(url):
    """Derive a country name from the public suffix of ``url``.

    The suffix labels are scanned from right to left (``co.uk`` -> ``uk``
    first, then ``co``). The first two-character label that pycountry knows
    as an alpha-2 code decides the result; ``uk`` is treated as ``GB``.

    Returns:
        The pycountry ``name`` of the first matching label, or None.
    """
    labels = tldextract.extract(url).suffix.split('.')
    for label in labels[::-1]:
        if len(label) != 2:
            continue
        lowered = label.lower()
        # pycountry lists the United Kingdom under the ISO code GB, not UK.
        alpha_2 = 'GB' if lowered == 'uk' else lowered.upper()
        hit = pycountry.countries.get(alpha_2=alpha_2)
        if hit:
            return hit.name
    return None

def country_from_ip(url):
    """Return the GeoIP country name of the server behind ``url``, or None.

    The registrable domain reported by tldextract is resolved to an IPv4
    address and looked up in the module-level ``geo_reader``.

    Args:
        url: URL or bare host name.

    Returns:
        ``resp.country.name`` from the GeoIP database, or None when
        tldextract finds no registrable domain or any step of the lookup
        fails.
    """
    try:
        ext = tldextract.extract(url)
        # Prefer the current property name, but fall back to the older
        # `registered_domain` spelling. Without the fallback, a tldextract
        # build that lacks the new property raises AttributeError, which the
        # broad except below swallows -- silently disabling GeoIP for every URL.
        domain = getattr(ext, "top_domain_under_public_suffix", None)
        if domain is None:
            domain = ext.registered_domain
        if not domain:
            return None
        ip = socket.gethostbyname(domain)
        resp = geo_reader.country(ip)
        return resp.country.name
    except Exception:
        # Deliberately best-effort: unresolvable hosts and addresses missing
        # from the database simply yield no GeoIP answer.
        return None

_COUNTRY_NAME_INDEX = None  # lazily built {lower-cased name: canonical name}


def _country_name_index():
    """Build the lower-case -> canonical country-name map once and reuse it."""
    global _COUNTRY_NAME_INDEX
    if _COUNTRY_NAME_INDEX is None:
        _COUNTRY_NAME_INDEX = {
            country.name.lower(): country.name for country in pycountry.countries
        }
    return _COUNTRY_NAME_INDEX


def country_from_path(url):
    """Return the first pycountry country name found in the URL path, or None.

    Every path segment is split on '-', '_' and '+', and each token is
    compared case-insensitively against the ``name`` of every pycountry
    entry. ISO codes are not matched, so short slug words cannot be mistaken
    for a country.

    Because segments are split on '-', a name that itself contains a hyphen
    can never equal a single token.

    The name index is built on first use and cached, instead of being rebuilt
    from the whole pycountry database for every URL.
    """
    country_names = _country_name_index()
    path_segments = urlparse(url).path.strip('/').split('/')
    for segment in path_segments:
        for token in re.split(r'[-_+]', segment):
            token_lower = token.lower()
            if token_lower in country_names:
                return country_names[token_lower]
    return None

def detect_country(url):
    """Guess the country for ``url``, preferring the cheapest reliable signal.

    Order of precedence:
      1. country-code TLD (``country_from_tld``)
      2. GeoIP of the resolved host (``country_from_ip``)
      3. country name appearing in the URL path (``country_from_path``)

    The first truthy answer from steps 1-2 is returned; otherwise the result
    of the path lookup is returned as-is (possibly None).
    """
    for probe in (country_from_tld, country_from_ip):
        found = probe(url)
        if found:
            return found

    # Last resort: whatever the path scan yields, including None.
    return country_from_path(url)

# Example usage: same sample URLs as before, now held in a SOURCEURL column
# and run through the TLD-first detect_country().
df = pd.DataFrame({
    "SOURCEURL": [
        "https://www.lemonde.fr",
        "https://www.bbc.co.uk/news",
        "https://www.nytimes.com",
        "https://example.org",
        "https://www.stabroeknews.com/2025/01/05/news/guyana/hamas-and-israel-edge-towards-ceasefire"
    ]
}).assign(country=lambda frame: frame["SOURCEURL"].map(detect_country))

print(df)

                                           SOURCEURL         country
0                             https://www.lemonde.fr          France
1                         https://www.bbc.co.uk/news  United Kingdom
2                            https://www.nytimes.com   United States
3                                https://example.org   United States
4  https://www.stabroeknews.com/2025/01/05/news/g...          Guyana


In [6]:
import pandas as pd

# Hand-labelled sample of major news sites: (site URL, expected country).
# Keeping each URL next to its label prevents the two columns drifting apart.
# NOTE(review): labels are compared as exact strings with whatever
# detect_country() returns -- verify spellings such as "Russia" against the
# names the detector actually produces.
labelled_sites = [
    ("https://www.bbc.com", "United Kingdom"),
    ("https://www.nytimes.com", "United States"),
    ("https://www.theguardian.com", "United Kingdom"),
    ("https://www.lemonde.fr", "France"),
    ("https://www.spiegel.de", "Germany"),
    ("https://www.repubblica.it", "Italy"),
    ("https://www.elpais.com", "Spain"),
    ("https://www.globo.com", "Brazil"),
    ("https://www.japantimes.co.jp", "Japan"),
    ("https://www.timesofindia.com", "India"),
    ("https://www.scmp.com", "China"),  # Hong Kong outlet, labelled "China" in this sample
    ("https://www.haaretz.com", "Israel"),
    ("https://www.rt.com", "Russia"),
    ("https://www.aljazeera.com", "Qatar"),
    ("https://www.straitstimes.com", "Singapore"),
    ("https://www.smh.com.au", "Australia"),
    ("https://www.thestar.com.my", "Malaysia"),
    ("https://www.torontostar.ca", "Canada"),
    ("https://www.chinadaily.com.cn", "China"),
    ("https://www.dawn.com", "Pakistan"),
]

data = {
    "SOURCEURL": [site for site, _label in labelled_sites],
    "Actual Country": [label for _site, label in labelled_sites],
}

df = pd.DataFrame(data)

# Run the detector on every URL
df["Predicted Country"] = df["SOURCEURL"].map(detect_country)

# A row counts as a hit only when prediction and label are identical strings
df["Match"] = df["Actual Country"].eq(df["Predicted Country"])

# Share of hits (mean of the boolean column)
accuracy = df["Match"].mean()

print(f"Accuracy: {accuracy:.1%}")
print("\nDetailed Results:")
df[["SOURCEURL", "Actual Country", "Predicted Country", "Match"]]

Accuracy: 55.0%

Detailed Results:


Unnamed: 0,SOURCEURL,Actual Country,Predicted Country,Match
0,https://www.bbc.com,United Kingdom,United States,False
1,https://www.nytimes.com,United States,United States,True
2,https://www.theguardian.com,United Kingdom,United States,False
3,https://www.lemonde.fr,France,France,True
4,https://www.spiegel.de,Germany,Germany,True
5,https://www.repubblica.it,Italy,Italy,True
6,https://www.elpais.com,Spain,Sweden,False
7,https://www.globo.com,Brazil,Brazil,True
8,https://www.japantimes.co.jp,Japan,Japan,True
9,https://www.timesofindia.com,India,Sweden,False
