In [1]:
import requests
from urllib.parse import urlparse

def get_credibility_score(url):
    
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    
    credibility_factors = {
        "gov": 0.9,
        "edu": 0.85,
        "org": 0.75,
        "com": 0.6,
        "news": 0.5,
        "blog": 0.4
    }
    
    # Extract top-level domain
    tld = domain.split(".")[-1]
    base_score = credibility_factors.get(tld, 0.5)  # Default to 0.5 if not listed
    
    # Heuristic: Check if the URL is accessible
    try:
        response = requests.get(url, timeout=5)
        status_factor = 1.0 if response.status_code == 200 else 0.6
    except requests.RequestException:
        status_factor = 0.4  # Reduce score if URL is unreachable
    
    # Final score calculation
    score = round(base_score * status_factor, 2)
    
    explanation = (
        f"This source is considered {['low', 'medium', 'high'][int(score * 2)]} credibility based on its domain ({tld}) "
        f"and accessibility status (HTTP {response.status_code if 'response' in locals() else 'unreachable'})."
    )
    
    return {"score": score, "explanation": explanation}

# Example Usage
if __name__ == "__main__":
    example_url = "https://www.nasa.gov"
    print(get_credibility_score(example_url))


{'score': 0.9, 'explanation': 'This source is considered medium credibility based on its domain (gov) and accessibility status (HTTP 200).'}


In [3]:
!pip install whois

Collecting whois
  Downloading whois-1.20240129.2-py3-none-any.whl.metadata (1.3 kB)
Downloading whois-1.20240129.2-py3-none-any.whl (61 kB)
   ---------------------------------------- 0.0/61.8 kB ? eta -:--:--
   ------ --------------------------------- 10.2/61.8 kB ? eta -:--:--
   ------------------- -------------------- 30.7/61.8 kB 262.6 kB/s eta 0:00:01
   ---------------------------------------  61.4/61.8 kB 469.7 kB/s eta 0:00:01
   ---------------------------------------- 61.8/61.8 kB 368.5 kB/s eta 0:00:00
Installing collected packages: whois
Successfully installed whois-1.20240129.2


In [1]:
import requests
from urllib.parse import urlparse
import whois
import time

def get_credibility_score(url):
    """
    Evaluates the credibility of a given URL based on predefined heuristics.
    :param url: str - The URL of the reference.
    :return: dict - JSON object with score and explanation.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    
    credibility_factors = {
        "gov": 0.9,
        "edu": 0.85,
        "org": 0.75,
        "com": 0.6,
        "news": 0.5,
        "blog": 0.4
    }
    
    # Extract top-level domain
    tld = domain.split(".")[-1]
    base_score = credibility_factors.get(tld, 0.5)  # Default to 0.5 if not listed
    
    # Heuristic: Check if the URL is accessible
    try:
        response = requests.get(url, timeout=5)
        status_factor = 1.0 if response.status_code == 200 else 0.6
    except requests.RequestException:
        status_factor = 0.4  # Reduce score if URL is unreachable
    
    # Heuristic: Check domain age using WHOIS
    try:
        domain_info = whois.whois(domain)
        if isinstance(domain_info.creation_date, list):
            creation_date = domain_info.creation_date[0]
        else:
            creation_date = domain_info.creation_date
        
        if creation_date:
            domain_age = (time.time() - creation_date.timestamp()) / (365 * 24 * 60 * 60)
            age_factor = min(1.0, domain_age / 10)  # Cap at 10 years for full credibility
        else:
            age_factor = 0.5
    except Exception:
        age_factor = 0.5  # Default if WHOIS lookup fails
    
    # Final score calculation
    score = round(base_score * status_factor * age_factor, 2)
    
    explanation = (
        f"This source is considered {['low', 'medium', 'high'][int(score * 2)]} credibility based on its domain ({tld}), "
        f"accessibility status (HTTP {response.status_code if 'response' in locals() else 'unreachable'}), and "
        f"domain age ({domain_age:.1f} years if known, otherwise assumed average)."
    )
    
    return {"score": score, "explanation": explanation}

# Example Usage
if __name__ == "__main__":
    example_url = "https://www.nasa.gov"
    print(get_credibility_score(example_url))


UnboundLocalError: cannot access local variable 'domain_age' where it is not associated with a value

In [3]:
import requests
from urllib.parse import urlparse
import whois
import time

def get_credibility_score(url):
    """
    Evaluates the credibility of a given URL based on predefined heuristics.
    :param url: str - The URL of the reference.
    :return: dict - JSON object with score and explanation.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    
    credibility_factors = {
        "gov": 0.9,
        "edu": 0.85,
        "org": 0.75,
        "com": 0.6,
        "news": 0.5,
        "blog": 0.4
    }
    
    # Extract top-level domain
    tld = domain.split(".")[-1]
    base_score = credibility_factors.get(tld, 0.5)  # Default to 0.5 if not listed
    
    # Heuristic: Check if the URL is accessible
    try:
        response = requests.get(url, timeout=5)
        status_factor = 1.0 if response.status_code == 200 else 0.6
    except requests.RequestException:
        status_factor = 0.4  # Reduce score if URL is unreachable
    
    # Heuristic: Check domain age using WHOIS
    domain_age = None  # Ensure domain_age is always defined
    try:
        domain_info = whois.whois(domain)
        if isinstance(domain_info.creation_date, list):
            creation_date = domain_info.creation_date[0]
        else:
            creation_date = domain_info.creation_date
        
        if creation_date:
            domain_age = (time.time() - creation_date.timestamp()) / (365 * 24 * 60 * 60)
            age_factor = min(1.0, domain_age / 10)  # Cap at 10 years for full credibility
        else:
            age_factor = 0.5
    except Exception:
        age_factor = 0.5  # Default if WHOIS lookup fails
        domain_age = "unknown"
    
    # Final score calculation
    score = round(base_score * status_factor * age_factor, 2)
    
    explanation = (
        f"This source is considered {['low', 'medium', 'high'][int(score * 2)]} credibility based on its domain ({tld}), "
        f"accessibility status (HTTP {response.status_code if 'response' in locals() else 'unreachable'}), and "
        f"domain age ({domain_age} years if known, otherwise assumed average)."
    )
    print(f"Domain: {tld}, Domain Age: {domain_age}, Status Code: {response.status_code}")

    return {"score": score, "explanation": explanation}

# Example Usage
if __name__ == "__main__":
    example_url = "https://www.nasa.gov"
    print(get_credibility_score(example_url))


Domain: gov, Domain Age: unknown, Status Code: 200
{'score': 0.45, 'explanation': 'This source is considered low credibility based on its domain (gov), accessibility status (HTTP 200), and domain age (unknown years if known, otherwise assumed average).'}


In [4]:
import requests
import whois
from datetime import datetime
from urllib.parse import urlparse

def get_credibility_score(url):
    # Extract domain and top-level domain (TLD)
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    tld = domain.split('.')[-1]

    # Domain credibility weights (higher = more credible)
    domain_weights = {
        "gov": 0.95, "edu": 0.9, "org": 0.8, "com": 0.6, "net": 0.5, 
        "info": 0.4, "xyz": 0.2, "biz": 0.3, "news": 0.3
    }
    
    base_score = domain_weights.get(tld, 0.5)  # Default 0.5 if unknown TLD

    # Check website accessibility
    try:
        response = requests.get(url, timeout=5)
        status_code = response.status_code
        status_factor = 1.0 if status_code == 200 else 0.5  # Penalize unreachable sites
    except requests.exceptions.RequestException:
        status_factor = 0.5
        status_code = "unreachable"

    # Get domain registration details
    try:
        domain_info = whois.whois(domain)
        creation_date = domain_info.creation_date

        if isinstance(creation_date, list):
            creation_date = creation_date[0]  # Handle multiple dates

        if creation_date:
            domain_age = (datetime.now() - creation_date).days / 365
        else:
            domain_age = "unknown"
    except:
        domain_age = "unknown"

    # Handle unknown domain age
    assumed_age = 10 if domain_age == "unknown" else domain_age
    age_factor = min(1.0, assumed_age / 20)  # Normalize age (20+ years = full credibility)

    # Final score calculation
    score = round(base_score * status_factor * age_factor, 2)

    # Construct explanation
    age_explanation = f"{assumed_age:.1f} years" if domain_age != "unknown" else "assumed 10 years based on averages"
    
    explanation = (
        f"This source is considered {['low', 'medium', 'high'][int(score * 2)]} credibility "
        f"based on its domain ({tld}), accessibility status (HTTP {status_code}), and "
        f"domain age ({age_explanation})."
    )

    return {"score": score, "explanation": explanation}

# Example test
if __name__ == "__main__":
    example_url = "https://www.nasa.gov"
    print(get_credibility_score(example_url))


{'score': 0.47, 'explanation': 'This source is considered low credibility based on its domain (gov), accessibility status (HTTP 200), and domain age (assumed 10 years based on averages).'}


In [5]:
import requests
import whois
from datetime import datetime
from urllib.parse import urlparse

def get_credibility_score(url):
    # Extract domain and TLD
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    tld = domain.split('.')[-1]

    # Domain credibility base scores
    domain_weights = {
        "gov": 0.95, "edu": 0.9, "org": 0.8, "com": 0.6, "net": 0.5, 
        "info": 0.4, "xyz": 0.2, "biz": 0.3, "news": 0.3
    }
    
    base_score = domain_weights.get(tld, 0.5)  # Default to 0.5 if unknown TLD

    # Check website accessibility
    try:
        response = requests.get(url, timeout=5)
        status_code = response.status_code
        status_factor = 1.0 if status_code == 200 else 0.7  # Slight penalty if unreachable
    except requests.exceptions.RequestException:
        status_factor = 0.7
        status_code = "unreachable"

    # Get domain registration details
    try:
        domain_info = whois.whois(domain)
        creation_date = domain_info.creation_date

        if isinstance(creation_date, list):
            creation_date = creation_date[0]  # Handle multiple dates

        if creation_date:
            domain_age = (datetime.now() - creation_date).days / 365
        else:
            domain_age = "unknown"
    except:
        domain_age = "unknown"

    # Assign a default age if unknown
    assumed_age = 10 if domain_age == "unknown" else domain_age

    # Adjust age factor (older domains more trustworthy, max factor of 1.0)
    age_factor = min(1.0, (assumed_age / 10) ** 0.5)  # Square root to prevent excessive penalty

    # Final score calculation (ensuring high-trust domains maintain high credibility)
    score = round(min(1.0, base_score * status_factor * age_factor), 2)

    # Construct explanation
    age_explanation = f"{assumed_age:.1f} years" if domain_age != "unknown" else "assumed 10 years based on averages"
    
    explanation = (
        f"This source is considered {['low', 'medium', 'high'][int(score * 2)]} credibility "
        f"based on its domain ({tld}), accessibility status (HTTP {status_code}), and "
        f"domain age ({age_explanation})."
    )

    return {"score": score, "explanation": explanation}

# Example test
if __name__ == "__main__":
    example_url = "https://www.nasa.gov"
    print(get_credibility_score(example_url))


{'score': 0.95, 'explanation': 'This source is considered medium credibility based on its domain (gov), accessibility status (HTTP 200), and domain age (assumed 10 years based on averages).'}
