In [5]:
%pip install requests beautifulsoup4 tldextract python-dateutil lxml
import re, time, json, math, tldextract, requests
from urllib.parse import urlparse
from datetime import datetime
from dateutil import parser as dateparser
from bs4 import BeautifulSoup

USER_AGENT = 'Mozilla/5.0 (CredibilityPOC/0.1)'
DEFAULT_TIMEOUT = 12

CLICKBAIT_TERMS = [
    "you won't believe", 'shocking', 'jaw-dropping', 'what happened next',
    'unbelievable', 'miracle', 'exposed', "secret they don't want you to know"
]
TRANSPARENCY_HINTS = [
    'author','byline','by ','by:','written by','editor','editorial',
    'fact-check','fact check','sources','references','citations',
    'methodology','about us','about the author','corrections','disclosures'
]
INSTITUTIONAL_TLDS = {'edu','gov','ac','sch','mil'}

def fetch_html(url: str):
    try:
        headers = {'User-Agent': USER_AGENT}
        resp = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT)
        resp.raise_for_status()
        return resp.text, None
    except Exception as e:
        return None, f'Fetch error: {e}'

def extract_article_fields(html: str, url: str):
    soup = BeautifulSoup(html, 'lxml')
    text_chunks, title, author, published = [], None, None, None
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    mt = soup.find('meta', attrs={'property':'og:title'}) or soup.find('meta', attrs={'name':'title'})
    if not title and mt and mt.get('content'):
        title = mt['content'].strip()
    for selector in [
        {'name':'meta','attrs':{'name':'author'}},
        {'name':'meta','attrs':{'property':'article:author'}},
        {'name':'span','class_':re.compile('author|byline', re.I)},
        {'name':'div','class_':re.compile('author|byline', re.I)},
        {'name':'a','class_':re.compile('author', re.I)},
    ]:
        if selector['name']=='meta':
            node = soup.find('meta', attrs=selector['attrs'])
            if node and node.get('content'):
                author = node['content'].strip(); break
        else:
            node = soup.find(selector['name'], class_=selector.get('class_'))
            if node and node.get_text(strip=True):
                candidate = node.get_text(' ', strip=True)
                if len(candidate) >= 3:
                    author = candidate; break
    for date_sel in [
        {'name':'meta','attrs':{'property':'article:published_time'}},
        {'name':'meta','attrs':{'name':'date'}},
        {'name':'time','attrs':{}},
        {'name':'span','class_':re.compile('date|time', re.I)},
    ]:
        if date_sel['name']=='meta':
            node = soup.find('meta', attrs=date_sel['attrs'])
            if node and node.get('content'):
                try:
                    published = dateparser.parse(node['content'], fuzzy=True); break
                except Exception:
                    pass
        else:
            node = soup.find(date_sel['name'], class_=date_sel.get('class_'))
            if node and node.get_text(strip=True):
                try:
                    published = dateparser.parse(node.get_text(strip=True), fuzzy=True); break
                except Exception:
                    pass
    main_container = None
    for cls in ['article','post','story','content','entry-content','article-body']:
        mc = soup.find(True, class_=re.compile(cls, re.I))
        if mc: main_container = mc; break
    paragraphs = (main_container.find_all('p') if main_container else soup.find_all('p'))
    for p in paragraphs:
        t = p.get_text(' ', strip=True)
        if t and len(t) > 40: text_chunks.append(t)
    article_text = '\n\n'.join(text_chunks)[:100000]
    all_links, external_links = [], []
    base_host = urlparse(url).netloc.lower()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('http://') or href.startswith('https://'):
            all_links.append(href)
            if urlparse(href).netloc.lower() != base_host:
                external_links.append(href)
    full_text_for_hints = (article_text + ' ' + ' '.join(TRANSPARENCY_HINTS)).lower()
    return {
        'title': title,
        'author': author,
        'published': published.isoformat() if published else None,
        'text': article_text,
        'num_paragraphs': len(text_chunks),
        'all_links': all_links,
        'external_links': external_links,
        'has_transparency_hints': any(h in full_text_for_hints for h in TRANSPARENCY_HINTS),
    }

def score_url(url: str, fields: dict):
    explanation_bits = []
    score = 50
    if url.lower().startswith('https://'):
        score += 12; explanation_bits.append('+12: uses HTTPS')
    else:
        score -= 10; explanation_bits.append('-10: not using HTTPS')
    ext = tldextract.extract(url)
    tld_last = (ext.suffix.split('.')[-1] if ext.suffix else '')
    if tld_last in INSTITUTIONAL_TLDS:
        score += 14; explanation_bits.append(f'+14: institutional TLD ({tld_last})')
    if fields.get('author'):
        score += 10; explanation_bits.append('+10: author/byline found')
    else:
        score -= 6; explanation_bits.append('-6: no clear author/byline')
    published = fields.get('published')
    if published:
        try:
            dt = dateparser.parse(published)
            if (datetime.utcnow() - dt).days <= 3650:
                score += 6; explanation_bits.append('+6: reasonably recent publication date')
            else:
                score -= 4; explanation_bits.append('-4: appears quite old')
        except Exception:
            explanation_bits.append('0: could not parse publication date reliably')
    else:
        explanation_bits.append('0: no publication date found')
    total_links = len(fields.get('all_links', []))
    external_links_count = len(fields.get('external_links', []))
    if total_links >= 5 and external_links_count >= 3:
        score += 10; explanation_bits.append(f'+10: provides references (links: {total_links}, external: {external_links_count})')
    elif total_links >= 2:
        score += 4; explanation_bits.append(f'+4: some references (links: {total_links})')
    else:
        score -= 6; explanation_bits.append(f'-6: minimal/no references (links: {total_links})')
    num_paras = fields.get('num_paragraphs', 0)
    if num_paras >= 8:
        score += 6; explanation_bits.append('+6: substantive article length')
    elif num_paras >= 3:
        score += 2; explanation_bits.append('+2: moderate article length')
    else:
        score -= 6; explanation_bits.append('-6: very short article text')
    text_lower = (fields.get('text') or '').lower()
    clickbait_hits = sum(1 for term in CLICKBAIT_TERMS if term in text_lower)
    if clickbait_hits >= 2:
        score -= 10; explanation_bits.append('-10: strong clickbait indicators')
    elif clickbait_hits == 1:
        score -= 4; explanation_bits.append('-4: mild clickbait indicators')
    ad_signals = len(re.findall(r"advertis(e|ement)|sponsor(ed|ship)", text_lower))
    iframes_penalty = min(8, math.floor(ad_signals / 5) * 2)
    if iframes_penalty:
        score -= iframes_penalty; explanation_bits.append(f'-{iframes_penalty}: advertising/sponsorship language')
    score = max(0, min(100, int(round(score))))
    explanation = '; '.join(explanation_bits)
    return score, explanation

def evaluate_url(url: str):
    if not isinstance(url, str) or not url.strip():
        return {'score': 0, 'explanation': 'Invalid URL input.', 'details': {'error': 'empty_or_non_string'}}
    html, err = fetch_html(url)
    if err:
        return {'score': 0, 'explanation': f'Failed to fetch: {err}', 'details': {'error': 'fetch_failed'}}
    fields = extract_article_fields(html, url)
    score, explanation = score_url(url, fields)
    return {
        'score': score,
        'explanation': explanation,
        'details': {
            'url': url,
            'title': fields.get('title'),
            'author': fields.get('author'),
            'published': fields.get('published'),
            'num_paragraphs': fields.get('num_paragraphs'),
            'total_links': len(fields.get('all_links', [])),
            'external_links': len(fields.get('external_links', [])),
        },
    }
# Example:
res = evaluate_url('https://www.mayoclinic.org/diseases-conditions/dehydration/symptoms-causes/syc-20354086')
print(json.dumps(res, indent=2))

{
  "score": 60,
  "explanation": "+12: uses HTTPS; -6: no clear author/byline; 0: no publication date found; +10: provides references (links: 138, external: 110); -6: very short article text",
  "details": {
    "url": "https://www.mayoclinic.org/diseases-conditions/dehydration/symptoms-causes/syc-20354086",
    "title": "Dehydration - Symptoms & causes - Mayo Clinic",
    "author": null,
    "published": null,
    "num_paragraphs": 0,
    "total_links": 138,
    "external_links": 110
  }
}
