In [1]:
# @title
# ✅ Install all required packages (including the latest Gemini SDK)
# !pip install -U google-generativeai requests beautifulsoup4 tldextract python-dateutil lxml scikit-learn
# ✅ Install all required packages (silently)
!pip install -q -U google-generativeai requests beautifulsoup4 tldextract python-dateutil lxml scikit-learn > /dev/null 2>&1

import warnings
warnings.filterwarnings("ignore")

In [2]:
# @title
from google.colab import userdata
import os

# os.environ["SERP_API_KEY"] = "test"

#real SERP_API_KEY
os.environ["SERP_API_KEY"] = userdata.get('SERP_API_KEY')
os.environ["GEMINI_API_KEY"] = userdata.get('GEMINI_API_KEY')

In [3]:
# @title
# ============================================================
# ==========  C R E D I B I L I T Y   S C O R I N G  =========
# ============================================================
# This section is the *deliverable_1* rule-based scorer

import re, time, json, math, tldextract, requests, random, hashlib
from urllib.parse import urlparse
from datetime import datetime
from dateutil import parser as dateparser
from bs4 import BeautifulSoup

# --- Networking defaults ---
USER_AGENT = 'Mozilla/5.0 (CredibilityPOC/0.1)'   # Browser-y UA to avoid blocks
DEFAULT_TIMEOUT = 12                               # Seconds

# --- Heuristic signals ---
CLICKBAIT_TERMS = [
    "you won't believe", 'shocking', 'jaw-dropping', 'what happened next',
    'unbelievable', 'miracle', 'exposed', "secret they don't want you to know"
]
TRANSPARENCY_HINTS = [
    'author','byline','by ','by:','written by','editor','editorial',
    'fact-check','fact check','sources','references','citations',
    'methodology','about us','about the author','corrections','disclosures'
]
INSTITUTIONAL_TLDS = {'edu','gov','ac','sch','mil'}

def fetch_html(url: str):
    """
    Fetch raw HTML for a URL with retries and basic anti-block heuristics.
    - Rotates realistic headers
    - Exponential backoff on 403/429/5xx
    - Fallback for Reddit: try old.reddit.com if www.reddit.com blocks
    Returns (html_text, None) on success, or (None, 'error message') on failure.
    """
    import random
    import time
    import requests
    from urllib.parse import urlparse

    # Primary + backup user-agent/header sets
    header_candidates = [
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/126.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
        },
        {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        },
        {
            "User-Agent": USER_AGENT,  # your original UA, as a last resort
            "Accept-Language": "en-US,en;q=0.9",
        },
    ]

    def reddit_alt(u: str) -> list[str]:
        """If it's a Reddit URL, also try old.reddit.com."""
        try:
            parsed = urlparse(u)
            if parsed.netloc.endswith("reddit.com") and not parsed.netloc.startswith("old."):
                alt = u.replace("//www.reddit.com", "//old.reddit.com")
                if alt == u:  # if it wasn't www, still try old.
                    alt = u.replace("//reddit.com", "//old.reddit.com")
                return [u, alt]
        except Exception:
            pass
        return [u]

    urls_to_try = reddit_alt(url)
    last_err = None

    for candidate_url in urls_to_try:
        # up to 3 attempts per candidate url, with exponential backoff
        backoff = 1.0
        for attempt in range(3):
            headers = header_candidates[min(attempt, len(header_candidates)-1)]
            try:
                resp = requests.get(candidate_url, headers=headers, timeout=DEFAULT_TIMEOUT, allow_redirects=True)
                status = resp.status_code

                # Retry on "blocked"/rate-limited or transient server errors
                if status in (403, 429) or 500 <= status < 600:
                    last_err = f"{status} {resp.reason}"
                    time.sleep(backoff)
                    backoff *= 2.0
                    continue

                resp.raise_for_status()

                # Prefer HTML content; some sites return other types
                ctype = (resp.headers.get("Content-Type") or "").lower()
                if "text/html" not in ctype:
                    last_err = f"Non-HTML content-type: {ctype}"
                    # Don’t retry endlessly for non-HTML; move to next candidate/final
                    break

                return resp.text, None

            except requests.exceptions.RequestException as e:
                last_err = str(e)
                time.sleep(backoff)
                backoff *= 2.0
                continue

    return None, f"Fetch error: {last_err or 'unknown error'}"


def extract_article_fields(html: str, url: str):
    """
    Parse HTML to extract: title, author, published date, body text (paragraphs),
    and link counts (total + external), plus transparency hint flag.
    """
    soup = BeautifulSoup(html, 'lxml')
    text_chunks, title, author, published = [], None, None, None

    # --- Title from <title> or OG meta ---
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    mt = soup.find('meta', attrs={'property':'og:title'}) or soup.find('meta', attrs={'name':'title'})
    if not title and mt and mt.get('content'):
        title = mt['content'].strip()

    # --- Author / byline in common locations ---
    for selector in [
        {'name':'meta','attrs':{'name':'author'}},
        {'name':'meta','attrs':{'property':'article:author'}},
        {'name':'span','class_':re.compile('author|byline', re.I)},
        {'name':'div','class_':re.compile('author|byline', re.I)},
        {'name':'a','class_':re.compile('author', re.I)},
    ]:
        if selector['name']=='meta':
            node = soup.find('meta', attrs=selector['attrs'])
            if node and node.get('content'):
                author = node['content'].strip(); break
        else:
            node = soup.find(selector['name'], class_=selector.get('class_'))
            if node and node.get_text(strip=True):
                candidate = node.get_text(' ', strip=True)
                if len(candidate) >= 3:
                    author = candidate; break

    # --- Publish date in common meta/time/span patterns ---
    for date_sel in [
        {'name':'meta','attrs':{'property':'article:published_time'}},
        {'name':'meta','attrs':{'name':'date'}},
        {'name':'time','attrs':{}},
        {'name':'span','class_':re.compile('date|time', re.I)},
    ]:
        if date_sel['name']=='meta':
            node = soup.find('meta', attrs=date_sel['attrs'])
            if node and node.get('content'):
                try:
                    published = dateparser.parse(node['content'], fuzzy=True); break
                except Exception:
                    pass
        else:
            node = soup.find(date_sel['name'], class_=date_sel.get('class_'))
            if node and node.get_text(strip=True):
                try:
                    published = dateparser.parse(node.get_text(strip=True), fuzzy=True); break
                except Exception:
                    pass

    # --- Body text: prefer a likely article container, else all <p> ---
    main_container = None
    for cls in ['article','post','story','content','entry-content','article-body']:
        mc = soup.find(True, class_=re.compile(cls, re.I))
        if mc: main_container = mc; break
    paragraphs = (main_container.find_all('p') if main_container else soup.find_all('p'))
    for p in paragraphs:
        t = p.get_text(' ', strip=True)
        if t and len(t) > 40: text_chunks.append(t)
    article_text = '\n\n'.join(text_chunks)[:100000]  # cap to avoid huge pages

    # --- Link counts: total & external ---
    all_links, external_links = [], []
    base_host = urlparse(url).netloc.lower()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('http://') or href.startswith('https://'):
            all_links.append(href)
            if urlparse(href).netloc.lower() != base_host:
                external_links.append(href)

    # --- Transparency hint flag (string match) ---
    full_text_for_hints = (article_text + ' ' + ' '.join(TRANSPARENCY_HINTS)).lower()

    return {
        'title': title,
        'author': author,
        'published': published.isoformat() if published else None,
        'text': article_text,
        'num_paragraphs': len(text_chunks),
        'all_links': all_links,
        'external_links': external_links,
        'has_transparency_hints': any(h in full_text_for_hints for h in TRANSPARENCY_HINTS),
    }

def score_url(url: str, fields: dict):
    """
    Apply heuristic scoring rules → (score 0–100, explanation string).
    Starts at 50 and adds/subtracts per signal.
    """
    explanation_bits = []
    score = 50  # neutral baseline

    # HTTPS
    if url.lower().startswith('https://'):
        score += 12; explanation_bits.append('+12: uses HTTPS')
    else:
        score -= 10; explanation_bits.append('-10: not using HTTPS')

    # Institutional TLD
    ext = tldextract.extract(url)
    tld_last = (ext.suffix.split('.')[-1] if ext.suffix else '')
    if tld_last in INSTITUTIONAL_TLDS:
        score += 14; explanation_bits.append(f'+14: institutional TLD ({tld_last})')

    # Author/byline
    if fields.get('author'):
        score += 10; explanation_bits.append('+10: author/byline found')
    else:
        score -= 6; explanation_bits.append('-6: no clear author/byline')

    # Published recency (NOTE: uses datetime.utcnow(), may warn in 3.12+)
    published = fields.get('published')
    if published:
        try:
            dt = dateparser.parse(published)
            if (datetime.utcnow() - dt).days <= 3650:
                score += 6; explanation_bits.append('+6: reasonably recent publication date')
            else:
                score -= 4; explanation_bits.append('-4: appears quite old')
        except Exception:
            explanation_bits.append('0: could not parse publication date reliably')
    else:
        explanation_bits.append('0: no publication date found')

    # References
    total_links = len(fields.get('all_links', []))
    external_links_count = len(fields.get('external_links', []))
    if total_links >= 5 and external_links_count >= 3:
        score += 10; explanation_bits.append(f'+10: provides references (links: {total_links}, external: {external_links_count})')
    elif total_links >= 2:
        score += 4; explanation_bits.append(f'+4: some references (links: {total_links})')
    else:
        score -= 6; explanation_bits.append(f'-6: minimal/no references (links: {total_links})')

    # Length (by paragraph count)
    num_paras = fields.get('num_paragraphs', 0)
    if num_paras >= 8:
        score += 6; explanation_bits.append('+6: substantive article length')
    elif num_paras >= 3:
        score += 2; explanation_bits.append('+2: moderate article length')
    else:
        score -= 6; explanation_bits.append('-6: very short article text')

    # Clickbait language
    text_lower = (fields.get('text') or '').lower()
    clickbait_hits = sum(1 for term in CLICKBAIT_TERMS if term in text_lower)
    if clickbait_hits >= 2:
        score -= 10; explanation_bits.append('-10: strong clickbait indicators')
    elif clickbait_hits == 1:
        score -= 4; explanation_bits.append('-4: mild clickbait indicators')

    # Advertising/sponsor cues
    ad_signals = len(re.findall(r"advertis(e|ement)|sponsor(ed|ship)", text_lower))
    iframes_penalty = min(8, math.floor(ad_signals / 5) * 2)
    if iframes_penalty:
        score -= iframes_penalty; explanation_bits.append(f'-{iframes_penalty}: advertising/sponsorship language')

    # Clamp score and join explanation
    score = max(0, min(100, int(round(score))))
    explanation = '; '.join(explanation_bits)
    return score, explanation

def evaluate_url(url: str):
    """
    Orchestrator: fetch → parse → score.
    Returns a dict with 'score', 'explanation', and 'details' on success.
    Returns None if the page cannot be accessed (STRICT EXCLUDE).
    """
    if not isinstance(url, str) or not url.strip():
        return None  # exclude invalid

    html, err = fetch_html(url)
    if err:
        return None  # STRICT EXCLUDE: do not return a low score or an error object

    fields = extract_article_fields(html, url)
    score, explanation = score_url(url, fields)

    return {
        'score': score,
        'explanation': explanation,
        'details': {
            'url': url,
            'title': fields.get('title'),
            'author': fields.get('author'),
            'published': fields.get('published'),
            'num_paragraphs': fields.get('num_paragraphs'),
            'total_links': len(fields.get('all_links', [])),
            'external_links': len(fields.get('external_links', [])),
        },
    }

# ============================================================
# ======  H Y B R I D   ( R u l e s  +  L i n e a r  R e g ) =
# ============================================================
# This section:
# 1) builds features from evaluate_url() outputs
# 2) creates a labeled dataset of 20 real URLs (0–100 labels)
# 3) runs 5-fold cross-validation for LinearRegression
# 4) fits a final model on ALL available rows
# 5) exposes hybrid_score(url, alpha) for inference
#
# Notes:
# - If some URLs fail to fetch, they're STRICTLY EXCLUDED (no score, not shown).
# - CV uses MAE and R^2 to give you both error and fit quality.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from datetime import datetime, timezone
from dateutil import parser as dateparser

# ---------- Feature Engineering ----------
def _safe(value, default=0):
    return value if value is not None else default

def _https_flag(url: str) -> int:
    return 1 if isinstance(url, str) and url.lower().startswith('https://') else 0

def _institutional_tld_flag(url: str) -> int:
    try:
        ext = tldextract.extract(url)
        tld_last = (ext.suffix.split('.')[-1] if ext.suffix else '')
        return 1 if tld_last in {'edu','gov','ac','sch','mil'} else 0
    except Exception:
        return 0

def _days_since(published_iso: str) -> int:
    if not published_iso:
        return 99999  # treat unknown/absent as very old
    try:
        dt = dateparser.parse(published_iso)
        return max(0, (datetime.utcnow() - dt).days)  # OK to mirror your original utc usage
    except Exception:
        return 99999

def features_from_eval(eval_obj: dict) -> dict:
    """
    Build a feature dict using ONLY fields the original scorer returns.
    These features are model-agnostic and cheap to compute.
    """
    d = eval_obj.get('details', {})
    url = d.get('url') or ''
    feats = {
        "https": _https_flag(url),
        "inst_tld": _institutional_tld_flag(url),
        "has_author": 1 if d.get('author') else 0,
        "num_paragraphs": _safe(d.get('num_paragraphs'), 0),
        "total_links": _safe(d.get('total_links'), 0),
        "external_links": _safe(d.get('external_links'), 0),
        "days_since_pub": _days_since(d.get('published')),
    }
    return feats

FEATURE_ORDER = [
    "https",
    "inst_tld",
    "has_author",
    "num_paragraphs",
    "total_links",
    "external_links",
    "days_since_pub",
]

def vectorize_features(feat_dict: dict, feature_order=FEATURE_ORDER):
    """Convert a feature dict to a fixed-order numeric vector."""
    return np.array([feat_dict.get(k, 0) for k in feature_order], dtype=float)

# ---------- Labeled Dataset (20 REAL URLs, mixed quality) ----------
# Labels are illustrative 0–100 targets. Adjust as you refine ground truth.
LABELED_URLS = [
    # Highly credible / institutional / quality editorial
    ("https://www.cdc.gov/flu/about/index.html", 92),
    ("https://www.nih.gov/news-events/news-releases", 88),
    ("https://www.mayoclinic.org/diseases-conditions/dehydration/symptoms-causes/syc-20354086", 85),
    ("https://www.who.int/news-room/fact-sheets/detail/diabetes", 90),
    ("https://www.britannica.com/science/photosynthesis", 86),
    ("https://www.health.harvard.edu/staying-healthy/what-is-intermittent-fasting", 78),
    ("https://www.hopkinsmedicine.org/health/conditions-and-diseases", 84),
    ("https://www.bbc.com/news/science_and_environment", 80),
    ("https://www.reuters.com/world/us/", 80),
    ("https://apnews.com/hub/technology", 78),
    ("https://en.wikipedia.org/wiki/Ice_cream", 75),

    # Mid credibility consumer health / informative
    ("https://www.healthline.com/nutrition/green-tea-and-weight-loss", 70),
    ("https://www.webmd.com/diet/obesity/features/green-tea-and-weight-loss", 68),
    ("https://www.nature.com/scitable/definition/photosynthesis-288/", 82),
    ("https://med.stanford.edu/news/all-news.html", 82),

    # Lower credibility / UGC / lighter editorial
    ("https://medium.com/", 45),
    ("https://www.reddit.com/r/icecreamery/comments/19elt19/looking_for_resources_to_learn_how_to_make_ice/", 20),
    ("https://www.quora.com/Is-green-tea-good-for-weight-loss", 25),
    ("https://www.livestrong.com/article/13715706-green-tea-benefits/", 60),
    ("https://www.buzzfeed.com/", 40),
]

# ---------- Build Dataset by Evaluating & Featurizing ----------
rows_X, rows_y = [], []

for url, label in LABELED_URLS:
    try:
        ev = evaluate_url(url)
        if ev is None:  # STRICT EXCLUDE
            continue
        feats = features_from_eval(ev)
        x = vectorize_features(feats)
        rows_X.append(x)
        rows_y.append(float(label))
    except Exception:
        # Strict policy: exclude on any error
        continue

X = np.vstack(rows_X) if rows_X else np.zeros((0, len(FEATURE_ORDER)))
y = np.array(rows_y) if rows_y else np.zeros((0,))

print(f"Prepared dataset: {X.shape[0]} rows × {X.shape[1]} features.")

# ---------- 5-Fold Cross-Validation ----------
# We use KFold regression CV with two metrics:
#  - MAE (lower is better): absolute error in points of the 0–100 score
#  - R^2 (higher is better): variance explained
if len(X) >= 5:
    k = min(5, len(X))  # guard in case many rows were excluded
    if k < 2:
        print("\nNot enough rows for CV; skipping cross-validation.")
    else:
        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        # MAE (scikit returns negative MAE for loss; invert sign to report positive MAE)
        mae_scores = cross_val_score(LinearRegression(), X, y, cv=kf, scoring="neg_mean_absolute_error")
        r2_scores  = cross_val_score(LinearRegression(), X, y, cv=kf, scoring="r2")
        mae_vals = -mae_scores  # turn to positive
        print(f"\n{k}-fold CV results on {len(X)} rows:")
        print(f"  MAE per fold: {np.round(mae_vals, 2)} | mean={mae_vals.mean():.2f}")
        print(f"  R^2 per fold:  {np.round(r2_scores, 3)} | mean={r2_scores.mean():.3f}")
else:
    print("\nNot enough rows for CV; need ≥5 examples.")

# ---------- Fit Final Linear Regression on ALL available rows ----------
linreg = LinearRegression()
if len(X) >= 2:
    linreg.fit(X, y)
    print("\nFinal model (trained on all available rows):")
    for kname, w in zip(FEATURE_ORDER, linreg.coef_):
        print(f"  {kname:>16s}: {w: .3f}")
    print(f"  {'Intercept':>16s}: {linreg.intercept_: .3f}")
else:
    print("\nNot enough rows to train final model (need ≥2). Using neutral ML predictions.")

# ---------- Inference helpers ----------
def predict_ml_score_from_eval(eval_obj: dict) -> float:
    """
    Predict a 0–100 credibility score from features using the trained LinearRegression.
    Falls back to 50.0 if not enough training data is available.
    """
    if len(X) < 2:
        return 50.0
    feats = features_from_eval(eval_obj)
    x = vectorize_features(feats).reshape(1, -1)
    pred = linreg.predict(x)[0]
    return float(np.clip(pred, 0.0, 100.0))

def _stars(score_0_100: float) -> str:
    stars = int(round(score_0_100 / 20))
    stars = max(0, min(5, stars))
    return "★"*stars + "☆"*(5 - stars)

def hybrid_score(url: str, alpha: float = 0.6) -> dict | None:
    """
    Blend rule-based score with ML-predicted score:
        final = alpha * rule_score + (1 - alpha) * ml_score

    STRICT EXCLUDE:
    - If fetch/evaluation fails, return None (do not score or include).
    """
    ev = evaluate_url(url)
    if ev is None:
        return None  # STRICT EXCLUDE

    details = ev.get("details", {})
    rule_score = float(ev.get("score", 0.0))

    # Normal path: combine rules + ML
    ml_score = predict_ml_score_from_eval(ev)
    final = float(np.clip(alpha * rule_score + (1 - alpha) * ml_score, 0.0, 100.0))

    # Stars
    stars = int(round(final / 20))
    stars = max(0, min(5, stars))
    star_str = "★" * stars + "☆" * (5 - stars)

    return {
        "url": url,
        "title": details.get("title"),
        "rule_score": round(rule_score, 1),
        "ml_score": round(ml_score, 1),
        "hybrid_score": round(final, 1),
        "stars": star_str,
        "explanation": ev.get("explanation", "No detailed explanation available."),
        "details": details,
    }


Prepared dataset: 13 rows × 7 features.

5-fold CV results on 13 rows:
  MAE per fold: [135.42 115.88  11.52  95.74  19.37] | mean=75.59
  R^2 per fold:  [-1.160880e+02 -1.695836e+03  3.910000e-01 -4.500741e+03 -1.558600e+01] | mean=-1265.572

Final model (trained on all available rows):
             https:  0.000
          inst_tld:  25.932
        has_author: -5.099
    num_paragraphs:  0.194
       total_links:  0.013
    external_links:  0.014
    days_since_pub: -0.000
         Intercept:  82.194


In [8]:
# @title
# --- Improved SerpAPI search (clean results, de-dupe, filter, robust errors) ---
import os, re, requests
from urllib.parse import urlparse

SERP_API_KEY = os.getenv("SERP_API_KEY")

# Common low-signal/social/video domains to exclude by default (tune as needed)
_DEFAULT_EXCLUDE_DOMAINS = {
    "reddit.com", "www.reddit.com", "old.reddit.com",
    "x.com", "twitter.com", "www.twitter.com",
    "tiktok.com", "www.tiktok.com",
    "pinterest.com", "www.pinterest.com",
    "facebook.com", "www.facebook.com",
    "instagram.com", "www.instagram.com",
    "youtube.com", "www.youtube.com", "youtu.be"
}

# Skip obvious non-article filetypes
_SKIP_FILETYPES = re.compile(r"\.(pdf|pptx?|docx?|xlsx?|zip|rar)(?:$|\?)", re.I)

def _host(url: str) -> str:
    return urlparse(url).netloc.lower()

def search_google(
    query: str,
    num_results: int = 5,
    exclude_domains: set | None = None,
    allow_news_results: bool = True,
    timeout: int = 20,
) -> list[dict]:
    """
    Search Google via SerpAPI and return a clean list of results:
        [{"title": str, "link": str, "snippet": str}, ...]
    - De-duplicates by link and avoids flooding from the same host
    - Skips social/UGC/video sites and non-HTML filetypes
    - Optionally includes Google News results
    """
    if not SERP_API_KEY:
        raise RuntimeError("Missing SERP_API_KEY / SERPAPI_API_KEY. Set it in the environment first.")

    exclude = set(_DEFAULT_EXCLUDE_DOMAINS)
    if exclude_domains:
        exclude |= set(exclude_domains)

    params = {
        "engine": "google",
        "q": query,
        "api_key": SERP_API_KEY,
        "num": 10,          # pull extra, then filter/trim to num_results
        "hl": "en",
        "gl": "us",
        "safe": "active",   # optional: reduce NSFW
    }

    try:
        resp = requests.get("https://serpapi.com/search", params=params, timeout=timeout)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        # On failure, return empty list (caller can handle and message user)
        print(f"[SerpAPI] Error: {e}")
        return []

    candidates: list[dict] = []

    # 1) Organic results
    for r in (data.get("organic_results") or []):
        link = r.get("link")
        title = r.get("title")
        snippet = r.get("snippet") or ""
        if not link or _SKIP_FILETYPES.search(link):
            continue
        host = _host(link)
        if host in exclude:
            continue
        candidates.append({"title": title, "link": link, "snippet": snippet})

    # 2) Optional: News results (helpful for timely topics)
    if allow_news_results:
        for r in (data.get("news_results") or []):
            link = r.get("link")
            title = r.get("title")
            snippet = r.get("snippet") or ""
            if not link or _SKIP_FILETYPES.search(link):
                continue
            host = _host(link)
            if host in exclude:
                continue
            candidates.append({"title": title, "link": link, "snippet": snippet})

    # De-duplicate by link; also avoid over-representing a single host
    cleaned, seen_links, seen_hosts = [], set(), set()
    for c in candidates:
        link, host = c["link"], _host(c["link"])
        if link in seen_links:
            continue
        # If we already have enough and this host is duplicate, skip
        if host in seen_hosts and len(cleaned) >= num_results:
            continue
        seen_links.add(link)
        seen_hosts.add(host)
        cleaned.append(c)
        if len(cleaned) >= num_results:
            break

    return cleaned

In [12]:
# @title Gemini Chat
# --- Gemini Chat & Summarization Helpers (chat-first; robust + clean fallbacks) ---
import os, re, google.generativeai as genai

# ---- Safeguard: prevent wrong localhost routing ----
for k in ["GOOGLE_API_BASE_URL", "GOOGLE_AI_API_BASE", "GOOGLE_API_ENDPOINT"]:
    os.environ.pop(k, None)

# ---- Configure Gemini API key + explicit endpoint ----
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("Missing GEMINI_API_KEY. Please set it first.")
genai.configure(
    api_key=GEMINI_API_KEY,
    client_options={"api_endpoint": "https://generativelanguage.googleapis.com"}
)

# ---- Pick a valid model across SDK versions ----
_CANDIDATE_MODELS = ["gemini-1.5-flash", "gemini-1.5-pro", "gemini-pro"]

def _get_model():
    for name in _CANDIDATE_MODELS:
        try:
            m = genai.GenerativeModel(name)
            _ = m.generate_content("ping")
            return m
        except Exception:
            continue
    return None

_gemini = _get_model()

def _gemini_text(prompt: str) -> str | None:
    """Call Gemini safely; never expose internal errors."""
    if _gemini is None:
        return None
    try:
        resp = _gemini.generate_content(prompt)
        out = getattr(resp, "text", None)
        if not out and hasattr(resp, "candidates") and resp.candidates:
            cand = resp.candidates[0]
            if hasattr(cand, "content") and cand.content and getattr(cand.content, "parts", None):
                out = cand.content.parts[0].text
        return out.strip() if out else None
    except Exception:
        return None

# ---- Summarization (silent Gemini fallback) ----
def summarize_text(text: str, query: str) -> str:
    if not text or not text.strip():
        return "[No article text extracted to summarize.]"

    clipped = text[:5000]
    prompt = (
        "You are a concise, factual assistant.\n"
        f"User topic: {query}\n\n"
        "Write a 3–5 sentence factual summary using the content below.\n\n"
        f"Content:\n{clipped}"
    )
    out = _gemini_text(prompt)
    if out:
        return out

    sents = re.split(r'(?<=[.!?])\s+', clipped)
    return " ".join(sents[:4]).strip()

# ---- Intent / helper patterns ----
_SEARCH_TRIGGERS = {"find","search","sources","articles","look up","get sources","show links"}
_CONFIRM_YES = {"yes","y","yep","yeah","sure","ok","okay","do it","go ahead","please do"}
_CONFIRM_NO = {"no","n","nope","not now","later","stop","cancel","skip"}
_STOP_WORDS = {"exit","quit","bye"}
_GREETING = re.compile(r"\b(hi|hello|hey|how are you|good (morning|afternoon|evening))\b", re.I)
_VAGUE = re.compile(r"^(what\s+(is|do)|i\s+don'?t\s+know|help|huh|\?)", re.I)

def extract_urls(t): return re.findall(r"https?://\S+", t or "")

def classify_intent(msg: str) -> str:
    msg_l = (msg or "").lower()
    if any(w in msg_l for w in _STOP_WORDS): return "stop"
    if extract_urls(msg): return "urls_pasted"
    if any(w in msg_l for w in _SEARCH_TRIGGERS): return "trigger_search"
    return "converse"

def is_confirmation_yes(t): return any(w in (t or "").lower() for w in _CONFIRM_YES)
def is_confirmation_no(t):  return any(w in (t or "").lower() for w in _CONFIRM_NO)
def _is_small_talk(t): return bool(_GREETING.search(t or ""))
def _is_vague(t): return bool(_VAGUE.search((t or "").strip()))

def get_topic_from_history(hist: list[dict]) -> str | None:
    for m in reversed(hist):
        if m.get("role") != "user": continue
        txt = (m.get("content") or "").strip()
        if not txt or extract_urls(txt) or _is_small_talk(txt) or _is_vague(txt):
            continue
        if txt.lower() in {"yes","no","ok","okay","cool","thanks","thank you"}:
            continue
        if len(re.findall(r"[A-Za-z]{2,}", txt)) >= 2:
            return txt
    return None

def gemini_chat_reply(hist: list[dict], user_msg: str) -> str:
    """Friendly, contextual chat—no parroting."""
    if _is_small_talk(user_msg):
        return "I’m doing well—thanks for asking! What topic are you curious about today?"
    if _is_vague(user_msg):
        return "No worries—what subject would you like to explore? For example: ice cream, data science, or cars."

    topic = get_topic_from_history(hist)
    if topic and _gemini:
        convo = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in hist[-6:])
        prompt = (
            "You are CredBot: natural, concise, and curious. "
            "If a topic exists, share one factual insight or ask one smart question about it. "
            "Never echo the user's sentence. "
            f"Known topic: {topic}\n\nConversation:\n{convo}\n\nReply naturally in 1–2 sentences."
        )
        out = _gemini_text(prompt)
        if out: return out
    if topic:
        return f"{topic.split()[0].capitalize()} sounds interesting. Should I pull some sources for you?"
    return "What would you like to talk about today?"


In [13]:
# @title Conversational Credbot
# ============================
# Conversational CredBot (chat-first; memory + auto-search confirmation)
# ============================
import re

chat_history = []
_pending_links: list[str] = []

AUTO_SEARCH_BEHAVIOR = "ask"   # or "auto"
AUTO_SEARCH_AFTER = 3
MEMORY_SUMMARY_EVERY = 4

_turns_since_last_search = 0
_last_summary_turn_idx = -1

def display_response(title, url, summary, hybrid_dict):
    score = hybrid_dict.get("hybrid_score", 0.0)
    stars = hybrid_dict.get("stars") or ("★" * int(round(score / 20)))
    why = hybrid_dict.get("explanation", "").strip()
    print(f"\n📰 Title: {title or '[No Title]'}")
    print(f"🔗 Link: {url}")
    print(f"📄 Summary: {summary}")
    print(f"⭐ Credibility: {stars} ({score}/100)")
    if why: print(f"📝 Why: {why}\n")

def _score_and_summarize_url(link: str, user_query: str) -> bool:
    hybrid = hybrid_score(link, alpha=0.6)
    if hybrid is None: return False
    html, err = fetch_html(link)
    if err or not html: return False
    try:
        fields = extract_article_fields(html, link)
        text = fields.get("text") or ""
        if not text.strip(): return False
        summary = summarize_text(text, user_query)
        title = fields.get("title") or "[No Title]"
        display_response(title, link, summary, hybrid)
        return True
    except Exception:
        return False

def _run_search_flow(topic: str) -> int:
    print("🔍 Searching for sources...")
    results = search_google(topic, num_results=5)
    if not results:
        print("⚠️ No relevant articles found.")
        return 0
    shown = 0
    for item in results:
        link = item.get("link")
        if link and _score_and_summarize_url(link, topic): shown += 1
    if not shown:
        print("⚠️ I couldn’t access any promising sources for that query.")
    return shown

def run_chatbot():
    print("🤖 Hi, I’m CredBot. We can chat normally. I’ll remember the conversation and the topic. (Type 'exit' to quit.)")
    global _pending_links, _turns_since_last_search, _last_summary_turn_idx
    user_turn_index = 0

    while True:
        user_input = input("\nYou: ").strip()
        if not user_input: continue
        if user_input.lower() in {"exit","quit","bye"}:
            print("👋 Goodbye!"); break

        user_turn_index += 1
        chat_history.append({"role":"user","content":user_input})

        # ---- Handle confirmations ----
        if _pending_links:
            if is_confirmation_yes(user_input):
                if len(_pending_links)==1 and _pending_links[0].startswith("AUTOSEARCH::"):
                    topic=_pending_links[0].split("::",1)[1]
                    print(f"🔍 Searching for “{topic}”...")
                    shown=_run_search_flow(topic)
                    chat_history.append({"role":"bot","content":f"[auto-confirmed] Returned {shown} results."})
                    _pending_links=[]; _turns_since_last_search=0; continue
                shown=0; topic=get_topic_from_history(chat_history) or user_input
                for l in _pending_links:
                    if not l.startswith("AUTOSEARCH::"):
                        shown+=1 if _score_and_summarize_url(l,topic) else 0
                _pending_links=[]
                if not shown: print("⚠️ I couldn’t access those links.")
                chat_history.append({"role":"bot","content":f"Processed {shown} links."})
                _turns_since_last_search=0; continue
            elif is_confirmation_no(user_input):
                _pending_links=[]; print("👍 Okay, I won’t run that. What else would you like to do?")
                chat_history.append({"role":"bot","content":"User declined action."}); continue

        # ---- Intent routing ----
        intent=classify_intent(user_input)

        if intent=="urls_pasted":
            links=extract_urls(user_input)
            seen=set(); _pending_links=[u for u in links if not (u in seen or seen.add(u))]
            print("🔗 I see you shared link(s). Do you want me to check their credibility? (yes/no)")
            chat_history.append({"role":"bot","content":"Asked to confirm scoring pasted links."}); continue

        if intent=="trigger_search":
            topic=get_topic_from_history(chat_history) or user_input
            shown=_run_search_flow(topic)
            chat_history.append({"role":"bot","content":f"Returned {shown} sources."})
            _turns_since_last_search=0; continue

        # ---- Default: converse normally ----
        bot=gemini_chat_reply(chat_history,user_input)
        print(f"🤖 {bot}")
        chat_history.append({"role":"bot","content":bot})
        _turns_since_last_search+=1

        # ---- Memory summary ----
        if (MEMORY_SUMMARY_EVERY and user_turn_index%MEMORY_SUMMARY_EVERY==0 and not _pending_links):
            if 'summarize_conversation' in globals():
                summary=summarize_conversation(chat_history)
                print(f"\n🧠 So far, here’s our focus:\n{summary}")
                chat_history.append({"role":"bot","content":f"[memory-summary]\\n{summary}"})
                _last_summary_turn_idx=user_turn_index

        # ---- Auto-search offer ----
        topic=get_topic_from_history(chat_history)
        if topic and _turns_since_last_search>=AUTO_SEARCH_AFTER and not _pending_links:
            if AUTO_SEARCH_BEHAVIOR=="ask":
                print(f"\n🔎 Want me to pull some sources on “{topic}”? (yes/no)")
                chat_history.append({"role":"bot","content":f"Offer to search on: {topic}"})
                _pending_links=[f"AUTOSEARCH::{topic}"]
            else:
                shown=_run_search_flow(topic)
                chat_history.append({"role":"bot","content":f"[auto] Returned {shown} results."})
                _turns_since_last_search=0


In [14]:
run_chatbot()

🤖 Hi, I’m CredBot. We can chat normally. I’ll remember the conversation and the topic. (Type 'exit' to quit.)

You: how are you doing
🤖 I’m doing well—thanks for asking! What topic are you curious about today?

You: i don't want to learn about anything
🤖 I sounds interesting. Should I pull some sources for you?

You: what
🤖 I sounds interesting. Should I pull some sources for you?

🔎 Want me to pull some sources on “i don't want to learn about anything”? (yes/no)

You: no tell me about ice cream
👍 Okay, I won’t run that. What else would you like to do?

You: i want to learn about ice cream
🤖 I sounds interesting. Should I pull some sources for you?

🔎 Want me to pull some sources on “i want to learn about ice cream”? (yes/no)

You: yes
🔍 Searching for “i want to learn about ice cream”...
🔍 Searching for sources...

📰 Title: The History of Ice Cream - IDFA
🔗 Link: https://www.idfa.org/the-history-of-ice-cream
📄 Summary: Ice cream's origins are known to reach back as far as the second ce

KeyboardInterrupt: Interrupted by user

TESTING FUNCTIONS (NO NEED TO RUN):

In [None]:
# # @title
# #Testing URL hybrind scoring function:
# # def hybrid_score(url: str, alpha: float = 0.6)

# # hybrid_score("https://www.health.harvard.edu/staying-healthy/what-is-intermittent-fasting")

# # ---------- Demo on a few URLs ----------
# demo_urls = [
#     "https://www.cdc.gov/flu/about/index.html",
#     "https://www.mayoclinic.org/diseases-conditions/dehydration/symptoms-causes/syc-20354086",
#     "https://www.healthline.com/nutrition/green-tea-and-weight-loss",
#     "https://www.reddit.com/r/icecreamery/comments/19elt19/looking_for_resources_to_learn_how_to_make_ice/",
#     "https://www.buzzfeed.com/",
#     "https://www.cnn.com/2023/07/16/business/ice-cream-consumption",
#     "https://www.idfa.org/the-history-of-ice-cream",
#     "https://pmc.ncbi.nlm.nih.gov/articles/PMC12261055/",
#     "https://www.britannica.com/science/photosynthesis",
#     "https://www.health.harvard.edu/staying-healthy/what-is-intermittent-fasting",
#     "https://www.cheryls.com/articles/food-facts/facts-about-ice-cream?srsltid=AfmBOopmqWKbaUMsJs1YCI0XWNTyAU7otmNF9qtr_0CmT9nuQBDLmc4v"

# ]

# for u in demo_urls:
#     try:
#         res = hybrid_score(u, alpha=0.6)  # 60% rules / 40% ML
#         if res is None:  # STRICT EXCLUDE
#             continue
#         print("\n" + "—"*70)
#         print(f"🔗 {res['url']}")
#         print(f"📰 {res['title'] or '[No Title]'}")
#         print(f"⭐ {res['stars']}  ({res['hybrid_score']}/100)  |  Rule: {res['rule_score']}  ML: {res['ml_score']}")
#         print(f"📝 Why: {res['explanation']}")
#     except Exception:
#         # For the demo,
#         print ( [u] + " - some error")
#         continue

In [None]:
# # @title
# import requests
# url = "https://www.ice.edu/blog/scoop-getting-know-ice-cream"
# r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=12)
# print(r.status_code, r.headers.get("Content-Type"))
# print(r.text[:500])

In [None]:
# # @title
# # ============================================================
# # DELIVERABLE_2: Simple manual test on 5 ice cream URLs
# # ============================================================

# test_links = [
#     "https://en.wikipedia.org/wiki/Ice_cream",
#     "https://www.dreamscoops.com/ice-cream-science/",
#     "https://www.mymochi.com/blog/fun-facts-about-ice-cream-that-you-didnt-know/",
#     "https://www.idfa.org/the-history-of-ice-cream",
#     "https://www.ice.edu/blog/scoop-getting-know-ice-cream",
# ]

# print("🧪 Running hybrid credibility scoring test on 5 URLs...\n")

# for url in test_links:
#     try:
#         result = hybrid_score(url, alpha=0.6)  # uses rule + ML
#         print("—" * 70)
#         print(f"🔗 {url}")
#         print(f"⭐ {result['stars']}  ({result['hybrid_score']}/100)")
#         print(f"   • Rule score: {result['rule_score']}")
#         print(f"   • ML score:   {result['ml_score']}")
#         print(f"📝 Why: {result['explanation']}\n")
#     except Exception as e:
#         print(f"⚠️ Error scoring {url}: {e}")

# print("✅ Test complete!")