In [14]:
# ✅ Install all required packages (including the latest Gemini SDK)
!pip install -U google-generativeai requests beautifulsoup4 tldextract python-dateutil lxml scikit-learn



In [15]:
from google.colab import userdata
import os

#temporary serp api key so that I don't run out of api calls
# os.environ["SERP_API_KEY"] = "test"

#real SERP_API_KEY
os.environ["SERP_API_KEY"] = userdata.get('SERP_API_KEY')
os.environ["GEMINI_API_KEY"] = userdata.get('GEMINI_API_KEY')

In [16]:
# ============================================================
# ==========  C R E D I B I L I T Y   S C O R I N G  =========
# ============================================================
# This section is the *deliverable_1* rule-based scorer

import re, time, json, math, tldextract, requests
from urllib.parse import urlparse
from datetime import datetime
from dateutil import parser as dateparser
from bs4 import BeautifulSoup

# --- Networking defaults ---
USER_AGENT = 'Mozilla/5.0 (CredibilityPOC/0.1)'   # Browser-y UA to avoid blocks
DEFAULT_TIMEOUT = 12                               # Seconds

# --- Heuristic signals ---
CLICKBAIT_TERMS = [
    "you won't believe", 'shocking', 'jaw-dropping', 'what happened next',
    'unbelievable', 'miracle', 'exposed', "secret they don't want you to know"
]
TRANSPARENCY_HINTS = [
    'author','byline','by ','by:','written by','editor','editorial',
    'fact-check','fact check','sources','references','citations',
    'methodology','about us','about the author','corrections','disclosures'
]
INSTITUTIONAL_TLDS = {'edu','gov','ac','sch','mil'}

# def fetch_html(url: str):
#     """
#     Fetch raw HTML for a URL.
#     Returns (html_text, None) on success, or (None, 'error message') on failure.
#     """
#     try:
#         headers = {'User-Agent': USER_AGENT}
#         resp = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT)
#         resp.raise_for_status()
#         return resp.text, None
#     except Exception as e:
#         return None, f'Fetch error: {e}'
#REPLACED TO ADD RETRIES FOR EDGE CASE:
# adds:
# Header rotation (Chrome/Safari/user agent)
# Backoff retry on 403/429/5xx
# Reddit fallback to old.reddit.com (often less strict)
# Ensures we actually got HTML

def fetch_html(url: str):
    """
    Fetch raw HTML for a URL with retries and basic anti-block heuristics.
    - Rotates realistic headers
    - Exponential backoff on 403/429/5xx
    - Fallback for Reddit: try old.reddit.com if www.reddit.com blocks
    Returns (html_text, None) on success, or (None, 'error message') on failure.
    """
    import random
    import time
    import requests
    from urllib.parse import urlparse

    # Primary + backup user-agent/header sets
    header_candidates = [
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/126.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
        },
        {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        },
        {
            "User-Agent": USER_AGENT,  # your original UA, as a last resort
            "Accept-Language": "en-US,en;q=0.9",
        },
    ]

    def reddit_alt(u: str) -> list[str]:
        """If it's a Reddit URL, also try old.reddit.com."""
        try:
            parsed = urlparse(u)
            if parsed.netloc.endswith("reddit.com") and not parsed.netloc.startswith("old."):
                alt = u.replace("//www.reddit.com", "//old.reddit.com")
                if alt == u:  # if it wasn't www, still try old.
                    alt = u.replace("//reddit.com", "//old.reddit.com")
                return [u, alt]
        except Exception:
            pass
        return [u]

    urls_to_try = reddit_alt(url)
    last_err = None

    for candidate_url in urls_to_try:
        # up to 3 attempts per candidate url, with exponential backoff
        backoff = 1.0
        for attempt in range(3):
            headers = header_candidates[min(attempt, len(header_candidates)-1)]
            try:
                resp = requests.get(candidate_url, headers=headers, timeout=DEFAULT_TIMEOUT, allow_redirects=True)
                status = resp.status_code

                # Retry on "blocked"/rate-limited or transient server errors
                if status in (403, 429) or 500 <= status < 600:
                    last_err = f"{status} {resp.reason}"
                    time.sleep(backoff)
                    backoff *= 2.0
                    continue

                resp.raise_for_status()

                # Prefer HTML content; some sites return other types
                ctype = (resp.headers.get("Content-Type") or "").lower()
                if "text/html" not in ctype:
                    last_err = f"Non-HTML content-type: {ctype}"
                    # Don’t retry endlessly for non-HTML; move to next candidate/final
                    break

                return resp.text, None

            except requests.exceptions.RequestException as e:
                last_err = str(e)
                time.sleep(backoff)
                backoff *= 2.0
                continue

    return None, f"Fetch error: {last_err or 'unknown error'}"


def extract_article_fields(html: str, url: str):
    """
    Parse HTML to extract: title, author, published date, body text (paragraphs),
    and link counts (total + external), plus transparency hint flag.
    """
    soup = BeautifulSoup(html, 'lxml')
    text_chunks, title, author, published = [], None, None, None

    # --- Title from <title> or OG meta ---
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    mt = soup.find('meta', attrs={'property':'og:title'}) or soup.find('meta', attrs={'name':'title'})
    if not title and mt and mt.get('content'):
        title = mt['content'].strip()

    # --- Author / byline in common locations ---
    for selector in [
        {'name':'meta','attrs':{'name':'author'}},
        {'name':'meta','attrs':{'property':'article:author'}},
        {'name':'span','class_':re.compile('author|byline', re.I)},
        {'name':'div','class_':re.compile('author|byline', re.I)},
        {'name':'a','class_':re.compile('author', re.I)},
    ]:
        if selector['name']=='meta':
            node = soup.find('meta', attrs=selector['attrs'])
            if node and node.get('content'):
                author = node['content'].strip(); break
        else:
            node = soup.find(selector['name'], class_=selector.get('class_'))
            if node and node.get_text(strip=True):
                candidate = node.get_text(' ', strip=True)
                if len(candidate) >= 3:
                    author = candidate; break

    # --- Publish date in common meta/time/span patterns ---
    for date_sel in [
        {'name':'meta','attrs':{'property':'article:published_time'}},
        {'name':'meta','attrs':{'name':'date'}},
        {'name':'time','attrs':{}},
        {'name':'span','class_':re.compile('date|time', re.I)},
    ]:
        if date_sel['name']=='meta':
            node = soup.find('meta', attrs=date_sel['attrs'])
            if node and node.get('content'):
                try:
                    published = dateparser.parse(node['content'], fuzzy=True); break
                except Exception:
                    pass
        else:
            node = soup.find(date_sel['name'], class_=date_sel.get('class_'))
            if node and node.get_text(strip=True):
                try:
                    published = dateparser.parse(node.get_text(strip=True), fuzzy=True); break
                except Exception:
                    pass

    # --- Body text: prefer a likely article container, else all <p> ---
    main_container = None
    for cls in ['article','post','story','content','entry-content','article-body']:
        mc = soup.find(True, class_=re.compile(cls, re.I))
        if mc: main_container = mc; break
    paragraphs = (main_container.find_all('p') if main_container else soup.find_all('p'))
    for p in paragraphs:
        t = p.get_text(' ', strip=True)
        if t and len(t) > 40: text_chunks.append(t)
    article_text = '\n\n'.join(text_chunks)[:100000]  # cap to avoid huge pages

    # --- Link counts: total & external ---
    all_links, external_links = [], []
    base_host = urlparse(url).netloc.lower()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('http://') or href.startswith('https://'):
            all_links.append(href)
            if urlparse(href).netloc.lower() != base_host:
                external_links.append(href)

    # --- Transparency hint flag (string match) ---
    full_text_for_hints = (article_text + ' ' + ' '.join(TRANSPARENCY_HINTS)).lower()

    return {
        'title': title,
        'author': author,
        'published': published.isoformat() if published else None,
        'text': article_text,
        'num_paragraphs': len(text_chunks),
        'all_links': all_links,
        'external_links': external_links,
        'has_transparency_hints': any(h in full_text_for_hints for h in TRANSPARENCY_HINTS),
    }

def score_url(url: str, fields: dict):
    """
    Apply heuristic scoring rules → (score 0–100, explanation string).
    Starts at 50 and adds/subtracts per signal.
    """
    explanation_bits = []
    score = 50  # neutral baseline

    # HTTPS
    if url.lower().startswith('https://'):
        score += 12; explanation_bits.append('+12: uses HTTPS')
    else:
        score -= 10; explanation_bits.append('-10: not using HTTPS')

    # Institutional TLD
    ext = tldextract.extract(url)
    tld_last = (ext.suffix.split('.')[-1] if ext.suffix else '')
    if tld_last in INSTITUTIONAL_TLDS:
        score += 14; explanation_bits.append(f'+14: institutional TLD ({tld_last})')

    # Author/byline
    if fields.get('author'):
        score += 10; explanation_bits.append('+10: author/byline found')
    else:
        score -= 6; explanation_bits.append('-6: no clear author/byline')

    # Published recency (NOTE: uses datetime.utcnow(), may warn in 3.12+)
    published = fields.get('published')
    if published:
        try:
            dt = dateparser.parse(published)
            if (datetime.utcnow() - dt).days <= 3650:
                score += 6; explanation_bits.append('+6: reasonably recent publication date')
            else:
                score -= 4; explanation_bits.append('-4: appears quite old')
        except Exception:
            explanation_bits.append('0: could not parse publication date reliably')
    else:
        explanation_bits.append('0: no publication date found')

    # References
    total_links = len(fields.get('all_links', []))
    external_links_count = len(fields.get('external_links', []))
    if total_links >= 5 and external_links_count >= 3:
        score += 10; explanation_bits.append(f'+10: provides references (links: {total_links}, external: {external_links_count})')
    elif total_links >= 2:
        score += 4; explanation_bits.append(f'+4: some references (links: {total_links})')
    else:
        score -= 6; explanation_bits.append(f'-6: minimal/no references (links: {total_links})')

    # Length (by paragraph count)
    num_paras = fields.get('num_paragraphs', 0)
    if num_paras >= 8:
        score += 6; explanation_bits.append('+6: substantive article length')
    elif num_paras >= 3:
        score += 2; explanation_bits.append('+2: moderate article length')
    else:
        score -= 6; explanation_bits.append('-6: very short article text')

    # Clickbait language
    text_lower = (fields.get('text') or '').lower()
    clickbait_hits = sum(1 for term in CLICKBAIT_TERMS if term in text_lower)
    if clickbait_hits >= 2:
        score -= 10; explanation_bits.append('-10: strong clickbait indicators')
    elif clickbait_hits == 1:
        score -= 4; explanation_bits.append('-4: mild clickbait indicators')

    # Advertising/sponsor cues
    ad_signals = len(re.findall(r"advertis(e|ement)|sponsor(ed|ship)", text_lower))
    iframes_penalty = min(8, math.floor(ad_signals / 5) * 2)
    if iframes_penalty:
        score -= iframes_penalty; explanation_bits.append(f'-{iframes_penalty}: advertising/sponsorship language')

    # Clamp score and join explanation
    score = max(0, min(100, int(round(score))))
    explanation = '; '.join(explanation_bits)
    return score, explanation

def evaluate_url(url: str):
    """
    Orchestrator: fetch → parse → score.
    Returns a dict with top-level 'score', 'explanation', and 'details' for display/debug.
    """
    if not isinstance(url, str) or not url.strip():
        return {'score': 0, 'explanation': 'Invalid URL input.', 'details': {'error': 'empty_or_non_string'}}

    html, err = fetch_html(url)
    if err:
        return {'score': 0, 'explanation': f'Failed to fetch: {err}', 'details': {'error': 'fetch_failed', 'url': url}}

    fields = extract_article_fields(html, url)
    score, explanation = score_url(url, fields)

    return {
        'score': score,
        'explanation': explanation,
        'details': {
            'url': url,
            'title': fields.get('title'),
            'author': fields.get('author'),
            'published': fields.get('published'),
            'num_paragraphs': fields.get('num_paragraphs'),
            'total_links': len(fields.get('all_links', [])),
            'external_links': len(fields.get('external_links', [])),
        },
    }

# Optional quick smoke test:
# print(json.dumps(evaluate_url('https://www.mayoclinic.org/diseases-conditions/dehydration/symptoms-causes/syc-20354086'), indent=2))


# ============================================================
# ======  H Y B R I D   ( R u l e s  +  L i n e a r  R e g ) =
# ============================================================
# This section:
# 1) builds features from evaluate_url() outputs
# 2) creates a labeled dataset of 20 real URLs (0–100 labels)
# 3) runs 5-fold cross-validation for LinearRegression
# 4) fits a final model on ALL available rows
# 5) exposes hybrid_score(url, alpha) for inference
#
# Notes:
# - If some URLs fail to fetch, they're skipped gracefully.
# - CV uses MAE and R^2 to give you both error and fit quality.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from datetime import datetime, timezone
from dateutil import parser as dateparser

# ---------- Feature Engineering ----------
def _safe(value, default=0):
    return value if value is not None else default

def _https_flag(url: str) -> int:
    return 1 if isinstance(url, str) and url.lower().startswith('https://') else 0

def _institutional_tld_flag(url: str) -> int:
    try:
        ext = tldextract.extract(url)
        tld_last = (ext.suffix.split('.')[-1] if ext.suffix else '')
        return 1 if tld_last in {'edu','gov','ac','sch','mil'} else 0
    except Exception:
        return 0

def _days_since(published_iso: str) -> int:
    if not published_iso:
        return 99999  # treat unknown/absent as very old
    try:
        dt = dateparser.parse(published_iso)
        return max(0, (datetime.utcnow() - dt).days)  # OK to mirror your original utc usage
    except Exception:
        return 99999

def features_from_eval(eval_obj: dict) -> dict:
    """
    Build a feature dict using ONLY fields the original scorer returns.
    These features are model-agnostic and cheap to compute.
    """
    d = eval_obj.get('details', {})
    url = d.get('url') or ''
    feats = {
        "https": _https_flag(url),
        "inst_tld": _institutional_tld_flag(url),
        "has_author": 1 if d.get('author') else 0,
        "num_paragraphs": _safe(d.get('num_paragraphs'), 0),
        "total_links": _safe(d.get('total_links'), 0),
        "external_links": _safe(d.get('external_links'), 0),
        "days_since_pub": _days_since(d.get('published')),
    }
    return feats

FEATURE_ORDER = [
    "https",
    "inst_tld",
    "has_author",
    "num_paragraphs",
    "total_links",
    "external_links",
    "days_since_pub",
]

def vectorize_features(feat_dict: dict, feature_order=FEATURE_ORDER):
    """Convert a feature dict to a fixed-order numeric vector."""
    return np.array([feat_dict.get(k, 0) for k in feature_order], dtype=float)

# ---------- Labeled Dataset (20 REAL URLs, mixed quality) ----------
# Labels are illustrative 0–100 targets. Adjust as you refine ground truth.
LABELED_URLS = [
    # Highly credible / institutional / quality editorial
    ("https://www.cdc.gov/flu/about/index.html", 92),
    ("https://www.nih.gov/news-events/news-releases", 88),
    ("https://www.mayoclinic.org/diseases-conditions/dehydration/symptoms-causes/syc-20354086", 85),
    ("https://www.who.int/news-room/fact-sheets/detail/diabetes", 90),
    ("https://www.britannica.com/science/photosynthesis", 86),
    ("https://www.health.harvard.edu/staying-healthy/what-is-intermittent-fasting", 78),
    ("https://www.hopkinsmedicine.org/health/conditions-and-diseases", 84),
    ("https://www.bbc.com/news/science_and_environment", 80),
    ("https://www.reuters.com/world/us/", 80),
    ("https://apnews.com/hub/technology", 78),
    ("https://en.wikipedia.org/wiki/Ice_cream", 75),

    # Mid credibility consumer health / informative
    ("https://www.healthline.com/nutrition/green-tea-and-weight-loss", 70),
    ("https://www.webmd.com/diet/obesity/features/green-tea-and-weight-loss", 68),
    ("https://www.nature.com/scitable/definition/photosynthesis-288/", 82),
    ("https://med.stanford.edu/news/all-news.html", 82),

    # Lower credibility / UGC / lighter editorial
    ("https://medium.com/", 45),
    ("https://www.reddit.com/r/icecreamery/comments/19elt19/looking_for_resources_to_learn_how_to_make_ice/", 20),
    ("https://www.quora.com/Is-green-tea-good-for-weight-loss", 25),
    ("https://www.livestrong.com/article/13715706-green-tea-benefits/", 60),
    ("https://www.buzzfeed.com/", 40),
]

# ---------- Build Dataset by Evaluating & Featurizing ----------
rows_X, rows_y, skipped = [], [], []

for url, label in LABELED_URLS:
    try:
        ev = evaluate_url(url)
        if ev.get("details", {}).get("error") == "fetch_failed":
            skipped.append((url, "fetch_failed"))
            continue
        feats = features_from_eval(ev)
        x = vectorize_features(feats)
        rows_X.append(x)
        rows_y.append(float(label))
    except Exception as e:
        skipped.append((url, str(e)))

X = np.vstack(rows_X) if rows_X else np.zeros((0, len(FEATURE_ORDER)))
y = np.array(rows_y) if rows_y else np.zeros((0,))

print(f"Prepared dataset: {X.shape[0]} rows × {X.shape[1]} features. Skipped: {len(skipped)}")
if skipped:
    print("Skipped examples (first few):", skipped[:3])

# ---------- 5-Fold Cross-Validation ----------
# We use KFold regression CV with two metrics:
#  - MAE (lower is better): absolute error in points of the 0–100 score
#  - R^2 (higher is better): variance explained
if len(X) >= 5:
    k = min(5, len(X))  # guard in case many rows were skipped
    if k < 2:
        print("\nNot enough rows for CV; skipping cross-validation.")
    else:
        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        # MAE (scikit returns negative MAE for loss; we invert sign to report positive MAE)
        mae_scores = cross_val_score(LinearRegression(), X, y, cv=kf, scoring="neg_mean_absolute_error")
        r2_scores  = cross_val_score(LinearRegression(), X, y, cv=kf, scoring="r2")
        mae_vals = -mae_scores  # turn to positive
        print(f"\n5-fold CV (k={k}) results on {len(X)} rows:")
        print(f"  MAE per fold: {np.round(mae_vals, 2)} | mean={mae_vals.mean():.2f}")
        print(f"  R^2 per fold:  {np.round(r2_scores, 3)} | mean={r2_scores.mean():.3f}")
else:
    print("\nNot enough rows for CV; need ≥5 examples.")

# ---------- Fit Final Linear Regression on ALL available rows ----------
linreg = LinearRegression()
if len(X) >= 2:
    linreg.fit(X, y)
    print("\nFinal model (trained on all available rows):")
    for kname, w in zip(FEATURE_ORDER, linreg.coef_):
        print(f"  {kname:>16s}: {w: .3f}")
    print(f"  {'Intercept':>16s}: {linreg.intercept_: .3f}")
else:
    print("\nNot enough rows to train final model (need ≥2). Using neutral ML predictions.")

# ---------- Inference helpers ----------
def predict_ml_score_from_eval(eval_obj: dict) -> float:
    """
    Predict a 0–100 credibility score from features using the trained LinearRegression.
    Falls back to 50.0 if not enough training data is available.
    """
    if len(X) < 2:
        return 50.0
    feats = features_from_eval(eval_obj)
    x = vectorize_features(feats).reshape(1, -1)
    pred = linreg.predict(x)[0]
    return float(np.clip(pred, 0.0, 100.0))

def _stars(score_0_100: float) -> str:
    stars = int(round(score_0_100 / 20))
    stars = max(0, min(5, stars))
    return "★"*stars + "☆"*(5 - stars)

# def hybrid_score(url: str, alpha: float = 0.6) -> dict:
#     """
#     Blend rule-based score with ML-predicted score:
#         final = alpha * rule_score + (1 - alpha) * ml_score
#     Returns a dict with scores, stars, and the rule explanation for transparency.
#     """
#     ev = evaluate_url(url)
#     rule_score = float(ev.get("score", 0.0))
#     ml_score = predict_ml_score_from_eval(ev)
#     final = float(np.clip(alpha * rule_score + (1 - alpha) * ml_score, 0.0, 100.0))
#     return {
#         "url": url,
#         "title": ev.get("details", {}).get("title"),
#         "rule_score": round(rule_score, 1),
#         "ml_score": round(ml_score, 1),
#         "hybrid_score": round(final, 1),
#         "stars": _stars(final),
#         "explanation": ev.get("explanation"),
#         "details": ev.get("details"),
#     }
# REPLACED TO CONSIDER EDGE CASE OF FAILED FETCHING:
def hybrid_score(url: str, alpha: float = 0.6, fetch_fail_floor: float = 20.0) -> dict:
    """
    Blend rule-based score with ML-predicted score:
        final = alpha * rule_score + (1 - alpha) * ml_score

    Edge case handling:
    - If fetch failed, we KEEP a low—but non-zero—score (fetch_fail_floor),
      skip ML, and return a clear explanation about limited accessibility.
    - Otherwise, proceed normally.

    Args:
        url: page to score
        alpha: weight for RULE score (0..1). (1-alpha) goes to ML score.
        fetch_fail_floor: low fallback score (0..100) used when fetch fails.

    Returns:
        dict with rule_score, ml_score, hybrid_score, stars, explanation, details.
    """
    ev = evaluate_url(url)
    details = ev.get("details", {})
    rule_score = float(ev.get("score", 0.0))

    # 🧩 Edge case: fetch failed
    if details.get("error") == "fetch_failed":
        final = float(np.clip(fetch_fail_floor, 0.0, 100.0))  # keep a LOW score, not zero
        # Friendly explanation (prepend ours, keep the original failure note too if you like)
        friendly = (
            "Credibility is low due to limited accessible information — "
            "the page could not be fetched or analyzed (e.g., the site blocked automated requests)."
        )
        base_expl = ev.get("explanation") or ""
        full_expl = f"{friendly} {(' | ' + base_expl) if base_expl else ''}".strip()

        # Stars from final score
        stars = int(round(final / 20))
        stars = max(0, min(5, stars))
        star_str = "★" * stars + "☆" * (5 - stars)

        return {
            "url": url,
            "title": details.get("title"),
            "rule_score": round(rule_score, 1),  # usually 0 on fetch fail, but shown for transparency
            "ml_score": 0.0,                      # we skip ML when fetch failed
            "hybrid_score": round(final, 1),
            "stars": star_str,
            "explanation": full_expl,
            "details": details,
        }

    # ✅ Normal path: combine rules + ML
    ml_score = predict_ml_score_from_eval(ev)
    final = float(np.clip(alpha * rule_score + (1 - alpha) * ml_score, 0.0, 100.0))

    # Stars
    stars = int(round(final / 20))
    stars = max(0, min(5, stars))
    star_str = "★" * stars + "☆" * (5 - stars)

    return {
        "url": url,
        "title": details.get("title"),
        "rule_score": round(rule_score, 1),
        "ml_score": round(ml_score, 1),
        "hybrid_score": round(final, 1),
        "stars": star_str,
        "explanation": ev.get("explanation", "No detailed explanation available."),
        "details": details,
    }

# ---------- Demo on a few URLs ----------
demo_urls = [
    "https://www.cdc.gov/flu/about/index.html",
    "https://www.mayoclinic.org/diseases-conditions/dehydration/symptoms-causes/syc-20354086",
    "https://www.healthline.com/nutrition/green-tea-and-weight-loss",
    "https://www.reddit.com/r/icecreamery/comments/19elt19/looking_for_resources_to_learn_how_to_make_ice/",
    "https://www.buzzfeed.com/"
]

for u in demo_urls:
    try:
        res = hybrid_score(u, alpha=0.6)  # 60% rules / 40% ML
        print("\n" + "—"*70)
        print(f"🔗 {res['url']}")
        print(f"📰 {res['title'] or '[No Title]'}")
        print(f"⭐ {res['stars']}  ({res['hybrid_score']}/100)  |  Rule: {res['rule_score']}  ML: {res['ml_score']}")
        print(f"📝 Why: {res['explanation']}")
    except Exception as e:
        print(f"Error on {u}: {e}")


  if (datetime.utcnow() - dt).days <= 3650:
  return max(0, (datetime.utcnow() - dt).days)  # OK to mirror your original utc usage


Prepared dataset: 14 rows × 7 features. Skipped: 6
Skipped examples (first few): [('https://www.health.harvard.edu/staying-healthy/what-is-intermittent-fasting', 'fetch_failed'), ('https://www.webmd.com/diet/obesity/features/green-tea-and-weight-loss', 'fetch_failed'), ('https://www.nature.com/scitable/definition/photosynthesis-288/', 'fetch_failed')]

5-fold CV (k=5) results on 14 rows:
  MAE per fold: [ 40.78 135.69  17.67 113.8    5.25] | mean=62.64
  R^2 per fold:  [-7.93610e+01 -1.51254e+02  1.54000e-01 -6.42219e+02 -8.30000e-01] | mean=-174.702

Final model (trained on all available rows):
             https:  0.000
          inst_tld:  25.909
        has_author: -5.094
    num_paragraphs:  0.191
       total_links:  0.013
    external_links:  0.011
    days_since_pub: -0.000
         Intercept:  82.622

——————————————————————————————————————————————————————————————————————
🔗 https://www.cdc.gov/flu/about/index.html
📰 About Influenza | Influenza (Flu) | CDC
⭐ ★★★★★  (93.6/100)  |

In [None]:
# @title
# # DELIVERABLE_1
# # ============================================
# # FILE: credibility_scoring.py
# # ============================================
# # This module implements the credibility scoring logic for articles

# import re, time, json, math, tldextract, requests
# from urllib.parse import urlparse
# from datetime import datetime
# from dateutil import parser as dateparser
# from bs4 import BeautifulSoup

# # Define a custom user agent to mimic a browser
# USER_AGENT = 'Mozilla/5.0 (CredibilityPOC/0.1)'
# DEFAULT_TIMEOUT = 12

# # List of common clickbait terms to detect low-quality articles
# CLICKBAIT_TERMS = [
#     "you won't believe", 'shocking', 'jaw-dropping', 'what happened next',
#     'unbelievable', 'miracle', 'exposed', "secret they don't want you to know"
# ]

# # Signals of editorial transparency that indicate credibility
# TRANSPARENCY_HINTS = [
#     'author','byline','by ','by:','written by','editor','editorial',
#     'fact-check','fact check','sources','references','citations',
#     'methodology','about us','about the author','corrections','disclosures'
# ]

# # Institutional top-level domains are considered more credible
# INSTITUTIONAL_TLDS = {'edu','gov','ac','sch','mil'}

# # Function to fetch the HTML content of a URL
# def fetch_html(url: str):
#     try:
#         headers = {'User-Agent': USER_AGENT}
#         resp = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT)
#         resp.raise_for_status()
#         return resp.text, None
#     except Exception as e:
#         return None, f'Fetch error: {e}'

# # Function to extract useful metadata and content from an article
# def extract_article_fields(html: str, url: str):
#     soup = BeautifulSoup(html, 'lxml')
#     text_chunks, title, author, published = [], None, None, None

#     # Attempt to extract the title from multiple locations
#     if soup.title and soup.title.string:
#         title = soup.title.string.strip()
#     mt = soup.find('meta', attrs={'property':'og:title'}) or soup.find('meta', attrs={'name':'title'})
#     if not title and mt and mt.get('content'):
#         title = mt['content'].strip()

#     # Try to find author info
#     for selector in [
#         {'name':'meta','attrs':{'name':'author'}},
#         {'name':'meta','attrs':{'property':'article:author'}},
#         {'name':'span','class_':re.compile('author|byline', re.I)},
#         {'name':'div','class_':re.compile('author|byline', re.I)},
#         {'name':'a','class_':re.compile('author', re.I)},
#     ]:
#         if selector['name']=='meta':
#             node = soup.find('meta', attrs=selector['attrs'])
#             if node and node.get('content'):
#                 author = node['content'].strip(); break
#         else:
#             node = soup.find(selector['name'], class_=selector.get('class_'))
#             if node and node.get_text(strip=True):
#                 candidate = node.get_text(' ', strip=True)
#                 if len(candidate) >= 3:
#                     author = candidate; break

#     # Try to find a publish date
#     for date_sel in [
#         {'name':'meta','attrs':{'property':'article:published_time'}},
#         {'name':'meta','attrs':{'name':'date'}},
#         {'name':'time','attrs':{}},
#         {'name':'span','class_':re.compile('date|time', re.I)},
#     ]:
#         if date_sel['name']=='meta':
#             node = soup.find('meta', attrs=date_sel['attrs'])
#             if node and node.get('content'):
#                 try:
#                     published = dateparser.parse(node['content'], fuzzy=True); break
#                 except Exception:
#                     pass
#         else:
#             node = soup.find(date_sel['name'], class_=date_sel.get('class_'))
#             if node and node.get_text(strip=True):
#                 try:
#                     published = dateparser.parse(node.get_text(strip=True), fuzzy=True); break
#                 except Exception:
#                     pass

#     # Extract paragraph text from the article
#     main_container = None
#     for cls in ['article','post','story','content','entry-content','article-body']:
#         mc = soup.find(True, class_=re.compile(cls, re.I))
#         if mc: main_container = mc; break

#     paragraphs = (main_container.find_all('p') if main_container else soup.find_all('p'))
#     for p in paragraphs:
#         t = p.get_text(' ', strip=True)
#         if t and len(t) > 40: text_chunks.append(t)

#     article_text = '\n\n'.join(text_chunks)[:100000]

#     # Extract links and count external references
#     all_links, external_links = [], []
#     base_host = urlparse(url).netloc.lower()
#     for a in soup.find_all('a', href=True):
#         href = a['href']
#         if href.startswith('http://') or href.startswith('https://'):
#             all_links.append(href)
#             if urlparse(href).netloc.lower() != base_host:
#                 external_links.append(href)

#     # Check for transparency language
#     full_text_for_hints = (article_text + ' ' + ' '.join(TRANSPARENCY_HINTS)).lower()

#     return {
#         'title': title,
#         'author': author,
#         'published': published.isoformat() if published else None,
#         'text': article_text,
#         'num_paragraphs': len(text_chunks),
#         'all_links': all_links,
#         'external_links': external_links,
#         'has_transparency_hints': any(h in full_text_for_hints for h in TRANSPARENCY_HINTS),
#     }

# # Function to calculate a credibility score based on article metadata and content
# def score_url(url: str, fields: dict):
#     explanation_bits = []
#     score = 50  # Start from neutral score

#     # + HTTPS is a sign of trust
#     if url.lower().startswith('https://'):
#         score += 12; explanation_bits.append('+12: uses HTTPS')
#     else:
#         score -= 10; explanation_bits.append('-10: not using HTTPS')

#     # + TLD check (institutional domains boost score)
#     ext = tldextract.extract(url)
#     tld_last = (ext.suffix.split('.')[-1] if ext.suffix else '')
#     if tld_last in INSTITUTIONAL_TLDS:
#         score += 14; explanation_bits.append(f'+14: institutional TLD ({tld_last})')

#     # + Author attribution
#     if fields.get('author'):
#         score += 10; explanation_bits.append('+10: author/byline found')
#     else:
#         score -= 6; explanation_bits.append('-6: no clear author/byline')

#     # + Publication recency
#     published = fields.get('published')
#     if published:
#         try:
#             dt = dateparser.parse(published)
#             if (datetime.utcnow() - dt).days <= 3650:
#                 score += 6; explanation_bits.append('+6: reasonably recent publication date')
#             else:
#                 score -= 4; explanation_bits.append('-4: appears quite old')
#         except Exception:
#             explanation_bits.append('0: could not parse publication date reliably')
#     else:
#         explanation_bits.append('0: no publication date found')

#     # + References/links
#     total_links = len(fields.get('all_links', []))
#     external_links_count = len(fields.get('external_links', []))
#     if total_links >= 5 and external_links_count >= 3:
#         score += 10; explanation_bits.append(f'+10: provides references (links: {total_links}, external: {external_links_count})')
#     elif total_links >= 2:
#         score += 4; explanation_bits.append(f'+4: some references (links: {total_links})')
#     else:
#         score -= 6; explanation_bits.append(f'-6: minimal/no references (links: {total_links})')

#     # + Article length
#     num_paras = fields.get('num_paragraphs', 0)
#     if num_paras >= 8:
#         score += 6; explanation_bits.append('+6: substantive article length')
#     elif num_paras >= 3:
#         score += 2; explanation_bits.append('+2: moderate article length')
#     else:
#         score -= 6; explanation_bits.append('-6: very short article text')

#     # - Clickbait detection
#     text_lower = (fields.get('text') or '').lower()
#     clickbait_hits = sum(1 for term in CLICKBAIT_TERMS if term in text_lower)
#     if clickbait_hits >= 2:
#         score -= 10; explanation_bits.append('-10: strong clickbait indicators')
#     elif clickbait_hits == 1:
#         score -= 4; explanation_bits.append('-4: mild clickbait indicators')

#     # - Advertising/sponsored content signal
#     ad_signals = len(re.findall(r"advertis(e|ement)|sponsor(ed|ship)", text_lower))
#     iframes_penalty = min(8, math.floor(ad_signals / 5) * 2)
#     if iframes_penalty:
#         score -= iframes_penalty; explanation_bits.append(f'-{iframes_penalty}: advertising/sponsorship language')

#     score = max(0, min(100, int(round(score))))
#     explanation = '; '.join(explanation_bits)
#     return score, explanation

# # Main function to evaluate the URL and return all metadata, score, and explanation
# def evaluate_url(url: str):
#     if not isinstance(url, str) or not url.strip():
#         return {'score': 0, 'explanation': 'Invalid URL input.', 'details': {'error': 'empty_or_non_string'}}

#     html, err = fetch_html(url)
#     if err:
#         return {'score': 0, 'explanation': f'Failed to fetch: {err}', 'details': {'error': 'fetch_failed'}}

#     fields = extract_article_fields(html, url)
#     score, explanation = score_url(url, fields)

#     return {
#         'score': score,
#         'explanation': explanation,
#         'details': {
#             'url': url,
#             'title': fields.get('title'),
#             'author': fields.get('author'),
#             'published': fields.get('published'),
#             'num_paragraphs': fields.get('num_paragraphs'),
#             'total_links': len(fields.get('all_links', [])),
#             'external_links': len(fields.get('external_links', [])),
#         },
#     }

# # Sample test
# if __name__ == "__main__":
#     res = evaluate_url('https://www.mayoclinic.org/diseases-conditions/dehydration/symptoms-causes/syc-20354086')
#     print(json.dumps(res, indent=2))

In [None]:
# @title
# VERSION_1
# import requests, os

# SERP_API_KEY = os.getenv("SERP_API_KEY")

# def search_google(query, num_results=3):
#     url = "https://serpapi.com/search"
#     params = {
#         "q": query,
#         "api_key": SERP_API_KEY,
#         "num": num_results,
#         "engine": "google",
#         "hl": "en",
#     }
#     response = requests.get(url, params=params)
#     data = response.json()
#     links = [r.get("link") for r in data.get("organic_results", []) if r.get("link")]
#     return links

In [17]:
# --- Improved SerpAPI search (clean results, de-dupe, filter, robust errors) ---
import os, re, requests
from urllib.parse import urlparse

SERP_API_KEY = os.getenv("SERP_API_KEY")  # set this earlier in your notebook

# Common low-signal/social/video domains to exclude by default (tune as needed)
_DEFAULT_EXCLUDE_DOMAINS = {
    "reddit.com", "www.reddit.com", "old.reddit.com",
    "x.com", "twitter.com", "www.twitter.com",
    "tiktok.com", "www.tiktok.com",
    "pinterest.com", "www.pinterest.com",
    "facebook.com", "www.facebook.com",
    "instagram.com", "www.instagram.com",
    "youtube.com", "www.youtube.com", "youtu.be"
}

# Skip obvious non-article filetypes
_SKIP_FILETYPES = re.compile(r"\.(pdf|pptx?|docx?|xlsx?|zip|rar)(?:$|\?)", re.I)

def _host(url: str) -> str:
    return urlparse(url).netloc.lower()

def search_google(
    query: str,
    num_results: int = 5,
    exclude_domains: set | None = None,
    allow_news_results: bool = True,
    timeout: int = 20,
) -> list[dict]:
    """
    Search Google via SerpAPI and return a clean list of results:
        [{"title": str, "link": str, "snippet": str}, ...]
    - De-duplicates by link and avoids flooding from the same host
    - Skips social/UGC/video sites and non-HTML filetypes
    - Optionally includes Google News results
    """
    if not SERP_API_KEY:
        raise RuntimeError("Missing SERP_API_KEY. Set os.environ['SERP_API_KEY'] first.")

    exclude = set(_DEFAULT_EXCLUDE_DOMAINS)
    if exclude_domains:
        exclude |= set(exclude_domains)

    params = {
        "engine": "google",
        "q": query,
        "api_key": SERP_API_KEY,
        "num": 10,          # pull extra, then filter/trim to num_results
        "hl": "en",
        "gl": "us",
        "safe": "active",   # optional: reduce NSFW
    }

    try:
        resp = requests.get("https://serpapi.com/search", params=params, timeout=timeout)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        # On failure, return empty list (caller can handle and message user)
        print(f"[SerpAPI] Error: {e}")
        return []

    candidates: list[dict] = []

    # 1) Organic results
    for r in (data.get("organic_results") or []):
        link = r.get("link")
        title = r.get("title")
        snippet = r.get("snippet") or ""
        if not link or _SKIP_FILETYPES.search(link):
            continue
        host = _host(link)
        if host in exclude:
            continue
        candidates.append({"title": title, "link": link, "snippet": snippet})

    # 2) Optional: News results (helpful for timely topics)
    if allow_news_results:
        for r in (data.get("news_results") or []):
            link = r.get("link")
            title = r.get("title")
            snippet = r.get("snippet") or ""
            if not link or _SKIP_FILETYPES.search(link):
                continue
            host = _host(link)
            if host in exclude:
                continue
            candidates.append({"title": title, "link": link, "snippet": snippet})

    # De-duplicate by link; also avoid over-representing a single host
    cleaned, seen_links, seen_hosts = [], set(), set()
    for c in candidates:
        link, host = c["link"], _host(c["link"])
        if link in seen_links:
            continue
        # If we already have enough and this host is duplicate, skip
        if host in seen_hosts and len(cleaned) >= num_results:
            continue
        seen_links.add(link)
        seen_hosts.add(host)
        cleaned.append(c)
        if len(cleaned) >= num_results:
            break

    return cleaned

In [18]:
import os, google.generativeai as genai

# 1) Configure the SDK with your key (make sure you set os.environ["GEMINI_API_KEY"] earlier)
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# 2) Use a current model name (the old "gemini-pro" on v1beta throws 404)
#    Options: "gemini-1.5-flash" (faster/cheaper) or "gemini-1.5-pro" (stronger).
MODEL_NAME = "gemini-1.5-flash"
model = genai.GenerativeModel(MODEL_NAME)

def summarize_text(text: str, query: str) -> str:
    """
    Summarize `text` for the user's `query` using Gemini.
    Falls back to a simple extractive summary if the API call fails.
    """
    if not text:
        return "[No article text extracted to summarize.]"

    # keep prompt compact; long inputs can hit limits
    clipped = text[:4000]

    prompt = (
        "You are a helpful assistant. The user asked:\n"
        f"{query}\n\n"
        "Based on the following article content, write a concise 3–5 sentence summary "
        "that directly helps answer the user's question. Avoid fluff; be factual.\n\n"
        f"Article:\n{clipped}"
    )

    try:
        resp = model.generate_content(prompt)
        # Some SDK versions use resp.text; keep a safe fallback
        out = getattr(resp, "text", None)
        return out.strip() if out else "[Empty response from model.]"
    except Exception as e:
        # Fallback: simple extractive summary (first few sentences)
        import re
        sents = re.split(r'(?<=[.!?])\s+', clipped)
        return " ".join(sents[:3]) + f"\n\n[Gemini fallback due to error: {e}]"

In [19]:
# @title
# VERSION_1
# chat_history = []

# def display_response(title, url, summary, score):
#     stars = "★" * round(score / 20)
#     print(f"\n📰 Title: {title or '[No Title]'}")
#     print(f"🔗 Link: {url}")
#     print(f"📄 Summary: {summary}")
#     print(f"⭐ Credibility: {stars} ({score}/100)\n")

# def run_chatbot():
#     print("🤖 Hello! I'm CredBot. What would you like to learn about today? FYI: To end session please enter 'exit' at any time!")
#     while True:
#         user_input = input("\nYou: ").strip()
#         if user_input.lower() in {"exit", "quit", "bye"}:
#             print("👋 Goodbye!")
#             break

#         chat_history.append({"role": "user", "content": user_input})
#         print("🔍 Searching Google...")
#         links = search_google(user_input)

#         if not links:
#             print("⚠️ No relevant articles found.")
#             continue

#         for link in links:
#             print(f"\nEvaluating: {link}")
#             result = evaluate_url(link)
#             score = result["score"]
#             details = result["details"]
#             title = details.get("title", "Untitled")
#             text = details.get("text") or ""
#             summary = summarize_text(text, user_input)
#             display_response(title, link, summary, score)

#         chat_history.append({"role": "bot", "content": f"Returned {len(links)} articles."})

# run_chatbot()

🤖 Hello! I'm CredBot. What would you like to learn about today? FYI: To end session please enter 'exit' at any time!

You: I want to learn about ice cream
🔍 Searching Google...

Evaluating: {'title': 'Ice cream', 'link': 'https://en.wikipedia.org/wiki/Ice_cream', 'snippet': 'Ice cream is a frozen dessert typically made from milk or cream that has been flavoured with a sweetener, either sugar or an alternative, and a spice'}

📰 Title: Untitled
🔗 Link: {'title': 'Ice cream', 'link': 'https://en.wikipedia.org/wiki/Ice_cream', 'snippet': 'Ice cream is a frozen dessert typically made from milk or cream that has been flavoured with a sweetener, either sugar or an alternative, and a spice'}
📄 Summary: [No article text extracted to summarize.]
⭐ Credibility:  (0/100)


Evaluating: {'title': 'Ice Cream Science', 'link': 'https://www.dreamscoops.com/ice-cream-science/', 'snippet': "The fats also give ice cream it's creamy texture and richness. Higher fat ice creams are rich and creamy with a lon

In [None]:
# @title
# VERSION_1 OUTPUT
# 🤖 Hello! I'm CredBot. What would you like to learn about today? FYI: To end session please enter 'exit' at any time!

# You: I want to learn about ice cream
# 🔍 Searching Google...

# Evaluating: {'title': 'Ice cream', 'link': 'https://en.wikipedia.org/wiki/Ice_cream', 'snippet': 'Ice cream is a frozen dessert typically made from milk or cream that has been flavoured with a sweetener, either sugar or an alternative, and a spice'}

# 📰 Title: Untitled
# 🔗 Link: {'title': 'Ice cream', 'link': 'https://en.wikipedia.org/wiki/Ice_cream', 'snippet': 'Ice cream is a frozen dessert typically made from milk or cream that has been flavoured with a sweetener, either sugar or an alternative, and a spice'}
# 📄 Summary: [No article text extracted to summarize.]
# ⭐ Credibility:  (0/100)


# Evaluating: {'title': 'Ice Cream Science', 'link': 'https://www.dreamscoops.com/ice-cream-science/', 'snippet': "The fats also give ice cream it's creamy texture and richness. Higher fat ice creams are rich and creamy with a long lingering after-taste."}

# 📰 Title: Untitled
# 🔗 Link: {'title': 'Ice Cream Science', 'link': 'https://www.dreamscoops.com/ice-cream-science/', 'snippet': "The fats also give ice cream it's creamy texture and richness. Higher fat ice creams are rich and creamy with a long lingering after-taste."}
# 📄 Summary: [No article text extracted to summarize.]
# ⭐ Credibility:  (0/100)


# Evaluating: {'title': "15 Ice Cream Fun Facts That You Didn't Know", 'link': 'https://www.mymochi.com/blog/fun-facts-about-ice-cream-that-you-didnt-know/', 'snippet': 'It takes 12 pounds of milk to produce just 1 gallon of ice cream · The average number of licks to finish a scoop of ice cream is 50 · The country that consumes ...'}

# 📰 Title: Untitled
# 🔗 Link: {'title': "15 Ice Cream Fun Facts That You Didn't Know", 'link': 'https://www.mymochi.com/blog/fun-facts-about-ice-cream-that-you-didnt-know/', 'snippet': 'It takes 12 pounds of milk to produce just 1 gallon of ice cream · The average number of licks to finish a scoop of ice cream is 50 · The country that consumes ...'}
# 📄 Summary: [No article text extracted to summarize.]
# ⭐ Credibility:  (0/100)


# Evaluating: {'title': 'The History of Ice Cream - IDFA', 'link': 'https://www.idfa.org/the-history-of-ice-cream', 'snippet': "Ice cream's origins are known to reach back as far as the second century BC, although no specific date of origin nor inventor has been undisputably credited ..."}

# 📰 Title: Untitled
# 🔗 Link: {'title': 'The History of Ice Cream - IDFA', 'link': 'https://www.idfa.org/the-history-of-ice-cream', 'snippet': "Ice cream's origins are known to reach back as far as the second century BC, although no specific date of origin nor inventor has been undisputably credited ..."}
# 📄 Summary: [No article text extracted to summarize.]
# ⭐ Credibility:  (0/100)


# Evaluating: {'title': 'The Scoop: Getting to Know Ice Cream', 'link': 'https://www.ice.edu/blog/scoop-getting-know-ice-cream', 'snippet': 'Chef Jenny McCoy dives into the history and science of ice cream, from soft serve to Philly-style to gelato.'}

# 📰 Title: Untitled
# 🔗 Link: {'title': 'The Scoop: Getting to Know Ice Cream', 'link': 'https://www.ice.edu/blog/scoop-getting-know-ice-cream', 'snippet': 'Chef Jenny McCoy dives into the history and science of ice cream, from soft serve to Philly-style to gelato.'}
# 📄 Summary: [No article text extracted to summarize.]
# ⭐ Credibility:  (0/100)


# You: exit
# 👋 Goodbye!


In [20]:
# ============================
# Simple console chatbot (CredBot) using HYBRID scoring
# ============================

chat_history = []

def display_response(title, url, summary, hybrid_dict):
    """
    Pretty-prints one search result using the hybrid scorer output.
    `hybrid_dict` is the object returned by hybrid_score(url).
    """
    score = hybrid_dict.get("hybrid_score", 0.0)
    stars = hybrid_dict.get("stars") or ("★" * int(round(score / 20)))
    why = hybrid_dict.get("explanation", "").strip()

    print(f"\n📰 Title: {title or '[No Title]'}")
    print(f"🔗 Link: {url}")
    print(f"📄 Summary: {summary}")
    print(f"⭐ Credibility: {stars} ({score}/100)")
    if why:
        print(f"📝 Why: {why}\n")

def run_chatbot():
    print("🤖 Hello! I'm CredBot. What would you like to learn about today? FYI: To end session please enter 'exit' at any time!")
    while True:
        user_input = input("\nYou: ").strip()
        if user_input.lower() in {"exit", "quit", "bye"}:
            print("👋 Goodbye!")
            break

        chat_history.append({"role": "user", "content": user_input})
        print("🔍 Searching Google...")

        # NOTE: our improved search_google returns a list of dicts:
        # [{"title": str, "link": str, "snippet": str}, ...]
        results = search_google(user_input, num_results=5)

        if not results:
            print("⚠️ No relevant articles found.")
            continue

        for item in results:
            title = item.get("title") or "[No Title]"
            link = item.get("link")
            snippet = item.get("snippet") or ""

            if not link:
                continue

            print(f"\nEvaluating: {link}")

            # 1) Get hybrid credibility score (uses your rule+ML logic)
            hybrid = hybrid_score(link, alpha=0.6)

            # 2) Build summary text
            #    Prefer full article text; fall back to the search snippet if needed.
            #    evaluate_url() does not return article text in 'details', so we fetch & parse here.
            article_text = ""
            html, err = fetch_html(link)
            if not err and html:
                try:
                    fields = extract_article_fields(html, link)
                    article_text = fields.get("text") or ""
                except Exception:
                    article_text = ""

            # If we still don't have content, summarize the snippet (so user sees something useful).
            summary_input = article_text if article_text else snippet
            summary = summarize_text(summary_input, user_input)

            # 3) Display nicely with hybrid score + “why”
            display_response(title, link, summary, hybrid)

        chat_history.append({"role": "bot", "content": f"Returned {len(results)} articles."})

# Run it
run_chatbot()

🤖 Hello! I'm CredBot. What would you like to learn about today? FYI: To end session please enter 'exit' at any time!

You: i want to learn about ice cream
🔍 Searching Google...

Evaluating: https://en.wikipedia.org/wiki/Ice_cream


  if (datetime.utcnow() - dt).days <= 3650:
  return max(0, (datetime.utcnow() - dt).days)  # OK to mirror your original utc usage



📰 Title: Ice cream
🔗 Link: https://en.wikipedia.org/wiki/Ice_cream
📄 Summary: Ice cream is a frozen dessert typically made from milk or cream that has been flavoured with a sweetener, either sugar or an alternative, and a spice

[Gemini fallback due to error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.]
⭐ Credibility: ★★★★☆ (76.6/100)
📝 Why: +12: uses HTTPS; +10: author/byline found; -4: appears quite old; +10: provides references (links: 308, external: 307); -6: very short article text


Evaluating: https://www.dreamscoops.com/ice-cream-science/





📰 Title: Ice Cream Science
🔗 Link: https://www.dreamscoops.com/ice-cream-science/
📄 Summary: Ice cream contains solid ice and fat, liquid sugar solution, and gas air bubbles. Ice crystals give firmness, air gives softness, and fat ...

[Gemini fallback due to error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.]
⭐ Credibility: ★★★☆☆ (50.6/100)
📝 Why: +12: uses HTTPS; -6: no clear author/byline; 0: no publication date found; -6: minimal/no references (links: 0); -6: very short article text


Evaluating: https://www.mymochi.com/blog/fun-facts-about-ice-cream-that-you-didnt-know/


  if (datetime.utcnow() - dt).days <= 3650:
  return max(0, (datetime.utcnow() - dt).days)  # OK to mirror your original utc usage



📰 Title: 15 Ice Cream Fun Facts That You Didn't Know
🔗 Link: https://www.mymochi.com/blog/fun-facts-about-ice-cream-that-you-didnt-know/
📄 Summary: Do you love ice cream as much as we do? If you can’t go a day, a week, or even a month without the delicious flavors of ice cream, then we know how you feel. While you may be a pro at eating ice cream, did you know that there are some really interesting facts about ice cream?

[Gemini fallback due to error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.]
⭐ Credibility: ★★★☆☆ (69.4/100)
📝 Why: +12: uses HTTPS; +10: author/byline found; 0: could not parse publication date reliably; +4: some references (links: 31); +2: moderate article length


Evaluating: https://www.ice.edu/bl




📰 Title: The Scoop: Getting to Know Ice Cream
🔗 Link: https://www.ice.edu/blog/scoop-getting-know-ice-cream
📄 Summary: Chef Jenny McCoy dives into the history and science of ice cream, from soft serve to Philly-style to gelato.

[Gemini fallback due to error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.]
⭐ Credibility: ★☆☆☆☆ (20.0/100)
📝 Why: Credibility is low due to limited accessible information — the page could not be fetched or analyzed (e.g., the site blocked automated requests).  | Failed to fetch: Fetch error: 403 Forbidden


Evaluating: https://www.idfa.org/the-history-of-ice-cream





📰 Title: The History of Ice Cream - IDFA
🔗 Link: https://www.idfa.org/the-history-of-ice-cream
📄 Summary: Ice cream's origins are known to reach back as far as the second century B.C., although no specific date of origin nor inventor has been undisputably credited with its discovery. We know that Alexander the Great enjoyed snow and ice flavored with honey and nectar. Biblical references also show that King Solomon was fond of iced drinks during harvesting.

[Gemini fallback due to error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.]
⭐ Credibility: ★★★☆☆ (65.8/100)
📝 Why: +12: uses HTTPS; -6: no clear author/byline; 0: no publication date found; +10: provides references (links: 39, external: 10); +2: moderate article l

In [22]:
# ============================================================
# DELIVERABLE_2: Simple manual test on 5 ice cream URLs
# ============================================================

test_links = [
    "https://en.wikipedia.org/wiki/Ice_cream",
    "https://www.dreamscoops.com/ice-cream-science/",
    "https://www.mymochi.com/blog/fun-facts-about-ice-cream-that-you-didnt-know/",
    "https://www.idfa.org/the-history-of-ice-cream",
    "https://www.ice.edu/blog/scoop-getting-know-ice-cream",
]

print("🧪 Running hybrid credibility scoring test on 5 URLs...\n")

for url in test_links:
    try:
        result = hybrid_score(url, alpha=0.6)  # uses rule + ML
        print("—" * 70)
        print(f"🔗 {url}")
        print(f"⭐ {result['stars']}  ({result['hybrid_score']}/100)")
        print(f"   • Rule score: {result['rule_score']}")
        print(f"   • ML score:   {result['ml_score']}")
        print(f"📝 Why: {result['explanation']}\n")
    except Exception as e:
        print(f"⚠️ Error scoring {url}: {e}")

print("✅ Test complete!")

🧪 Running hybrid credibility scoring test on 5 URLs...



  if (datetime.utcnow() - dt).days <= 3650:
  return max(0, (datetime.utcnow() - dt).days)  # OK to mirror your original utc usage


——————————————————————————————————————————————————————————————————————
🔗 https://en.wikipedia.org/wiki/Ice_cream
⭐ ★★★★☆  (76.6/100)
   • Rule score: 72.0
   • ML score:   83.5
📝 Why: +12: uses HTTPS; +10: author/byline found; -4: appears quite old; +10: provides references (links: 308, external: 307); -6: very short article text

——————————————————————————————————————————————————————————————————————
🔗 https://www.dreamscoops.com/ice-cream-science/
⭐ ★★★☆☆  (50.6/100)
   • Rule score: 44.0
   • ML score:   60.5
📝 Why: +12: uses HTTPS; -6: no clear author/byline; 0: no publication date found; -6: minimal/no references (links: 0); -6: very short article text



  if (datetime.utcnow() - dt).days <= 3650:
  return max(0, (datetime.utcnow() - dt).days)  # OK to mirror your original utc usage


——————————————————————————————————————————————————————————————————————
🔗 https://www.mymochi.com/blog/fun-facts-about-ice-cream-that-you-didnt-know/
⭐ ★★★☆☆  (69.4/100)
   • Rule score: 78.0
   • ML score:   56.6
📝 Why: +12: uses HTTPS; +10: author/byline found; 0: could not parse publication date reliably; +4: some references (links: 31); +2: moderate article length

——————————————————————————————————————————————————————————————————————
🔗 https://www.idfa.org/the-history-of-ice-cream
⭐ ★★★☆☆  (65.8/100)
   • Rule score: 68.0
   • ML score:   62.5
📝 Why: +12: uses HTTPS; -6: no clear author/byline; 0: no publication date found; +10: provides references (links: 39, external: 10); +2: moderate article length

——————————————————————————————————————————————————————————————————————
🔗 https://www.ice.edu/blog/scoop-getting-know-ice-cream
⭐ ★☆☆☆☆  (20.0/100)
   • Rule score: 0.0
   • ML score:   0.0
📝 Why: Credibility is low due to limited accessible information — the page could not be fetche

In [23]:
import requests
url = "https://www.ice.edu/blog/scoop-getting-know-ice-cream"
r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=12)
print(r.status_code, r.headers.get("Content-Type"))
print(r.text[:500])

403 text/html; charset=UTF-8
<!DOCTYPE html><html lang="en-US"><head><title>Just a moment...</title><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="robots" content="noindex,nofollow"><meta name="viewport" content="width=device-width,initial-scale=1"><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131;font-family:system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helve


Deliverable 2 Summary:
This phase integrates Google Search (SERP API), the rule-based credibility engine, and a lightweight Linear Regression model into one hybrid scoring pipeline. It features smarter HTML fetching with header rotation, back-off retries, and explicit edge-case handling for blocked sites. Twenty real URLs are labeled and used to train and 5-fold-validate the ML model, producing blended scores (rules + ML) with clear explanations and star ratings. A manual test cell confirms that each link runs end-to-end—from fetching and feature extraction to final hybrid credibility output—demonstrating a robust, interpretable scoring system ready for chatbot integration in Deliverable 3.


DELIVERABLE 3 needs to better intergrate the gemini chat bot so that it doesn't just search for links right away and actually provides a back and forth conversation. I also need to figure out a better solution to the credibility function not being able to access some websites, maybe i won't have the chat bot include those links.

But for now, deliverable 2 has the function working properly and tackling edge cases such as providing low score and a reasoning behind the low score when not being able to access contents of a link.