# SEO Content Detector Pipeline
_Notebook: `seo_pipeline.ipynb`_

This notebook implements the full assignment pipeline:
A) Setup & Data Load → B) HTML → Clean Text → C) Feature Engineering → D) Duplicate Detection → E) Quality Labels & Model → F) Real-time `analyze_url(url)` Demo → G) Save Outputs

**Repo layout** (expected):
```
seo-content-detector/
├─ data/
│  ├─ data.csv
│  ├─ extracted_content.csv
│  ├─ features.csv
│  └─ duplicates.csv
├─ notebooks/
│  └─ seo_pipeline.ipynb
├─ models/
│  └─ quality_model.pkl (created later)
└─ requirements.txt (recommended)
```


## A. Setup & Data Load

In [2]:
# %pip installs into the running kernel's environment; pinned to the release this notebook was run with.
%pip install tldextract==5.1.2

Collecting tldextract
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
Collecting requests-file>=1.4
  Downloading requests_file-3.0.1-py2.py3-none-any.whl (4.5 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-3.0.1 tldextract-5.1.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
# %pip installs into the running kernel's environment; pinned to the release this notebook was run with.
%pip install textstat==0.7.10

Collecting textstat
  Downloading textstat-0.7.10-py3-none-any.whl (239 kB)
Collecting pyphen
  Downloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.10
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Imports
import os, re, json, time, math, itertools, warnings
from pathlib import Path

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import tldextract
import textstat

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# NLTK sentence-tokenizer data (quiet).
# Newer NLTK releases look up the 'punkt_tab' resource instead of 'punkt', so
# request both. With quiet=True a resource that cannot be fetched is reported
# as a failed download (return value False) instead of raising.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

# Paths
# The project root can be overridden with the SEO_DETECTOR_ROOT environment
# variable so the notebook also runs on other machines; the original location
# is kept as the default.
ROOT = Path(os.environ.get("SEO_DETECTOR_ROOT", "C:/Users/AISHWARYA/Downloads/seo-content-detector"))
DATA = ROOT / "data"
MODELS = ROOT / "models"

DATA.mkdir(parents=True, exist_ok=True)
MODELS.mkdir(parents=True, exist_ok=True)

# Load dataset
data_path = DATA / "data.csv"

df = pd.read_csv(data_path)


# Section B requires the raw HTML column; record whether it is present.
has_html = 'html_content' in df.columns
print("Rows:", len(df), "| Columns:", list(df.columns))
print("Has html_content:", has_html)

Rows: 81 | Columns: ['url', 'html_content']
Has html_content: True


## B. HTML → Clean Text

In [17]:
import re
from typing import List, Tuple
from bs4 import BeautifulSoup

# Ensure the primary dataset shape is correct.
# Checked against df.columns directly (rather than the has_html flag computed in
# Section A) so a stale flag from an earlier run cannot mask a missing column.
assert 'html_content' in df.columns, "Primary dataset expected: data.csv must include an 'html_content' column."

def _remove_boilerplate(soup: BeautifulSoup) -> None:
    """Strip scripts, styles and page-chrome elements from ``soup`` (mutates it)."""
    non_content = ['script','style','noscript','header','footer','nav','aside','form','svg','iframe']
    for node in soup(non_content):
        node.decompose()

def _get_title(soup: BeautifulSoup) -> str:
    """Return the stripped <title> text, else the first <h1> text, else ""."""
    title_tag = soup.title
    if title_tag and title_tag.string:
        from_title = title_tag.string.strip()
        if from_title:
            return from_title

    heading = soup.find("h1")
    if not heading:
        return ""
    # get_text returns a str, so an empty heading falls through to "".
    return heading.get_text(" ", strip=True) or ""

def _extract_main_text(soup: BeautifulSoup) -> str:
    """Return whitespace-collapsed body text.

    Boilerplate is removed from ``soup`` first. The longest non-empty
    <article>/<main>/[role=main] container wins; if there is none, the text of
    all h1, h2, h3, p and li elements is concatenated (grouped by tag name in
    that order).
    """
    _remove_boilerplate(soup)

    def _squash(raw: str) -> str:
        return re.sub(r"\s+", " ", raw).strip()

    # 1) Main content containers that actually hold text
    containers = [
        el for el in soup.select("article, main, [role=main]")
        if el and el.get_text(strip=True)
    ]
    if containers:
        longest = max(containers, key=lambda el: len(el.get_text(" ", strip=True)))
        return _squash(longest.get_text(" ", strip=True))

    # 2) No container: gather text from headings, paragraphs and list items
    pieces: List[str] = []
    for tag_name in ['h1','h2','h3','p','li']:
        pieces.extend(
            chunk
            for chunk in (node.get_text(" ", strip=True) for node in soup.select(tag_name))
            if chunk
        )
    return _squash(" ".join(pieces))

def _html_to_title_and_text(html: str) -> Tuple[str, str]:
    if not isinstance(html, str) or not html.strip():
        return "", ""
    soup = BeautifulSoup(html, "lxml")
    title = _get_title(soup)
    body_text = _extract_main_text(soup)
    return title, body_text

def _safe_sent_count(text: str) -> int:
    if not text: 
        return 0
    try:
        return len(nltk.sent_tokenize(text))
    except Exception:
        # fallback if punkt isn't available for some reason
        return max(1, text.count(".") + text.count("!") + text.count("?"))

# Turn every (url, html) row of df into one record of text plus basic metrics
records = []
for row_pos, page in df.iterrows():
    page_url = str(page.get("url", ""))
    page_html = page.get("html_content", "")
    try:
        page_title, page_body = _html_to_title_and_text(page_html)
    except Exception:
        # Graceful handling per requirements: an unparseable page becomes an empty record
        page_title, page_body = "", ""
    records.append({
        "url": page_url,
        "title": page_title,
        "body_text": page_body,
        "word_count": len(page_body.split()) if page_body else 0,
        "sentence_count": _safe_sent_count(page_body),
        "flesch_reading_ease": textstat.flesch_reading_ease(page_body) if page_body else 0.0,
    })

# Build the DataFrame (title included) with a fixed column order, then persist it
extracted_columns = ["url","title","body_text","word_count","sentence_count","flesch_reading_ease"]
extracted_df = pd.DataFrame.from_records(records, columns=extracted_columns)

extracted_out = DATA / "extracted_content.csv"
extracted_df.to_csv(extracted_out, index=False)
print(f"Saved extracted content to: {extracted_out}")

# Quick peek
extracted_df.head(5)

Saved extracted content to: C:\Users\AISHWARYA\Downloads\seo-content-detector\data\extracted_content.csv


Unnamed: 0,url,title,body_text,word_count,sentence_count,flesch_reading_ease
0,https://www.cm-alliance.com/cybersecurity-blog,Cyber Security Blog,Cybersecurity Blog,2,1,-91.295
1,https://www.varonis.com/blog/cybersecurity-tips,Top 10 Cybersecurity Awareness Tips: How to St...,Blog Privacy & Compliance Top 10 Cybersecurity...,1747,94,40.871699
2,https://www.cisecurity.org/insights/blog/11-cy...,11 Cyber Defense Tips to Stay Secure at Work a...,Home Insights Blog Posts 11 Cyber Defense Tips...,1058,73,53.262918
3,https://www.cisa.gov/topics/cybersecurity-best...,Cybersecurity Best Practices | Cybersecurity a...,Cybersecurity Best Practices CISA provides inf...,779,27,1.035698
4,https://www.qnbtrust.bank/Resources/Learning-C...,,,0,0,0.0


## C. Feature Engineering

In [18]:
import re, json, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# --- 1) Clean the extracted text (lowercase, collapse whitespace) ---
def clean_text(s: str) -> str:
    """Lowercase ``s`` and collapse whitespace runs to single spaces; missing values become ""."""
    if pd.isna(s):
        return ""
    lowered = str(s).lower()
    return re.sub(r"\s+", " ", lowered).strip()

texts_raw = extracted_df['body_text'].fillna("")
texts_clean = texts_raw.map(clean_text)

# word_count / sentence_count / readability from Section B are kept as they are.
# They were measured on the un-lowercased text, which is fine for the assignment;
# recompute them from texts_clean if you need them to match the cleaned text.

# --- 2) TF-IDF on cleaned text (will also power duplicates in Section D) ---
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2), stop_words='english')
X = tfidf.fit_transform(texts_clean)

# Vocabulary as an array; older scikit-learn only offers get_feature_names()
feature_names = np.array(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
# --- 3) Top-5 keywords per document from TF-IDF ---
def top_keywords_from_vector(row_matrix, topk=5):
    """Return up to ``topk`` terms of one sparse TF-IDF row, highest score first.

    Terms are looked up in the module-level ``feature_names`` array; a row with
    no stored entries yields an empty list.
    """
    coo = row_matrix.tocoo()
    if not coo.nnz:
        return []
    strongest = np.argsort(coo.data)[-topk:][::-1]
    return feature_names[coo.col[strongest]].tolist()

top_kw = [top_keywords_from_vector(X[i]) for i in range(X.shape[0])]

# --- 4) Embeddings column ---
# Try Sentence-Transformers (dense semantic embeddings).
# Fallback: TruncatedSVD on TF-IDF (up to 50-D) for compact numeric vectors.
use_sbert = False
emb_matrix = None
emb_note = None
# Section F picks its query encoder by testing whether `sbert_model` / `svd`
# are non-None. Reset both here so an object left over from an earlier run
# (or a model that loaded but then failed to encode) cannot be used to
# vectorize queries in a different space than the corpus embeddings.
sbert_model = None
svd = None

try:
    from sentence_transformers import SentenceTransformer
    # Small, fast, good-quality encoder
    sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
    emb_matrix = sbert_model.encode(
        texts_clean.tolist(),
        batch_size=64,
        convert_to_numpy=True,
        show_progress_bar=False,
        normalize_embeddings=True
    )
    use_sbert = True
    emb_note = "sbert_all-MiniLM-L6-v2"
except Exception as exc:
    from sklearn.decomposition import TruncatedSVD
    # The corpus will be embedded with SVD, so make sure SBERT is not reused later.
    sbert_model = None
    print(f"SBERT embeddings unavailable ({type(exc).__name__}); falling back to TF-IDF + SVD.")
    # Safe n_components given corpus size/features: never below 2, never above 50
    n_components = min(50, max(2, min(X.shape[1]-1, X.shape[0]-1)))
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    emb_matrix = svd.fit_transform(X)
    emb_note = f"tfidf_svd_{n_components}d"

# Serialize each embedding row as a compact JSON list string
def to_json_vector(vec: np.ndarray) -> str:
    """Return ``vec`` as a JSON array string with each value rounded to 6 decimals."""
    # rounding keeps the CSV file size reasonable
    rounded = np.round(vec, 6)
    return json.dumps(list(map(float, rounded)))

emb_strings = [to_json_vector(vector) for vector in emb_matrix]

# --- 5) Assemble features DataFrame ---
# Start from a copy of the extracted content (word/sentence/readability columns
# already exist there) and append the two new columns.
# (Add body_text_clean=texts_clean here if you want to inspect the cleaned text.)
features_df = extracted_df.assign(
    top_keywords=["|".join(kws) for kws in top_kw],  # join to match your example
    embedding=emb_strings,
)

# Persist exactly the required columns:
out_cols = ['url', 'word_count', 'sentence_count', 'flesch_reading_ease', 'top_keywords', 'embedding']

features_out = DATA / "features.csv"
features_df[out_cols].to_csv(features_out, index=False)

print(f"Saved features to: {features_out}")
print("Embedding type:", emb_note)
features_df[out_cols].head(5)

Saved features to: C:\Users\AISHWARYA\Downloads\seo-content-detector\data\features.csv
Embedding type: tfidf_svd_50d


Unnamed: 0,url,word_count,sentence_count,flesch_reading_ease,top_keywords,embedding
0,https://www.cm-alliance.com/cybersecurity-blog,2,1,-91.295,cybersecurity|blog,"[0.055314, 0.050039, -0.036121, -0.000758, -0...."
1,https://www.varonis.com/blog/cybersecurity-tips,1747,94,40.871699,varonis|data|access|security|app,"[0.258805, 0.390029, -0.082082, 0.026401, -0.2..."
2,https://www.cisecurity.org/insights/blog/11-cy...,1058,73,53.262918,password|passphrase|cyber defense|authenticati...,"[0.205783, 0.302723, -0.096544, 0.04456, -0.23..."
3,https://www.cisa.gov/topics/cybersecurity-best...,779,27,1.035698,cisa|cybersecurity|cyber|cybersecurity best|pr...,"[0.114343, 0.157992, -0.048894, 0.006224, -0.1..."
4,https://www.qnbtrust.bank/Resources/Learning-C...,0,0,0.0,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## D. Duplicate Detection

In [12]:
# Compute cosine similarity on TF-IDF; flag pairs above threshold
from sklearn.metrics.pairwise import cosine_similarity

S = cosine_similarity(X)
thresh = 0.85  # adjustable threshold

# Every unordered pair (i < j) whose TF-IDF cosine similarity reaches the threshold
n = S.shape[0]
pairs = [
    (features_df['url'].iloc[i], features_df['url'].iloc[j], float(S[i, j]))
    for i, j in itertools.combinations(range(n), 2)
    if float(S[i, j]) >= thresh
]

dup_df = pd.DataFrame(pairs, columns=['url1','url2','similarity'])
dup_out = DATA / "duplicates.csv"
dup_df.to_csv(dup_out, index=False)
dup_df.head(10)

Unnamed: 0,url1,url2,similarity
0,https://en.wikipedia.org/wiki/SD-WAN,https://www.fortinet.com/resources/cyberglossa...,0.876297
1,https://www.cisco.com/site/us/en/learn/topics/...,https://www.fortinet.com/resources/cyberglossa...,0.872764


In [19]:
import json, itertools, numpy as np, pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# 1) Build the representation matrix (embeddings preferred)
use_embeddings = False
emb_matrix = None

if 'embedding' in features_df.columns:
    try:
        # Each cell holds a JSON list (written in Section C); stack them into one 2-D array
        emb_matrix = np.vstack(features_df['embedding'].map(lambda s: np.array(json.loads(s), dtype=np.float32)).values)
        if emb_matrix.ndim == 2 and emb_matrix.shape[0] == len(features_df):
            use_embeddings = True
            print(f"Using embeddings for similarity (shape: {emb_matrix.shape}).")
    except Exception as e:
        print("Failed to parse embeddings from features_df['embedding']:", e)

# Fallback to TF-IDF if embeddings are missing/unusable
if not use_embeddings:
    if 'X' in globals() and X is not None:
        emb_matrix = X  # sparse CSR
        print(f"Falling back to TF-IDF matrix for similarity (shape: {emb_matrix.shape}).")
    else:
        raise RuntimeError("No embeddings/TF-IDF matrix available. Run Section C first.")

# 2) Cosine similarity matrix
# For sparse TF-IDF, cosine_similarity handles sparse input directly.
S = cosine_similarity(emb_matrix)

# 3) Threshold and pair extraction
thresh = 0.80  # as per requirement/example
pairs = []
n = S.shape[0]
# Look the URLs up once instead of building a row Series for every matching pair
page_urls = features_df['url'].tolist()

for i in range(n):
    for j in range(i+1, n):
        sim = float(S[i, j])
        # The threshold is applied to the unrounded value; rounding is for output only
        if sim >= thresh:
            pairs.append((page_urls[i], page_urls[j], round(sim, 4)))

# Sort pairs by similarity descending for readability
pairs.sort(key=lambda t: t[2], reverse=True)

# 4) Save duplicates.csv with EXACT columns: url1,url2,similarity
dup_df = pd.DataFrame(pairs, columns=['url1','url2','similarity'])
dup_out = DATA / "duplicates.csv"
dup_df.to_csv(dup_out, index=False)
print(f"Saved duplicate pairs to: {dup_out}")

# 5) Thin content detection -> is_thin column and update features.csv
THIN_WORD_COUNT = 500  # pages with fewer words than this are flagged as thin
features_df['is_thin'] = (features_df['word_count'] < THIN_WORD_COUNT).astype(int)

# Persist updated features with is_thin included (add column if missing in file)
features_out = DATA / "features.csv"
cols = ['url','word_count','sentence_count','flesch_reading_ease','top_keywords','embedding','is_thin']
existing_cols = [c for c in cols if c in features_df.columns]
features_df[existing_cols].to_csv(features_out, index=False)
print(f"Updated features (with is_thin) saved to: {features_out}")

# 6) Print summary (like the example)
total_pages = len(features_df)
duplicate_pairs = len(dup_df)
thin_pages = int(features_df['is_thin'].sum())
thin_pct = (thin_pages / total_pages * 100.0) if total_pages else 0.0

print("\nSummary:")
print(f"Total pages analyzed: {total_pages}")
print(f"Duplicate pairs: {duplicate_pairs}")
print(f"Thin content pages: {thin_pages} ({thin_pct:.0f}%)")

# Peek at the top of duplicates
dup_df.head(10)

Using embeddings for similarity (shape: (81, 50)).
Saved duplicate pairs to: C:\Users\AISHWARYA\Downloads\seo-content-detector\data\duplicates.csv
Updated features (with is_thin) saved to: C:\Users\AISHWARYA\Downloads\seo-content-detector\data\features.csv

Summary:
Total pages analyzed: 81
Duplicate pairs: 26
Thin content pages: 28 (35%)


Unnamed: 0,url1,url2,similarity
0,https://www.microsoft.com/en-us/security/busin...,https://www.zscaler.com/resources/security-ter...,0.9866
1,https://copyblogger.com/content-marketing/,https://mailchimp.com/marketing-glossary/conte...,0.9836
2,https://www.cisco.com/site/us/en/learn/topics/...,https://www.fortinet.com/resources/cyberglossa...,0.9825
3,https://en.wikipedia.org/wiki/SD-WAN,https://www.fortinet.com/resources/cyberglossa...,0.9803
4,https://guardiandigital.com/resources/blog/gui...,https://inspiredelearning.com/blog/phishing-pr...,0.9741
5,https://sign.dropbox.com/products/dropbox-fax,https://www.fax.plus/,0.9708
6,https://nytlicensing.com/latest/trends/content...,https://www.twilio.com/en-us/blog/insights/con...,0.9592
7,https://en.wikipedia.org/wiki/SD-WAN,https://www.cisco.com/site/us/en/learn/topics/...,0.9578
8,https://emotive.io/blog/11-essential-digital-m...,https://blog.hubspot.com/marketing/what-is-dig...,0.9529
9,https://www.forbes.com/advisor/business/what-i...,https://blog.hubspot.com/marketing/what-is-dig...,0.947


## E. Quality Labels & Model

In [20]:
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

# ---- 1) Create synthetic labels (non-overlapping)
def make_quality_label(wc, fre):
    """Label a page "High", "Low" or "Medium" from word count and Flesch reading ease.

    "High" needs more than 1500 words and a score within 50..70; otherwise
    fewer than 500 words or a score below 30 gives "Low"; anything else is "Medium".
    """
    long_and_readable = (wc > 1500) and (50 <= fre <= 70)
    if long_and_readable:
        return "High"
    short_or_hard = (wc < 500) or (fre < 30)
    return "Low" if short_or_hard else "Medium"

# features_df must exist from Section C; extracted_df from Section B
# (features_df already has word_count, sentence_count, flesch_reading_ease)
labeled_df = features_df.copy()
labeled_df['quality_label'] = [
    make_quality_label(wc, fre)
    for wc, fre in zip(labeled_df['word_count'], labeled_df['flesch_reading_ease'])
]

# ---- 2) Define features (you can add more; keep the core 3)
feature_cols = ['word_count','sentence_count','flesch_reading_ease']
# Deliberately NOT named `X`: that global is the TF-IDF matrix from Section C,
# which Section D and Section F fall back to for similarity.
X_quality = labeled_df[feature_cols].fillna(0).astype(float).values
y = labeled_df['quality_label'].values

# Ensure we have enough data
if len(labeled_df) < 10 or labeled_df['quality_label'].nunique() < 2:
    raise RuntimeError("Not enough data or label variety to train a classifier.")

# ---- 3) Train/test 70/30 split (stratified so each label appears in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    X_quality, y, test_size=0.30, random_state=42, stratify=y
)

# ---- 4) Baseline (rule-based) using word_count only
def baseline_predict_wordcount_only(wc):
    """Predict "Low" under 500 words, "High" over 1500 words, otherwise "Medium"."""
    if wc < 500:
        return "Low"
    elif wc > 1500:
        return "High"
    else:
        return "Medium"

# word_count is column 0 of feature_cols, hence column 0 of X_test
y_pred_baseline = [baseline_predict_wordcount_only(wc) for wc in X_test[:, 0]]

base_acc = accuracy_score(y_test, y_pred_baseline)
base_f1  = f1_score(y_test, y_pred_baseline, average='macro')

print("=== Baseline (rule-based on word_count only) ===")
print(classification_report(y_test, y_pred_baseline, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_baseline))
print(f"Baseline Accuracy: {base_acc:.3f} | Baseline Macro F1: {base_f1:.3f}")

# ---- 5) Train models
# Logistic Regression (class_weight balanced helps rare 'High').
# `multi_class` is left at its default: passing it explicitly is deprecated in
# scikit-learn 1.5+ (FutureWarning, later removal), and the default matches
# the previous 'auto' setting.
logreg = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42))
])

# Random Forest
rf = RandomForestClassifier(
    n_estimators=400, max_depth=None, min_samples_leaf=2,
    class_weight='balanced', random_state=42, n_jobs=-1
)

models = [
    ("LogisticRegression", logreg),
    ("RandomForest", rf),
]

# Track the best model by (accuracy, macro F1), compared lexicographically.
# NOTE(review): the winner is chosen on the same test split that is reported,
# so the reported score of the best model is optimistic.
best = None
best_scores = (-1, -1)  # (accuracy, macro_f1)
best_report = ""
best_cm = None
best_name = ""

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    mf1 = f1_score(y_test, y_pred, average='macro')
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Accuracy: {acc:.3f} | Macro F1: {mf1:.3f}")
    if (acc, mf1) > best_scores:
        best_scores = (acc, mf1)
        best = model
        best_name = name
        best_report = classification_report(y_test, y_pred, digits=3)
        best_cm = confusion_matrix(y_test, y_pred)

# ---- 6) Top features (2–3)
def top_features_for_model(model, feature_names, topk=3):
    """Return up to ``topk`` ``(feature_name, importance)`` pairs, most important first.

    A Pipeline is unwrapped to its 'clf' step. Logistic regression is scored by
    the mean absolute coefficient across classes, a random forest by its
    ``feature_importances_``; any other estimator yields an empty list.
    """
    clf = model.named_steps.get('clf', model) if isinstance(model, Pipeline) else model

    if isinstance(clf, LogisticRegression):
        # coef_ has shape [n_classes, n_features]; average magnitudes over classes
        imp = np.abs(clf.coef_).mean(axis=0)
    elif isinstance(clf, RandomForestClassifier):
        imp = clf.feature_importances_
    else:
        return []

    ranked = np.argsort(imp)[::-1][:topk]
    return [(feature_names[i], float(imp[i])) for i in ranked]

# Pass feature_cols directly rather than rebinding the global `feature_names`:
# that name holds the TF-IDF vocabulary that Section C's
# top_keywords_from_vector indexes into.
tops = top_features_for_model(best, feature_cols, topk=3)

print("\n=== Best Model Summary ===")
print(f"Best Model: {best_name}")
print(best_report)
print("Best model confusion matrix:\n", best_cm)
print(f"Overall Accuracy: {best_scores[0]:.3f} | Macro F1: {best_scores[1]:.3f}")
print("Top Features:")
for i, (fname, score) in enumerate(tops, 1):
    print(f"{i}. {fname} (importance: {score:.3f})")

# ---- 7) Save best model
MODELS.mkdir(parents=True, exist_ok=True)
joblib.dump(best, MODELS / "quality_model.pkl")
print("\nSaved best model to:", MODELS / "quality_model.pkl")
print("Compare vs Baseline -> Accuracy Δ: "
      f"{(best_scores[0]-base_acc):+.3f}, Macro F1 Δ: {(best_scores[1]-base_f1):+.3f}")

=== Baseline (rule-based on word_count only) ===
              precision    recall  f1-score   support

        High      0.154     1.000     0.267         2
         Low      1.000     0.429     0.600        14
      Medium      0.167     0.111     0.133         9

    accuracy                          0.360        25
   macro avg      0.440     0.513     0.333        25
weighted avg      0.632     0.360     0.405        25

Confusion matrix:
 [[2 0 0]
 [3 6 5]
 [8 0 1]]
Baseline Accuracy: 0.360 | Baseline Macro F1: 0.333

=== LogisticRegression ===
              precision    recall  f1-score   support

        High      0.286     1.000     0.444         2
         Low      1.000     0.929     0.963        14
      Medium      1.000     0.556     0.714         9

    accuracy                          0.800        25
   macro avg      0.762     0.828     0.707        25
weighted avg      0.943     0.800     0.832        25

Confusion matrix:
 [[ 2  0  0]
 [ 1 13  0]
 [ 4  0  5]]
Accura

## F. Real-time `analyze_url(url)` Demo

In [26]:
import re, json, time, requests, numpy as np, pandas as pd
from typing import List, Tuple
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

# --- polite fetch with UA + error handling ---
def _fetch_html(url: str, timeout: int = 20) -> str:
    """Download ``url`` and return the decoded body, or "" on any request failure.

    Args:
        url: Absolute http(s) URL to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The response text; an empty string if the request raised a
        ``requests`` exception (connection error, timeout, HTTP error status).
    """
    headers = {"User-Agent": "Mozilla/5.0 (SEO-Assignment/1.0; +https://example.com)"}
    try:
        r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        r.raise_for_status()
        # When the Content-Type header names no charset, requests assumes
        # ISO-8859-1 for text responses, which garbles UTF-8 pages. Use the
        # encoding detected from the body instead.
        if "charset" not in r.headers.get("Content-Type", "").lower():
            r.encoding = r.apparent_encoding
        return r.text
    except requests.exceptions.RequestException:
        return ""

# --- minimal boilerplate removal + main-content extraction (same approach as Section B) ---
def _remove_boilerplate(soup: BeautifulSoup) -> None:
    """Delete script/style/page-chrome elements from ``soup`` in place."""
    chrome_tags = ['script','style','noscript','header','footer','nav','aside','form','svg','iframe']
    for element in soup.find_all(chrome_tags):
        element.decompose()

def _get_title(soup: BeautifulSoup) -> str:
    """Return the stripped <title> string, else the first <h1> text, else ""."""
    title_text = soup.title.string.strip() if (soup.title and soup.title.string) else ""
    if title_text:
        return title_text
    first_h1 = soup.find("h1")
    # An absent or empty heading both end up as "".
    return first_h1.get_text(" ", strip=True) if first_h1 else ""

def _extract_main_text(soup: BeautifulSoup) -> str:
    """Return whitespace-collapsed main text of the page (``soup`` is stripped in place).

    Uses the longest non-empty article/main/[role=main] container; without one,
    joins the text of all h1, h2, h3, p and li elements (grouped in that tag order).
    """
    _remove_boilerplate(soup)

    # Preferred: a dedicated main-content container
    containers = []
    for candidate in soup.select("article, main, [role=main]"):
        if candidate and candidate.get_text(strip=True):
            containers.append(candidate)
    if containers:
        winner = max(containers, key=lambda el: len(el.get_text(" ", strip=True)))
        return re.sub(r"\s+", " ", winner.get_text(" ", strip=True)).strip()

    # Otherwise: text of headings, paragraphs and list items
    fragments: List[str] = [
        fragment
        for tag_name in ['h1','h2','h3','p','li']
        for fragment in (node.get_text(" ", strip=True) for node in soup.select(tag_name))
        if fragment
    ]
    joined = " ".join(fragments)
    return re.sub(r"\s+", " ", joined).strip()

def _html_to_text(html: str) -> Tuple[str, str]:
    if not isinstance(html, str) or not html.strip():
        return "", ""
    soup = BeautifulSoup(html, "lxml")
    title = _get_title(soup)
    body  = _extract_main_text(soup)
    return title, body

def _safe_sent_count(text: str) -> int:
    if not text: return 0
    try:
        return len(nltk.sent_tokenize(text))
    except Exception:
        return max(1, text.count(".") + text.count("!") + text.count("?"))

# --- quality labeling: try trained model; fallback to rule-based ---
def _rule_quality_label(wc: int, fre: float) -> str:
    if (wc > 1500) and (50 <= fre <= 70):
        return "High"
    if (wc < 500) or (fre < 30):
        return "Low"
    return "Medium"

def _predict_quality(word_count: int, sentence_count: int, flesch: float):
    """Predict the quality label for one page.

    Uses the in-memory ``best`` model from Section E when present, otherwise the
    pickled model under ``MODELS``; if no model is usable or prediction fails,
    falls back to ``_rule_quality_label``.
    """
    sample = np.array([[float(word_count), float(sentence_count), float(flesch)]])

    # Prefer the model still held in the kernel; None means "go look on disk"
    model = best if ('best' in globals() and best is not None) else None
    if model is None:
        try:
            import joblib
            model = joblib.load(MODELS / "quality_model.pkl")
        except Exception:
            model = None

    if model is not None:
        try:
            return model.predict(sample)[0]
        except Exception:
            # prediction failed: use the rule-based label below
            pass
    return _rule_quality_label(word_count, flesch)

# --- vectorization for similarity ---
# Order of preference: SBERT embedding, then SVD-reduced TF-IDF, then raw TF-IDF
def _vectorize_for_similarity(text: str):
    """Vectorize ``text`` and return ``(vector, kind)`` with kind in {'sbert', 'svd', 'tfidf'}.

    The text is lowercased and whitespace-collapsed first. An encoder is only
    tried if it exists as a non-None global; a failing encoder is skipped.
    """
    normalized = re.sub(r"\s+", " ", text.lower()).strip()
    module_vars = globals()

    # 1) Sentence-Transformers model, if one was loaded
    encoder = module_vars.get('sbert_model')
    if encoder is not None:
        try:
            return encoder.encode([normalized], convert_to_numpy=True, normalize_embeddings=True), 'sbert'
        except Exception:
            pass

    # 2) SVD projection of the TF-IDF row (needs `tfidf` from Section C)
    reducer = module_vars.get('svd')
    if reducer is not None:
        try:
            return reducer.transform(tfidf.transform([normalized])), 'svd'
        except Exception:
            pass

    # 3) Last resort: the sparse TF-IDF row itself
    return tfidf.transform([normalized]), 'tfidf'

def _corpus_matrix_and_urls():
    """Return ``(matrix, urls, kind)`` for the corpus.

    Parses the JSON strings in ``features_df['embedding']`` into one array
    (kind 'embeddings'); if that column is absent or cannot be parsed, returns
    the global ``X`` with kind 'tfidf'.
    """
    if 'embedding' in features_df.columns:
        try:
            vectors = [np.array(json.loads(raw), dtype=np.float32) for raw in features_df['embedding']]
            return np.vstack(vectors), features_df['url'].tolist(), 'embeddings'
        except Exception:
            # unparseable embeddings: use the TF-IDF matrix instead
            pass
    return X, features_df['url'].tolist(), 'tfidf'

def analyze_url(url: str, top_k: int = 3, dup_threshold: float = 0.80):
    """Fetch ``url`` and report its content metrics, quality label and near-duplicates.

    Args:
        url: Page to analyze.
        top_k: Maximum number of corpus pages listed in ``similar_to``.
        dup_threshold: Minimum cosine similarity for a corpus page to be listed.

    Returns:
        A dict with keys url, title, word_count, readability, quality_label,
        is_thin and similar_to. If nothing reaches ``dup_threshold`` the single
        most similar corpus page is listed for context. If the page could not
        be fetched the dict is ``{"url", "error"}``. If the query vector and
        the corpus matrix have different widths, ``similar_to`` is empty and a
        ``warning`` key explains why.
    """
    html = _fetch_html(url)
    if not html:
        return {"url": url, "error": "Failed to fetch the page."}

    title, body = _html_to_text(html)
    wc = len(body.split()) if body else 0
    sc = _safe_sent_count(body)
    fre = textstat.flesch_reading_ease(body) if body else 0.0

    # The model may hand back a numpy string; normalize to a plain str for the result dict
    quality = str(_predict_quality(wc, sc, fre))
    is_thin = wc < 500

    # Vectorize this page and compare to corpus
    q_vec, q_kind = _vectorize_for_similarity(body)
    corpus_mat, corpus_urls, corpus_kind = _corpus_matrix_and_urls()

    similar_list = []
    warning = None
    if q_vec.shape[1] != corpus_mat.shape[1]:
        # Query and corpus were vectorized in different spaces; cosine_similarity
        # would raise on the width mismatch, so report it instead.
        warning = (
            f"Similarity skipped: query vector ({q_kind}, {q_vec.shape[1]} dims) does not match "
            f"corpus matrix ({corpus_kind}, {corpus_mat.shape[1]} dims)."
        )
    else:
        sims = cosine_similarity(q_vec, corpus_mat)[0]
        order = np.argsort(sims)[::-1]  # most similar first

        # collect matches >= threshold
        for idx in order:
            sim = float(sims[idx])
            if sim >= dup_threshold:
                similar_list.append({"url": corpus_urls[idx], "similarity": round(sim, 4)})
                if len(similar_list) >= top_k:
                    break

        # if none exceed threshold, still provide the single most similar for context
        if not similar_list and len(order) > 0:
            best_idx = int(order[0])
            similar_list = [{"url": corpus_urls[best_idx], "similarity": round(float(sims[best_idx]), 4)}]

    result = {
        "url": url,
        "title": title,
        "word_count": wc,
        "readability": round(float(fre), 3),
        "quality_label": quality,
        "is_thin": bool(is_thin),
        "similar_to": similar_list
    }
    if warning is not None:
        result["warning"] = warning
    return result

In [27]:
# === Interactive prompt  ===
import re, json

def _normalize_url(u: str) -> str:
    u = (u or "").strip()
    if not u:
        return u
    if not re.match(r"^https?://", u, flags=re.I):
        u = "https://" + u
    return u

# Ask for a URL, add a scheme if needed, then analyze it and pretty-print the result
user_url = _normalize_url(input("Enter a URL to analyze: ").strip())

if user_url:
    result = analyze_url(user_url)
    print(json.dumps(result, indent=2, ensure_ascii=False))
else:
    print("No URL provided.")

Enter a URL to analyze: https://thehackernews.com/2025/10/openai-unveils-aardvark-gpt-5-agent.html
{
  "url": "https://thehackernews.com/2025/10/openai-unveils-aardvark-gpt-5-agent.html",
  "title": "OpenAI Unveils Aardvark: GPT-5 Agent That Finds and Fixes Code Flaws Automatically",
  "word_count": 721,
  "readability": 18.367,
  "quality_label": "Low",
  "is_thin": false,
  "similar_to": [
    {
      "url": "https://www.fortinet.com/resources/cyberglossary/what-is-network-security",
      "similarity": 0.6572
    }
  ]
}


## G. Save Outputs (CSV + model)

In [23]:
# Ensure outputs are saved (already written earlier)
extracted_df.to_csv(DATA / "extracted_content.csv", index=False)

# Write every feature column produced so far. Listing only the first five
# would overwrite features.csv without the `embedding` column (Section C) and
# the `is_thin` column (Section D).
feature_cols_to_save = [
    c for c in ['url','word_count','sentence_count','flesch_reading_ease','top_keywords','embedding','is_thin']
    if c in features_df.columns
]
features_df[feature_cols_to_save].to_csv(DATA / "features.csv", index=False)
dup_df.to_csv(DATA / "duplicates.csv", index=False)

print("Saved:")
print(" -", DATA / "extracted_content.csv")
print(" -", DATA / "features.csv")
print(" -", DATA / "duplicates.csv")
print("Model (if trained):", MODELS / "quality_model.pkl", (MODELS / "quality_model.pkl").exists())

Saved:
 - C:\Users\AISHWARYA\Downloads\seo-content-detector\data\extracted_content.csv
 - C:\Users\AISHWARYA\Downloads\seo-content-detector\data\features.csv
 - C:\Users\AISHWARYA\Downloads\seo-content-detector\data\duplicates.csv
Model (if trained): C:\Users\AISHWARYA\Downloads\seo-content-detector\models\quality_model.pkl True
