# CAD-RAG: Content Analysis Detection using RAG
**Full Standalone Implementation**

This notebook contains the complete pipeline for the CAD-RAG system, including:
1.  **Environment Setup**: Loading API keys and libraries.
2.  **ML Model Training/Loading**: Logistic Regression with TF-IDF for hate speech detection (persisted via `joblib`).
3.  **Knowledge Base**: Evolving Slur Lexicon and Reclaimed Speech Corpus using ChromaDB.
4.  **RAG System**: Integration with OpenRouter (Hermes 3) and GoogleNews for contextual analysis.
5.  **Interactive Analysis**: Real-time hate speech detection.

## 1. Setup Environment

In [None]:
import os
import re
import warnings
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import spacy
import chromadb
import joblib

# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# The current working directory stands in for the script directory in a notebook.
SCRIPT_DIR = os.getcwd()
env_path = os.path.join(SCRIPT_DIR, '.env')

# Pull credentials from a local .env file when one is present.
if not os.path.exists(env_path):
    print("No .env file found")
else:
    load_dotenv(env_path)
    print("Loaded API keys from .env file")

# Unset variables fall back to '' so the availability flags below become False.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')
NEO4J_URI = os.environ.get('NEO4J_URI', '')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME', '')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', '')

# Availability flags: Neo4j counts as configured only when all three values are set.
has_openrouter_api = OPENROUTER_API_KEY != ''
has_neo4j = all([NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD])

print(f"OpenRouter API: {'[OK]' if has_openrouter_api else '[MISSING]'}")
print(f"Neo4j: {'[OK]' if has_neo4j else '[MISSING]'}")

## 2. Initialize Vector Store (ChromaDB)

In [None]:
# ChromaDB persistent client rooted at a directory under SCRIPT_DIR.
CHROMA_PATH = os.path.join(SCRIPT_DIR, "cad_rag_chroma")
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

# Make sure both collections exist before any wrapper is built around them.
slur_lexicon_collection = chroma_client.get_or_create_collection(name="slur_lexicon")
reclaimed_speech_collection = chroma_client.get_or_create_collection(name="reclaimed_speech")

# Pre-bind the wrapper names so later cells can compare them against None
# instead of hitting a NameError when the optional LangChain stack fails below.
embedding_function = None
slur_lexicon_db = None
reclaimed_speech_db = None

try:
    from langchain_community.vectorstores import Chroma
    from langchain_huggingface import HuggingFaceEmbeddings

    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Both wrappers share the client above and address the collections by name.
    slur_lexicon_db = Chroma(client=chroma_client, collection_name="slur_lexicon", embedding_function=embedding_function)
    reclaimed_speech_db = Chroma(client=chroma_client, collection_name="reclaimed_speech", embedding_function=embedding_function)
    print("Vector Stores Initialized.")
except Exception as e:
    # Best effort: the notebook keeps running without semantic lookup;
    # any wrapper that was not created stays None.
    print(f"Vector Store Error: {e}")

## 3. Train/Load ML Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# Artifact and training-data locations, all resolved under SCRIPT_DIR.
MODEL_PATH = os.path.join(SCRIPT_DIR, 'cad_rag_model.pkl')
VECTORIZER_PATH = os.path.join(SCRIPT_DIR, 'cad_rag_vectorizer.pkl')
TRAIN_FILE = os.path.join(SCRIPT_DIR, 'train.csv')

# Label columns looked up in the training file; only those present are trained on.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Both stay None when no saved model and no usable training file are available.
tfidf_vectorizer = None
logistic_regression_model = None

if os.path.exists(MODEL_PATH) and os.path.exists(VECTORIZER_PATH):
    print("Loading saved model...")
    # CAUTION: joblib.load unpickles arbitrary objects. Only load artifacts that
    # this notebook wrote itself; never point these paths at untrusted files.
    logistic_regression_model = joblib.load(MODEL_PATH)
    tfidf_vectorizer = joblib.load(VECTORIZER_PATH)
    print("Model Loaded!")
elif os.path.exists(TRAIN_FILE):
    print("Training model (this may take a minute)...")
    df = pd.read_csv(TRAIN_FILE)

    # Preprocessing: accept either supported text column, and only the labels present.
    text_col = next((c for c in ('comment_text', 'tweet') if c in df.columns), None)
    existing_labels = [c for c in label_columns if c in df.columns]

    if text_col is None:
        print("Training file has no 'comment_text' or 'tweet' column; model not trained.")
    elif not existing_labels:
        print("Training file has none of the expected label columns; model not trained.")
    else:
        # Drop unlabeled rows BEFORE binarising: NaN > 0 evaluates to False, so
        # binarising first would silently turn missing labels into negatives
        # and leave dropna with nothing to remove.
        df = df.dropna(subset=existing_labels).copy()
        for label in existing_labels:
            df[label] = (df[label] > 0).astype(int)

        # Same normalisation analyze_sentence applies at inference time:
        # lowercase, then keep only a-z, 0-9 and whitespace.
        df[text_col] = df[text_col].fillna('').astype(str).str.lower().apply(lambda x: re.sub(r'[^a-z0-9\s]', '', x).strip())

        tfidf_vectorizer = TfidfVectorizer(max_features=10000)
        X_tfidf = tfidf_vectorizer.fit_transform(df[text_col])
        y = df[existing_labels]

        # random_state pins the shuffling done by the 'sag' solver so retraining is repeatable.
        logistic_regression_model = MultiOutputClassifier(
            LogisticRegression(solver='sag', class_weight='balanced', max_iter=1000, random_state=42)
        )
        logistic_regression_model.fit(X_tfidf, y)

        # NOTE(review): analyze_sentence maps output position i to label_columns[i].
        # If existing_labels is a strict subset of label_columns the reported
        # label names will be misaligned -- confirm the training file has all six.
        joblib.dump(logistic_regression_model, MODEL_PATH)
        joblib.dump(tfidf_vectorizer, VECTORIZER_PATH)
        print("Model Trained and Saved!")
else:
    print("Training File Not Found!")

## 4. News Context (Knowledge Graph Disabled)

In [None]:
from GoogleNews import GoogleNews

# Search settings for the news context, hoisted so they can be tuned in one place.
NEWS_QUERY = 'immigration policy'
NEWS_PERIOD = '7d'
NEWS_MAX_ARTICLES = 5

# "title: description" strings for the fetched articles; stays empty if the fetch fails.
recent_news_cache = []
try:
    googlenews = GoogleNews(lang='en', period=NEWS_PERIOD)
    googlenews.search(NEWS_QUERY)
    res = googlenews.result() or []
    # .get() keeps one article with a missing field from discarding the whole
    # batch through the broad except below.
    recent_news_cache = [
        f"{a.get('title', '')}: {a.get('desc', '')}" for a in res[:NEWS_MAX_ARTICLES]
    ]
    print(f"Fetched {len(recent_news_cache)} news articles.")
except Exception as e:
    # Best effort: analysis still works without news context.
    print(f"News fetch failed: {e}")

# Neo4j integration is intentionally disabled in this notebook; the driver
# name is kept and bound to None.
neo4j_driver = None

## 5. RAG & LLM Setup

In [None]:
from openai import OpenAI

# `llm` stays None when no OpenRouter key is configured; callers check it before use.
llm = None
# spaCy pipeline used by analyze_sentence for entities and out-of-vocabulary tokens.
nlp = spacy.load("en_core_web_lg")

if has_openrouter_api:
    client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=OPENROUTER_API_KEY)

    def call_llm(prompt):
        """Send `prompt` as a single user message to Hermes 3 via OpenRouter.

        Args:
            prompt: Text sent as the content of one user-role chat message.

        Returns:
            The content of the first completion choice. If the request raises,
            a string starting with "LLM Error: " followed by the first 100
            characters of the exception text is returned instead of raising.
        """
        try:
            print("    (Calling Hermes 3)...", end="\r")
            resp = client.chat.completions.create(
                model="nousresearch/hermes-3-llama-3.1-405b:free",
                messages=[{"role": "user", "content": prompt}],
                # HTTP-Referer / X-Title are HTTP headers, so they must go through
                # extra_headers; extra_body would put them in the JSON payload.
                extra_headers={"HTTP-Referer": "http://localhost:3000", "X-Title": "CAD-RAG Local"}
            )
            return resp.choices[0].message.content
        except Exception as e:
            # Deliberate best effort: report the failure as text so the caller keeps running.
            return f"LLM Error: {str(e)[:100]}..."
    llm = call_llm
    print("LLM Interface Ready.")
else:
    print("OpenRouter Key Missing!")

## 6. Analysis Functions

In [None]:
def analyze_sentence(sentence):
    """Run the full pipeline on one sentence and print each intermediate result.

    Steps: spaCy entity / out-of-vocabulary extraction, TF-IDF + logistic
    regression prediction (when a model is loaded), retrieval of matching news
    snippets and lexicon entries, and finally an LLM rationale (when `llm` is
    configured).

    Args:
        sentence: Raw text to analyse.

    Returns:
        The LLM rationale string when `llm` is set; otherwise the ML verdict,
        one of "HATEFUL", "NOT_HATEFUL" or "N/A" (no model available).
    """
    print(f"\nAnalyzing: '{sentence}'")

    # 1. Pre-Retrieval
    doc = nlp(sentence)
    entities = [ent.text for ent in doc.ents]
    neologisms = [t.text for t in doc if t.is_oov]

    # 2. ML Prediction
    ml_res = "N/A"
    # Explicit None checks: both artifacts are needed, and estimator truthiness
    # is not a reliable "is loaded" signal.
    if logistic_regression_model is not None and tfidf_vectorizer is not None:
        # Must mirror the normalisation applied to the training text.
        cleaned = re.sub(r'[^a-z0-9\s]', '', sentence.lower()).strip()
        vec = tfidf_vectorizer.transform([cleaned])
        probs = logistic_regression_model.predict_proba(vec)
        # predict_proba yields one array per output; keep the positive-class
        # column of each and take the single input row.
        probs_pos = np.array([p[:, 1] for p in probs]).T[0]
        labels = [label_columns[i] for i, p in enumerate(probs_pos) if p >= 0.5]
        if labels:
            ml_res = "HATEFUL"
            print(f"  -> ML Prediction: HATEFUL {labels}")
        else:
            ml_res = "NOT_HATEFUL"
            print(f"  -> ML Prediction: NOT_HATEFUL")

    # 3. RAG Retrieval
    hits = []
    # News: first cached headline that mentions each entity (case-insensitive).
    if recent_news_cache:
        for entity in entities:
            matches = [x for x in recent_news_cache if entity.lower() in x.lower()]
            if matches:
                hits.append(f"News({entity}): {matches[0]}")
    # Lexicon: the vector store is optional, so look it up defensively instead of
    # assuming the name was bound by the initialisation cell.
    lexicon_db = globals().get("slur_lexicon_db")
    if lexicon_db is not None:
        for term in neologisms:
            try:
                found = lexicon_db.similarity_search(term, k=1)
            except Exception as e:
                # A broken or empty store should not abort the whole analysis.
                print(f"  -> Lexicon lookup failed: {e}")
                break
            if found:
                hits.append(f"Lexicon({term}): {found[0].page_content}")
    context = " | ".join(hits)
    if context:
        print(f"  -> Context: {context[:100]}...")

    # 4. LLM Analysis
    if llm:
        prompt = f"Analyze this sentence for hate speech. Sentence: '{sentence}' Context: {context or 'None'}. Classify and explain."
        rationale = llm(prompt)
        print(f"  -> LLM Rationale: {rationale}")
        return rationale
    return ml_res

## 7. Interactive Test Loop

In [None]:
# Run this cell to test multiple sentences!
# Type 'exit' or 'quit' (any case) to stop; blank lines are ignored.
while True:
    try:
        txt = input("Enter sentence (or 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the user interrupted: leave the loop without a traceback.
        print()
        break
    if txt.lower() in ['exit', 'quit']:
        break
    if not txt:
        # Nothing to analyse; avoid a pointless model / LLM round trip.
        continue
    analyze_sentence(txt)