In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
torch.cuda.empty_cache()

In [43]:
import torch
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from pymongo import MongoClient
import numpy as np
from typing import List, Dict, Any
import re

In [None]:
class Config:
    MONGODB_URI = "mongodb://localhost:27017/"
    DB_NAME = "fake_news_db"
    COLLECTION_NAME = "news_embeddings"
    EMBEDDING_MODEL = "distilbert-base-uncased"  # Lightweight model for embeddings
    LLM_MODEL = "distilbert-base-uncased"  # The main Gemma model
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    MAX_LENGTH = 2048  # Maximum token length for the model
    # File paths
    FAKE_NEWS_FILE = "fake.csv"
    TRUE_NEWS_FILE = "true.csv"
    FINE_FAKE_FILE = "FineFake.csv"  

config = Config()
print(f"Using device: {config.DEVICE}")

Using device: cuda


In [4]:
def load_training_data():
    """
    Load and combine fake and true news datasets for training
    """
    # Load fake news
    fake_df = pd.read_csv(config.FAKE_NEWS_FILE, sep='\t')
    fake_df['label'] = 1  # 1 for fake news
    
    # Load true news
    true_df = pd.read_csv(config.TRUE_NEWS_FILE)
    true_df['label'] = 0  # 0 for true news
    
    # Combine datasets
    combined_df = pd.concat([fake_df, true_df], ignore_index=True)
    combined_df = combined_df.dropna(subset=['text'])
    
    # Create content field combining title and text for better context
    combined_df['content'] = combined_df['title'].fillna('') + " " + combined_df['text'].fillna('')
    
    print(f"Loaded {len(combined_df)} news articles ({len(fake_df)} fake, {len(true_df)} true)")
    return combined_df

def load_fine_grained_data():
    """
    Load the detailed metadata file for MongoDB storage
    This file has rich features like knowledge embeddings, entities, etc.
    """
    try:
        # Try to load the file - first checking if it's JSON
        try:
            df = pd.read_json(config.FINE_FAKE_FILE, lines=True)
        except:
            # If not JSON, try CSV with tab separator
            df = pd.read_csv(config.FINE_FAKE_FILE, sep='\t')
        
        # Process the knowledge embeddings if they exist
        if 'knowledge_embedding' in df.columns:
            # Convert string representation of embeddings to actual arrays
            df['knowledge_embedding'] = df['knowledge_embedding'].apply(
                lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x
            )
        
        print(f"Loaded {len(df)} detailed news articles for database")
        return df
    except Exception as e:
        print(f"Error loading fine-grained data: {e}")
        print("Will continue without detailed metadata.")
        return None

In [5]:
def preprocess_text(text):
    """Clean and preprocess text"""
    if not isinstance(text, str):
        return ""
        
    # Remove extra quotes if present
    if text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
        
    # Basic preprocessing
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.strip()
    return text

In [18]:
from huggingface_hub import login
token = "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"
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def create_qa_pipeline(model):
    """Create a text generation pipeline using just the model"""
    pipe = pipeline(
        "text-generation",
        model=model,
        max_length=config.MAX_LENGTH,
        device_map="auto"
    )
    return pipe

def setup_mongodb():
    """Connect to MongoDB and set up vector collections"""
    client = MongoClient(config.MONGODB_URI)
    db = client[config.DB_NAME]
    collection = db[config.COLLECTION_NAME]
    
    # Create vector search index if it doesn't exist
    try:
        if "vector_index" not in collection.index_information():
            # For MongoDB Atlas or versions supporting vector search
            collection.create_index([("embedding", "vector")])
            print("Created vector search index")
    except Exception as e:
        print(f"Warning: Vector index creation failed: {e}")
        print("This may be expected if not using MongoDB Atlas or a version with vector search support")
        print("The system will still store embeddings but vector search capabilities may be limited")
    
    return collection

In [21]:
try:
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

    # Initialize the classification pipeline
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    print("Models and MongoDB collection initialized!")
except Exception as e:
    print(f"Error during initialization: {e}")
    print("Try running with minimal resources or in separate steps.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Models and MongoDB collection initialized!


In [8]:
from pymongo import MongoClient
from pymongo.errors import AutoReconnect, NetworkTimeout, ConnectionFailure
import time
import numpy as np
import pandas as pd
import re
import ast
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def mongodb_operation(collection, operation, *args, **kwargs):
    """Wrapper for MongoDB operations with retry logic"""
    try:
        if operation == "update_one":
            return collection.update_one(*args, **kwargs)
        elif operation == "find_one":
            return collection.find_one(*args, **kwargs)
        # Add other operations as needed
    except (AutoReconnect, NetworkTimeout, ConnectionFailure) as e:
        print(f"MongoDB connection error: {e}. Retrying...")
        # Recreate client connection before retry
        raise  # This will trigger the retry

In [9]:
def generate_and_store_embeddings(df, embedding_model, mongodb_uri, db_name, collection_name, batch_size=50):
    """Generate embeddings for news articles and store in MongoDB with robust error handling"""
    print(f"Generating embeddings for {len(df)} documents...")
    
    # Process in smaller batches to avoid memory issues and timeout risks
    successful_count = 0
    error_count = 0
    
    for i in range(0, len(df), batch_size):
        batch_df = df.iloc[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(df) + batch_size - 1)//batch_size}")
        
        # Create a fresh connection for each batch
        try:
            client = MongoClient(mongodb_uri, serverSelectionTimeoutMS=30000)
            db = client[db_name]
            collection = db[collection_name]
            
            # Test connection before proceeding
            client.admin.command('ping')
            print("Connected to MongoDB successfully")
        except Exception as e:
            print(f"Failed to connect to MongoDB: {e}")
            time.sleep(10)  # Wait before trying to reconnect
            continue
            
        for _, row in batch_df.iterrows():
            try:
                # Create base document with universally expected fields
                doc = {
                    "id": str(row.name),
                    "content_type": "news_article"
                }
                
                # Add all available fields from the dataframe
                for col in row.index:
                    if col != 'content' and col != 'knowledge_embedding' and not pd.isna(row[col]):
                        # Skip content (we'll process it separately) and knowledge_embedding
                        doc[col] = row[col]
                
                # Prepare text for embedding
                if 'content' in row and not pd.isna(row['content']):
                    content = row['content']
                elif 'text' in row and not pd.isna(row['text']):
                    if 'title' in row and not pd.isna(row['title']):
                        content = f"{row['title']} {row['text']}"
                    else:
                        content = row['text']
                else:
                    # Skip this document if no content to embed
                    print(f"Skipping row {row.name}: No content to embed")
                    continue
                    
                # Process the text
                processed_text = preprocess_text(content)
                
                # Store the original and processed content
                doc['original_content'] = content
                doc['processed_content'] = processed_text
                
                # Use pre-computed knowledge embedding if available
                if 'knowledge_embedding' in row and not pd.isna(row['knowledge_embedding']):
                    if isinstance(row['knowledge_embedding'], np.ndarray):
                        doc['embedding'] = row['knowledge_embedding'].tolist()
                    else:
                        # Try to parse the embedding if it's a string
                        try:
                            doc['embedding'] = ast.literal_eval(row['knowledge_embedding'])
                        except:
                            # If parsing fails, generate a new embedding
                            doc['embedding'] = embedding_model.encode(processed_text).tolist()
                else:
                    # Generate new embedding
                    doc['embedding'] = embedding_model.encode(processed_text).tolist()
                
                # Store in MongoDB with retry logic
                mongodb_operation(collection, "update_one", {"id": doc["id"]}, {"$set": doc}, upsert=True)
                successful_count += 1
                
                # Periodically log progress
                if successful_count % 10 == 0:
                    print(f"Successfully processed {successful_count} documents")
                    
            except Exception as e:
                error_count += 1
                print(f"Error processing row {row.name}: {e}")
                continue
        
        # Close connection after each batch
        client.close()
        print(f"Completed batch {i//batch_size + 1}")
        
        # Brief pause between batches to avoid overloading the connection
        time.sleep(2)
    
    print(f"Embedding generation complete! Successfully processed {successful_count} documents. Errors: {error_count}")

In [10]:
# Data Loading and Splitting Functions
def load_and_split_data():
    """
    Load and split data from all sources with an 80-20 train-test split
    """
    data_sources = []
    
    # Load fake news
    try:
        fake_df = pd.read_csv(config.FAKE_NEWS_FILE, sep='\t')
        fake_df['label'] = 1  # 1 for fake news
        fake_df['source'] = 'fake.csv'
        data_sources.append(fake_df)
        print(f"Loaded {len(fake_df)} articles from fake.csv")
    except Exception as e:
        print(f"Error loading fake.csv: {e}")
    
    # Load true news
    try:
        true_df = pd.read_csv(config.TRUE_NEWS_FILE)
        true_df['label'] = 0  # 0 for true news
        true_df['source'] = 'true.csv'
        data_sources.append(true_df)
        print(f"Loaded {len(true_df)} articles from true.csv")
    except Exception as e:
        print(f"Error loading true.csv: {e}")
    
    # Load fine-grained data
    try:
        # Try to load the file - first checking if it's JSON
        try:
            fine_df = pd.read_json(config.FINE_FAKE_FILE, lines=True)
        except:
            # If not JSON, try CSV with tab separator
            fine_df = pd.read_csv(config.FINE_FAKE_FILE, sep='\t')
        
        # Process the knowledge embeddings if they exist
        if 'knowledge_embedding' in fine_df.columns:
            fine_df['knowledge_embedding'] = fine_df['knowledge_embedding'].apply(
                lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x
            )
        
        # Ensure label exists
        if 'label' not in fine_df.columns:
            fine_df['label'] = 1  # Assume fake if not specified
            
        fine_df['source'] = 'finefake'
        data_sources.append(fine_df)
        print(f"Loaded {len(fine_df)} articles from finefake")
    except Exception as e:
        print(f"Error loading finefake: {e}")
    
    # Combine all data sources
    all_data = pd.concat(data_sources, ignore_index=True)
    all_data = all_data.dropna(subset=['text'])
    
    # Create content field combining title and text for better context
    all_data['content'] = all_data['title'].fillna('') + " " + all_data['text'].fillna('')
    
    # Shuffle the data
    all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Split into train (80%) and test (20%) sets
    split_idx = int(len(all_data) * 0.8)
    train_df = all_data.iloc[:split_idx]
    test_df = all_data.iloc[split_idx:]
    
    print(f"Split data into {len(train_df)} training and {len(test_df)} test samples")
    
    return train_df, test_df

In [40]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def retrieve_similar_documents(query: str, k: int = 5) -> list[str]:
    model = SentenceTransformer(config.EMBEDDING_MODEL)
    query_embedding = model.encode(query)

    client = MongoClient(config.MONGODB_URI)
    collection = client[config.DB_NAME][config.COLLECTION_NAME]

    docs = collection.find()
    results = []
    for doc in docs:
        doc_embedding = np.array(doc['embedding'])
        score = cosine_similarity(query_embedding, doc_embedding)
        results.append((score, doc['content']))

    results.sort(reverse=True)
    return [content for _, content in results[:k]]

In [42]:
def classify_claim(claim: str) -> str:
    evidence = retrieve_similar_documents(claim)
    context = "\n".join(evidence)
    
    prompt = f"""You are a fact-checking assistant. Based on the following evidence, determine whether the claim is REAL or FAKE.

Evidence:
{context}

Claim: {claim}

Label (REAL or FAKE):"""

    tokenizer = AutoTokenizer.from_pretrained(config.LLM_MODEL)
    model = AutoModelForCausalLM.from_pretrained(config.LLM_MODEL).to(config.DEVICE)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=config.MAX_LENGTH).to(config.DEVICE)
    outputs = model.generate(**inputs, max_new_tokens=10)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return prediction.split(":")[-1].strip()


In [11]:
def setup_mongodb_collection(collection_name):
    """Create a MongoDB collection with proper indexes"""
    client = MongoClient(config.MONGODB_URI)
    db = client[config.DB_NAME]
    collection = db[collection_name]
    
    # Create vector search index if it doesn't exist
    try:
        if "vector_index" not in collection.index_information():
            collection.create_index([("embedding", "vector")])
    except Exception as e:
        print(f"Warning: Vector index creation failed: {e}")
    
    return collection

In [33]:
import pickle
import os
import pandas as pd
from pymongo import MongoClient

def process_and_save_data(embedding_model):
    """Process data and save to disk or load from cache"""
    # File paths for saved data
    train_path = "processed_data/train_embeddings.pkl"
    test_path = "processed_data/test_embeddings.pkl"

    os.makedirs("processed_data", exist_ok=True)

    # Load from cache if available
    if os.path.exists(train_path) and os.path.exists(test_path):
        print("Loading processed data from disk...")
        with open(train_path, 'rb') as f:
            train_data = pickle.load(f)
        with open(test_path, 'rb') as f:
            test_data = pickle.load(f)
    else:
        print("Processing data from scratch...")
        train_df, test_df = load_and_split_data()

        train_data = generate_docs_with_embeddings(train_df, embedding_model, tokenizer)
        test_data = generate_docs_with_embeddings(test_df, embedding_model, tokenizer)

        with open(train_path, 'wb') as f:
            pickle.dump(train_data, f)
        with open(test_path, 'wb') as f:
            pickle.dump(test_data, f)

    # Store in MongoDB
    store_in_mongodb(train_data, config.MONGODB_URI, config.DB_NAME, config.COLLECTION_NAME)
    store_in_mongodb(test_data, config.MONGODB_URI, config.DB_NAME, config.COLLECTION_NAME + "_test")

    return len(train_data), len(test_data)


def generate_docs_with_embeddings(df, embedding_model, tokenizer, batch_size=16):
    """Create embedded documents from a dataframe using DistilBERT for embeddings"""
    docs = []
    batch_texts = []
    batch_idx = []

    for idx, row in df.iterrows():
        if idx % 100 == 0:  # Print progress every 100 samples
            print(f"Processing document {idx}...")
        # Compose content text
        title = str(row.get('title', '') or '')
        text = str(row.get('text', '') or '')
        content = f"{title.strip()} {text.strip()}".strip()
        if not content:
            continue

        processed_text = preprocess_text(content)

        processed_text = preprocess_text(content)
        batch_texts.append(processed_text)
        batch_idx.append(idx)

        if len(batch_texts) >= batch_size:
            # Process the batch
            inputs = tokenizer(batch_texts, truncation=True, padding=True, return_tensors="pt").to(embedding_model.device)
            with torch.no_grad():
                outputs = embedding_model(**inputs, output_hidden_states=True)
                hidden_states = outputs.hidden_states[-1]
                embeddings = hidden_states.mean(dim=1).squeeze().cpu().numpy()

            for i, doc_embedding in zip(batch_idx, embeddings):
                doc = {
                    col: row[col]
                    for col in df.columns
                    if col not in ['content', 'knowledge_embedding'] and not pd.isna(row[col])
                }
                doc['id'] = str(i)
                doc['original_content'] = content
                doc['processed_content'] = processed_text
                doc['embedding'] = doc_embedding.tolist()

                # Add label handling
                if 'label' in doc:
                    try:
                        doc['label'] = int(doc['label'])
                        if doc['label'] not in [0, 1]:
                            continue
                    except:
                        continue

                docs.append(doc)

            # Clear the batch
            batch_texts = []
            batch_idx = []

    return docs


def store_in_mongodb(data, mongodb_uri, db_name, collection_name):
    """Store pre-processed data in MongoDB"""
    client = MongoClient(mongodb_uri)
    db = client[db_name]
    collection = db[collection_name]

    if collection.count_documents({}) == 0:
        print(f"Inserting {len(data)} documents into `{collection_name}`...")
        for doc in data:
            collection.update_one({"id": doc["id"]}, {"$set": doc}, upsert=True)
    else:
        print(f"Collection `{collection_name}` already contains data. Skipping insert.")

    client.close()


In [34]:
process_and_save_data(model)

Processing data from scratch...
Loaded 23481 articles from fake.csv
Loaded 21417 articles from true.csv
Loaded 86087 articles from finefake
Split data into 17133 training and 4284 test samples
Processing document 0...
Processing document 100...
Processing document 200...
Processing document 300...
Processing document 400...
Processing document 500...
Processing document 600...
Processing document 700...
Processing document 800...
Processing document 900...
Processing document 1000...
Processing document 1100...
Processing document 1200...
Processing document 1300...
Processing document 1400...
Processing document 1500...
Processing document 1600...
Processing document 1700...
Processing document 1800...
Processing document 1900...
Processing document 2000...
Processing document 2100...
Processing document 2200...
Processing document 2300...
Processing document 2400...
Processing document 2500...
Processing document 2600...
Processing document 2700...
Processing document 2800...
Process

(17120, 4272)

In [None]:
train_df, test_df = load_and_split_data()

# Process training data
print("Processing training data...")
generate_and_store_embeddings(
    df=train_df,
    embedding_model=model,
    mongodb_uri=config.MONGODB_URI,
    db_name=config.DB_NAME,
    collection_name=config.COLLECTION_NAME,
    batch_size=50
)

# Store test data in a separate collection for evaluation
test_collection = setup_mongodb_collection(config.COLLECTION_NAME + "_test")
print("Processing test data...")
generate_and_store_embeddings(
    df=test_df,
    embedding_model=embedding_model,
    mongodb_uri=config.MONGODB_URI,
    db_name=config.DB_NAME,
    collection_name=config.COLLECTION_NAME,
    batch_size=50
)

Loaded 23481 articles from fake.csv
Loaded 21417 articles from true.csv
Loaded 86087 articles from finefake
Split data into 17133 training and 4284 test samples
Processing training data...
Generating embeddings for 17133 documents...
Processing batch 1/343
Connected to MongoDB successfully


KeyboardInterrupt: 

In [44]:
def evaluate_system_with_tokenizer(classifier, tokenizer, mongodb_uri, db_name, test_collection_name):
    """Evaluate the system on test data with proper tokenization for long texts"""
    # Connect to the test collection
    client = MongoClient(mongodb_uri)
    db = client[db_name]
    collection = db[test_collection_name]
    
    # Retrieve all test documents
    test_docs = list(collection.find({}))
    
    # Initialize metrics
    true_labels = []
    predicted_labels = []
    
    print(f"Evaluating on {len(test_docs)} test documents...")
    
    for doc in test_docs:
        try:
            # Get the true label
            if 'label' in doc:
                true_label = doc['label']
                
                true_label = int(true_label)  # in case it's a string like '1'
                assert true_label in [0, 1], f"Invalid label: {true_label}"
                true_labels.append(true_label)
                
                
                # Get text for classification
                text = doc.get('processed_content', doc.get('original_content', ''))
                
                # Properly truncate using the model's tokenizer
                encoded_input = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

                # Move tokenized input to same device as the model
                device = next(classifier.model.parameters()).device
                encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
                
                # Run classifier with tokenized input
                with torch.no_grad():
                    output = classifier.model(**encoded_input)

                    logits = output.logits
                    
                    predicted_class = torch.argmax(logits, dim=1).item()
                
                # Map the output to your label scheme
                pred_label = predicted_class  # Adjust as needed for your model
                
                predicted_labels.append(pred_label)
                
                # Store prediction in MongoDB for later analysis
                collection.update_one(
                    {"_id": doc["_id"]},
                    {"$set": {"predicted_label": pred_label}}
                )
                
            else:
                print(f"Document {doc.get('id', 'unknown')} missing label, skipping...")
                
        except Exception as e:
            print(f"Error evaluating document {doc.get('id', 'unknown')}: {e}")
    
    # Calculate metrics
    if true_labels and predicted_labels:
        accuracy = accuracy_score(true_labels, predicted_labels)
        precision = precision_score(true_labels, predicted_labels)
        recall = recall_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels)
        cm = confusion_matrix(true_labels, predicted_labels)
        
        print("\nEvaluation Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print("\nConfusion Matrix:")
        print(cm)
        
        # Calculate AUC if possible
        try:
            auc = roc_auc_score(true_labels, predicted_labels)
            print(f"AUC: {auc:.4f}")
        except:
            print("Could not calculate AUC")
        
        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "confusion_matrix": cm.tolist()
        }
    else:
        print("No valid predictions to evaluate")
        return None

In [36]:
from transformers import AutoTokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your model
model = classifier.model.to(device)
model.eval()

# Choose a test string (you can pull one from MongoDB if needed)
text = "This is a test article. It talks about some fake event."

# Tokenize and move to GPU
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    print("Calling model...")
    outputs = model(**inputs)
    print("Success! Logits:", outputs.logits)


Calling model...
Success! Logits: tensor([[0.1382, 0.0289]], device='cuda:0')


In [14]:
tokenizer = AutoTokenizer.from_pretrained(config.LLM_MODEL)

In [45]:
eval_results = evaluate_system_with_tokenizer(
    classifier=classifier,
    tokenizer=tokenizer,
    mongodb_uri=config.MONGODB_URI,
    db_name=config.DB_NAME,
    test_collection_name=config.COLLECTION_NAME + "_test"
)

Evaluating on 4284 test documents...

Evaluation Results:
Accuracy: 0.7003
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Confusion Matrix:
[[3000 1284]
 [   0    0]]
Could not calculate AUC


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
def search_similar_documents(query_text, embedding_model, collection, top_k=3):
    """Search for similar documents using vector similarity"""
    # Generate embedding for the query
    query_embedding = embedding_model.encode(preprocess_text(query_text))
    
    # Search in MongoDB using vector similarity
    results = collection.aggregate([
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding.tolist(),
                "numCandidates": top_k * 3,  # Request more candidates for better results
                "limit": top_k
            }
        },
        {
            "$project": {
                "_id": 0,
                "title": 1,
                "text": 1,
                "subject": 1, 
                "date": 1,
                "score": {"$meta": "searchScore"}
            }
        }
    ])
    
    return list(results)

def process_query_with_rag(query, embedding_model, collection, qa_pipeline):
    """Process user query with RAG approach"""
    # Find relevant documents
    similar_docs = search_similar_documents(query, embedding_model, collection)
    
    # If no relevant documents found
    if not similar_docs:
        prompt = f"Question: {query}\nAnswer: Based on my knowledge,"
        return qa_pipeline(prompt)[0]["generated_text"]
    
    # Build context from similar documents
    context = ""
    for i, doc in enumerate(similar_docs):
        context += f"Document {i+1}:\nTitle: {doc['title']}\nContent: {doc['text']}\nSubject: {doc.get('subject', 'N/A')}\nDate: {doc.get('date', 'N/A')}\n\n"
    
    # Build RAG prompt
    prompt = f"""Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question: {query}
If the context doesn't contain enough information to answer the question, say so.
Check if the news might be fake by analyzing inconsistencies, extreme language, or unverifiable claims.
"""
    
    # Generate response
    response = qa_pipeline(prompt, max_length=512, num_return_sequences=1)[0]["generated_text"]
    
    # Extract only the generated answer part (after the prompt)
    answer = response[len(prompt):]
    return answer.strip()

def detect_fake_news(news_text, qa_pipeline):
    """Specialized function to analyze if a news item might be fake"""
    prompt = f"""Analyze the following news text and determine if it might be fake news.
Consider these aspects:
1. Is the language extremely emotional or sensationalist?
2. Are there unverifiable claims or statistics?
3. Does it contain inconsistencies or logical fallacies?
4. Is there extreme political bias?
5. Does it use manipulative tactics?

News text:
{news_text}

Provide a detailed analysis with evidence from the text, and conclude with a judgment on whether this is likely real news or fake news.
"""
    
    response = qa_pipeline(prompt, max_length=768, num_return_sequences=1)[0]["generated_text"]
    # Extract only the generated answer part
    answer = response[len(prompt):]
    return answer.strip()

In [None]:
def ask_question(query):
    """Process a question using the RAG system"""
    result = process_query_with_rag(query, embedding_model, collection, qa_pipeline)
    print(f"Question: {query}")
    print("\nRAG Response:")
    print(result)
    print("\n" + "-"*50 + "\n")

def analyze_news(news_text):
    """Analyze if a news article might be fake"""
    result = detect_fake_news(news_text, qa_pipeline)
    print("Fake News Analysis:")
    print(result)
    print("\n" + "-"*50 + "\n")

# Example usage:
# ask_question("What are common characteristics of fake news about climate change?")

# sample_news = """BREAKING: Scientists Found that Climate Change is a Hoax Created by China to Decrease US Manufacturing. 
# Documents leaked by anonymous sources show that 97% of scientists were paid to promote this theory."""
# analyze_news(sample_news)


In [None]:
def ask_question(query):
    """Process a question using the RAG system"""
    result = process_query_with_rag(query, embedding_model, collection, qa_pipeline)
    print(f"Question: {query}")
    print("\nRAG Response:")
    print(result)
    print("\n" + "-"*50 + "\n")

def analyze_news(news_text):
    """Analyze if a news article might be fake"""
    result = detect_fake_news(news_text, qa_pipeline)
    print("Fake News Analysis:")
    print(result)
    print("\n" + "-"*50 + "\n")

# Example usage:
# ask_question("What are common characteristics of fake news about climate change?")

# sample_news = """BREAKING: Scientists Found that Climate Change is a Hoax Created by China to Decrease US Manufacturing. 
# Documents leaked by anonymous sources show that 97% of scientists were paid to promote this theory."""
# analyze_news(sample_news)


In [None]:
def interactive_mode():
    print("\nFake News Detection RAG System Ready!")
    print("Enter 'q' to quit")
    
    while True:
        query = input("\nEnter your question or paste news to analyze: ")
        if query.lower() == 'q':
            break
            
        print("\nProcessing...")
        
        # Detect if this is a query or news text to analyze (simple heuristic)
        if len(query.split()) > 30:  # Longer text is likely news to analyze
            result = detect_fake_news(query, qa_pipeline)
            print("\nFake News Analysis:")
        else:  # Shorter text is likely a question
            result = process_query_with_rag(query, embedding_model, collection, qa_pipeline)
            print("\nRAG Response:")
            
        print(result)

# Run this cell to start interactive mode
# interactive_mode()