In [47]:
import os
import re
from typing import Dict, Union
from urllib.parse import urlparse

import requests
import tldextract
import torch
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [48]:
# Hugging Face checkpoint used to classify the fetched page text.
_CREDIBILITY_MODEL_NAME = "mrm8488/bert-tiny-finetuned-fake-news-detection"
# Filled on first use so repeated evaluations do not reload the weights.
_credibility_model_cache = {}


def _load_credibility_model():
    """Return the (tokenizer, model) pair, loading it only on the first call."""
    if "pair" not in _credibility_model_cache:
        tokenizer = AutoTokenizer.from_pretrained(_CREDIBILITY_MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(_CREDIBILITY_MODEL_NAME)
        _credibility_model_cache["pair"] = (tokenizer, model)
    return _credibility_model_cache["pair"]


def evaluate_reference_credibility(url: str) -> Dict[str, Union[float, str]]:
    """
    Scores a URL by blending a fixed base confidence with a text-classifier output.

    The page is downloaded, its visible text is truncated to 512 characters and
    fed to the sequence-classification model; the result is
    ``0.7 * base_score + 0.3 * model_score``.

    Args:
        url (str): The URL to evaluate

    Returns:
        dict: Contains 'score' (float) and 'explanation' (str). Any failure
        (model load, download, HTTP error status, inference) yields a score
        of 0.0 with an explanation of which step failed.
    """
    base_score = 0.5  # Fixed prior that the model output is blended with
    try:
        tokenizer, model = _load_credibility_model()

        # Get some text content from the URL
        try:
            response = requests.get(url, timeout=5)
            # Without this, a 404/500 error page would be classified as if it
            # were the referenced article.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            content = soup.get_text()[:512]  # Truncate to avoid token limits
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C / kernel
            # interrupt still stops a hanging request.
            return {
                "score": 0.0,
                "explanation": "Failed to fetch URL content"
            }

        # Get model prediction
        try:
            inputs = tokenizer(content, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                # NOTE(review): assumes label index 1 is the "reliable" class;
                # confirm against the checkpoint's id2label mapping.
                model_score = predictions[0][1].item()
        except Exception:
            return {
                "score": 0.0,
                "explanation": "Failed to evaluate content with model"
            }

        final_score = (0.7 * base_score) + (0.3 * model_score)

        return {
            "score": final_score,
            "explanation": f"Base confidence: {base_score:.2f} | ML model confidence: {model_score:.2f}"
        }
    except Exception as e:
        return {
            "score": 0.0,
            "explanation": f"Error evaluating URL: {str(e)}"
        }

In [49]:
def evaluate_fact_check(url: str) -> Dict[str, Union[float, str]]:
    """
    Basic fact-checking evaluation based on known fact-checking sources and scientific journals.

    The score starts at 0.5 and is replaced by the listed score when the host
    is (a subdomain of) a known authoritative source. Unlisted ``.gov`` hosts
    get 0.8, ``.edu`` hosts get at least 0.8, and a path containing
    ``/article/``, ``/research/``, ``/study/`` or ``/paper/`` adds 0.05
    (capped at 1.0).

    Args:
        url (str): The URL to evaluate

    Returns:
        dict: Contains 'fact_check_score' (float) and 'explanation' (str).
        If parsing raises, the score is 0.0 and the explanation carries the error.
    """
    try:
        # Parse the URL
        parsed_url = urlparse(url)

        # `hostname` (unlike `netloc`) excludes credentials and the port and is
        # already lower-cased, so "https://cdc.gov:443/" still matches "cdc.gov".
        # A trailing root dot ("cdc.gov.") is dropped for the same reason.
        domain = (parsed_url.hostname or "").rstrip(".")

        # Remove 'www.' if present
        if domain.startswith('www.'):
            domain = domain[4:]

        # Define authoritative sources and their scores
        authoritative_sources = {
            # Government Health Organizations
            'ncbi.nlm.nih.gov': 0.95,    # National Center for Biotechnology Information
            'nih.gov': 0.95,             # National Institutes of Health
            'cdc.gov': 0.95,             # Centers for Disease Control
            'fda.gov': 0.95,             # Food and Drug Administration
            'who.int': 0.95,             # World Health Organization

            # Medical Research Institutions
            'mayoclinic.org': 0.9,       # Mayo Clinic
            'hopkinsmedicine.org': 0.9,   # Johns Hopkins Medicine
            'medlineplus.gov': 0.9,       # MedlinePlus (NIH)

            # Scientific Journals
            'nature.com': 0.9,
            'science.org': 0.9,
            'thelancet.com': 0.9,
            'nejm.org': 0.9,             # New England Journal of Medicine
            'jamanetwork.com': 0.9,      # Journal of American Medical Association

            # Fact-Checking Organizations
            'cochrane.org': 0.9,         # Cochrane Reviews
            'factcheck.org': 0.85,
            'snopes.com': 0.85,
            'reuters.com': 0.85,
            'apnews.com': 0.85
        }

        # Basic scoring logic
        score = 0.5  # Default score
        explanation = []
        recognized = False

        # Check if it's a known authoritative source. The dict is ordered so
        # that the more specific 'ncbi.nlm.nih.gov' is tried before 'nih.gov'.
        for source, source_score in authoritative_sources.items():
            if domain == source or domain.endswith('.' + source):
                score = source_score
                recognized = True
                explanation.append(f"Recognized authoritative source ({source})")
                break

        # Additional score for .gov domains not in our list
        if domain.endswith('.gov') and not recognized:
            score = 0.8
            explanation.append("Government domain")

        # Additional score for academic institutions
        if domain.endswith('.edu'):
            score = max(score, 0.8)
            explanation.append("Academic institution")

        # Check for scientific article indicators in URL
        path = parsed_url.path.lower()
        if any(x in path for x in ['/article/', '/research/', '/study/', '/paper/']):
            score = min(score + 0.05, 1.0)
            explanation.append("Scientific article indicators")

        return {
            "fact_check_score": score,
            "explanation": " | ".join(explanation) if explanation else "Basic fact-check evaluation"
        }

    except Exception as e:
        return {
            "fact_check_score": 0.0,
            "explanation": f"Error evaluating URL: {str(e)}"
        }

In [50]:
def evaluate_relevance(url: str, user_question: str, api_key: str) -> Dict[str, Union[float, str]]:
    """
    Evaluates the relevance of a URL based on NewsAPI article matching.

    Starting from 0.5, the score gains 0.1 per NewsAPI hit for the question on
    the URL's base domain (at most +0.4), another 0.3 when one of the returned
    articles contains the URL itself, and is raised to at least 0.8 for a
    small list of scientific hosts. The result is clamped to [0.0, 1.0].

    Args:
        url (str): The URL to evaluate
        user_question (str): The user's question for relevance matching
        api_key (str): NewsAPI key for querying articles

    Returns:
        dict: Contains 'relevance_score' (float) and 'explanation' (str).
        Any exception yields a score of 0.0 with the error in the explanation.
    """
    try:
        # Initialize NewsAPI client
        newsapi = NewsApiClient(api_key=api_key)

        # Host without credentials/port, lower-cased by urlparse.
        parsed_url = urlparse(url)
        domain = (parsed_url.hostname or "").rstrip(".")

        # Extract base domain (remove www. and subdomains). Empty parts are
        # skipped so a suffix-less host does not become "localhost.".
        extracted = tldextract.extract(url)
        base_domain = ".".join(part for part in (extracted.domain, extracted.suffix) if part)

        # Initialize scoring
        score = 0.5  # Default score
        explanation = []

        # Search for articles related to the user's question
        articles = newsapi.get_everything(
            q=user_question,
            domains=base_domain,
            language='en',
            sort_by='relevancy',
            page_size=10
        )

        # Check if articles were found
        if articles['status'] == 'ok':
            total_results = articles['totalResults']
            if total_results > 0:
                # Increase score based on number of articles
                score += min(0.4, 0.1 * total_results)
                explanation.append(f"Found {total_results} relevant articles")

                # Look for exact URL match. `or ''` guards against an article
                # whose 'url' is present but None, which would otherwise raise
                # and turn the whole evaluation into a 0.0 error result.
                for article in articles['articles']:
                    if url in (article.get('url') or ''):
                        score = min(1.0, score + 0.3)
                        explanation.append("Exact URL match found in news articles")
                        break

            # Bonus for scientific/research domains. Match on label boundaries
            # so look-alike hosts such as "nih.gov.evil.example.com" or
            # "fakenature.com" do not receive the bonus.
            scientific_domains = ['ncbi.nlm.nih.gov', 'nih.gov', 'nature.com', 'science.org']
            if any(domain == sci_domain or domain.endswith('.' + sci_domain)
                   for sci_domain in scientific_domains):
                score = max(score, 0.8)
                explanation.append("Recognized scientific research domain")

        # Cap the score at 1.0 and floor at 0.0
        score = min(1.0, max(0.0, score))

        return {
            "relevance_score": score,
            "explanation": " | ".join(explanation) if explanation else "No direct relevance found via NewsAPI"
        }
    except Exception as e:
        return {
            "relevance_score": 0.0,
            "explanation": f"Error evaluating relevance: {str(e)}"
        }

In [51]:
def _bias_normalized_host(url: str) -> str:
    """Return the lower-cased host of `url` without port, root dot or leading 'www.'."""
    host = (urlparse(url).hostname or "").rstrip(".")
    return host[4:] if host.startswith('www.') else host


def _bias_same_site(host_a: str, host_b: str) -> bool:
    """True when the hosts are equal or one is a subdomain of the other."""
    if not host_a or not host_b:
        return False
    return (
        host_a == host_b
        or host_a.endswith('.' + host_b)
        or host_b.endswith('.' + host_a)
    )


def evaluate_bias(url: str, api_key: str) -> Dict[str, Union[float, str]]:
    """
    Evaluates bias using NewsAPI source data.

    Starting from 0.5, the first NewsAPI source whose host matches the URL's
    host adds 0.3 (science) or 0.2 (technology / health) by category and 0.1
    when its language is English. ``.gov`` hosts are raised to at least 0.9
    and ``.edu`` hosts to at least 0.8. The result is capped at 1.0.

    Args:
        url (str): The URL to evaluate
        api_key (str): NewsAPI key

    Returns:
        dict: Contains 'bias_score' (float) and 'explanation' (str).
        Any exception yields a score of 0.0 with the error in the explanation.
    """
    try:
        # Initialize NewsAPI client
        newsapi = NewsApiClient(api_key=api_key)

        # Host of the URL under evaluation
        domain = _bias_normalized_host(url)

        # Get source information from NewsAPI
        sources = newsapi.get_sources()

        score = 0.5  # Default score
        explanation = []

        if sources['status'] == 'ok':
            # Look for matching source
            for source in sources['sources']:
                source_host = _bias_normalized_host(source.get('url') or '')
                # Compare on label boundaries: plain substring tests matched
                # unrelated hosts ("a.com" in "www.nbaa.com") and treated an
                # empty host on either side as a match for everything.
                if not _bias_same_site(domain, source_host):
                    continue

                # Evaluate based on source category
                category = source.get('category')
                if category == 'science':
                    score += 0.3
                    explanation.append("Scientific news source")
                elif category == 'technology':
                    score += 0.2
                    explanation.append("Technology news source")
                elif category == 'health':
                    score += 0.2
                    explanation.append("Health news source")

                # Consider language
                if source.get('language') == 'en':
                    score += 0.1
                    explanation.append("English language source")

                break

        # Add checks for .gov and .edu domains
        if domain.endswith('.gov'):
            score = max(score, 0.9)
            explanation.append("Government domain")
        elif domain.endswith('.edu'):
            score = max(score, 0.8)
            explanation.append("Educational institution")

        # Cap the score at 1.0
        score = min(1.0, score)

        return {
            "bias_score": score,
            "explanation": " | ".join(explanation) if explanation else "Basic bias evaluation using NewsAPI"
        }

    except Exception as e:
        return {
            "bias_score": 0.0,
            "explanation": f"Error evaluating bias: {str(e)}"
        }

In [52]:
def evaluate_citations(url: str) -> Dict[str, Union[float, int, str]]:
    """
    Evaluates citation count and reference quality from webpage content.

    Citations are counted as ``<cite>`` tags plus ``[n]`` markers in the raw
    HTML; references are counted as ``<li>`` items inside div/section elements
    whose class mentions reference, bibliography or citations. The larger of
    the two counts selects a score tier (>=5: 0.6, >=15: 0.7, >=30: 0.8,
    >=50: 0.9; otherwise the default 0.5), and the presence of any
    ``doi.org/...`` link adds 0.1 (capped at 1.0).

    Args:
        url (str): The URL to evaluate

    Returns:
        dict: Contains 'citation_score' (float), 'explanation' (str) and
        'citation_count' (int). A failed download, an HTTP error status or any
        other exception yields a score of 0.0 and a count of 0.
    """
    try:
        # Fetch the webpage content
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx as a failure instead of scoring the error page as if
        # it were the article ("No citations found", score 0.5).
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Initialize counters and score
        citation_count = 0
        reference_count = 0
        score = 0.5  # Default score
        explanation = []

        # Find citations in different formats
        # Look for <cite> tags
        cite_tags = soup.find_all('cite')
        citation_count += len(cite_tags)

        # Look for numbered references [1], [2], etc.
        numbered_refs = re.findall(r'\[\d+\]', response.text)
        citation_count += len(numbered_refs)

        # Look for reference or bibliography section
        ref_sections = soup.find_all(['div', 'section'],
                                   class_=re.compile(r'reference|bibliography|citations', re.I))

        # Count list items in reference sections
        for section in ref_sections:
            reference_count += len(section.find_all('li'))

        # In-text markers and reference-list entries describe the same
        # citations, so take the larger count rather than the sum.
        total_citations = max(citation_count, reference_count)

        if total_citations > 0:
            # Adjust score based on number of citations
            if total_citations >= 50:
                score = 0.9
            elif total_citations >= 30:
                score = 0.8
            elif total_citations >= 15:
                score = 0.7
            elif total_citations >= 5:
                score = 0.6

            explanation.append(f"Found {total_citations} citations/references")

        # Look for DOI references
        doi_refs = re.findall(r'doi\.org/\d+\.\d+/\S+', response.text)
        if doi_refs:
            score += 0.1  # Bonus for having DOI references
            explanation.append(f"Found {len(doi_refs)} DOI references")
            score = min(score, 1.0)  # Cap at 1.0

        return {
            "citation_score": score,
            "explanation": " | ".join(explanation) if explanation else "No citations found",
            "citation_count": total_citations
        }

    except Exception as e:
        return {
            "citation_score": 0.0,
            "explanation": f"Error analyzing citations: {str(e)}",
            "citation_count": 0
        }

In [53]:
def aggregate_scores(result_domain, result_relevance, result_fact_check, result_bias, result_citations):
    """
    Combine the five evaluator results into one weighted score.

    Each argument is the dict returned by the corresponding evaluator; a
    missing score key counts as 0. Returns a dict with 'final_score' (the
    weighted sum rounded to two decimals) and 'individual_scores' (the raw
    per-component scores keyed by component name).
    """
    # (component name, evaluator result, key holding its score, weight)
    components = (
        ('domain', result_domain, 'score', 0.2),
        ('relevance', result_relevance, 'relevance_score', 0.15),
        ('fact_check', result_fact_check, 'fact_check_score', 0.25),
        ('bias', result_bias, 'bias_score', 0.2),
        ('citations', result_citations, 'citation_score', 0.2),
    )

    # Pull every raw score out first, then accumulate the weighted total in
    # the same component order.
    individual = {
        label: result.get(score_key, 0)
        for label, result, score_key, _ in components
    }

    weighted_total = 0
    for label, _, _, weight in components:
        weighted_total = weighted_total + individual[label] * weight

    return {
        'final_score': round(weighted_total, 2),
        'individual_scores': individual
    }

In [54]:
"""
Main method to call all evaluation functions and return the scores based on user_question and url
"""

user_question = "Are cigarettes addictive?"
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2928221/"

# The NewsAPI key is a secret, so it is read from the environment instead of
# being stored in the notebook. A key that was previously committed here
# should be revoked and replaced.
newsapi_key = os.environ.get("NEWSAPI_KEY")
if not newsapi_key:
    raise RuntimeError(
        "Set the NEWSAPI_KEY environment variable to your NewsAPI key before running this cell."
    )

result_domain = evaluate_reference_credibility(url)
result_relevance = evaluate_relevance(url, user_question, newsapi_key)
result_fact_check = evaluate_fact_check(url)
result_bias = evaluate_bias(url, newsapi_key)
result_citations = evaluate_citations(url)

# Print individual results
print("Individual Scores:")
print(f"Domain Score: {result_domain}")
print(f"Relevance Score: {result_relevance}")
print(f"Fact Check Score: {result_fact_check}")
print(f"Bias Score: {result_bias}")
print(f"Citation Score: {result_citations}")

# Calculate and print aggregate score
final_results = aggregate_scores(result_domain, result_relevance, result_fact_check, result_bias, result_citations)
print("\nFinal Aggregated Score:")
print(final_results)

Individual Scores:
Domain Score: {'score': 0.6497420072555542, 'explanation': 'Base confidence: 0.50 | ML model confidence: 1.00'}
Relevance Score: {'relevance_score': 0.8, 'explanation': 'Recognized scientific research domain'}
Fact Check Score: {'fact_check_score': 0.95, 'explanation': 'Recognized authoritative source (ncbi.nlm.nih.gov)'}
Bias Score: {'bias_score': 0.9, 'explanation': 'Government domain'}
Citation Score: {'citation_score': 1.0, 'explanation': 'Found 80 citations/references | Found 80 DOI references', 'citation_count': 80}

Final Aggregated Score:
{'final_score': 0.87, 'individual_scores': {'domain': 0.6497420072555542, 'relevance': 0.8, 'fact_check': 0.95, 'bias': 0.9, 'citations': 1.0}}
