In [57]:
from urllib.parse import urlparse
import tldextract
from typing import Dict, Union
import requests
from bs4 import BeautifulSoup
import re

In [58]:


def evaluate_reference_credibility(url: str) -> Dict[str, Union[float, str]]:
    """
    Basic URL credibility evaluation based on domain type and known reliable sources.

    The score starts at 0.5. A trusted domain type (taken from the public
    suffix) replaces it, and a recognized source then replaces it again, so a
    recognized source always has the final say.

    Args:
        url (str): The URL to evaluate

    Returns:
        dict: Contains 'score' (float) and 'explanation' (str). When the
        evaluation itself raises, 'score' is 0.0 and 'explanation' holds the
        error message.
    """
    try:
        # Parse the URL
        extracted = tldextract.extract(url)
        suffix = (extracted.suffix or "").lower()
        # Rebuild the full host so that sources living on a subdomain
        # (e.g. scholar.google.com) can be recognized; the registered domain
        # alone ("google.com") would never match such an entry.
        host = ".".join(
            part.lower()
            for part in (extracted.subdomain, extracted.domain, extracted.suffix)
            if part
        )

        # Define trusted domain types and their scores
        trusted_domains = {
            'edu': 0.9,    # Educational institutions
            'gov': 0.9,    # Government websites
            'org': 0.7,    # Non-profit organizations
        }

        # Define trusted sources
        trusted_sources = {
            'nature.com': 0.9,
            'science.org': 0.9,
            'scholar.google.com': 0.8,
        }

        # Basic scoring logic
        score = 0.5  # Default score
        explanation = []

        # Check domain type. Multi-label suffixes such as "gov.uk" or "edu.au"
        # count when any one of their labels is a trusted type.
        trusted_type = next(
            (label for label in suffix.split(".") if label in trusted_domains),
            None,
        )
        if trusted_type is not None:
            score = trusted_domains[trusted_type]
            explanation.append(f"Trusted domain type ({suffix})")

        # Check for trusted sources: the host itself or any subdomain of it.
        # This runs after the domain-type check so that it overrides the score.
        for source, source_score in trusted_sources.items():
            if host == source or host.endswith("." + source):
                score = source_score
                explanation.append(f"Recognized trusted source ({source})")
                break

        return {
            "score": score,
            "explanation": " | ".join(explanation) if explanation else "Basic domain evaluation"
        }

    except Exception as e:
        # Best-effort contract: never raise, report the failure in the result
        return {
            "score": 0.0,
            "explanation": f"Error evaluating URL: {str(e)}"
        }

In [59]:
def evaluate_fact_check(url: str) -> Dict[str, Union[float, str]]:
    """
    Basic fact-checking evaluation based on known fact-checking sources and scientific journals.

    The score starts at 0.5, is replaced by the rating of a known source when
    the host (or a parent domain of it) is listed, and gains 0.1 (capped at
    1.0) when the path looks like an article/research page.

    Args:
        url (str): The URL to evaluate

    Returns:
        dict: Contains 'fact_check_score' (float, rounded to 2 decimals) and
        'explanation' (str). When the evaluation itself raises, the score is
        0.0 and 'explanation' holds the error message.
    """
    try:
        # Parse the URL. Without a scheme urlparse() treats the whole string
        # as a path and leaves the host empty, so prefix '//' in that case.
        parsed_url = urlparse(url if '://' in url else '//' + url)

        # .hostname (unlike .netloc) excludes userinfo and port and is already
        # lower-cased, so "snopes.com:443" still matches "snopes.com".
        domain = (parsed_url.hostname or '').rstrip('.')

        # Remove 'www.' if present
        if domain.startswith('www.'):
            domain = domain[4:]

        # Define fact-checking sources and their scores
        fact_check_sources = {
            'snopes.com': 0.9,
            'factcheck.org': 0.9,
            'politifact.com': 0.85,
            'reuters.com': 0.85,
            'apnews.com': 0.85,
            'nature.com': 0.95,       # Peer-reviewed scientific journal
            'science.org': 0.95,      # Peer-reviewed scientific journal
            'thelancet.com': 0.95,    # Medical journal
        }
        scientific_sources = ('nature.com', 'science.org', 'thelancet.com')

        # Basic scoring logic
        score = 0.5  # Default score
        explanation = []

        # Check if it's a known fact-checking or scientific source
        # (exact host or any subdomain of a listed source)
        for source, source_score in fact_check_sources.items():
            if domain == source or domain.endswith('.' + source):
                score = source_score
                if source in scientific_sources:
                    explanation.append(f"Peer-reviewed scientific source ({source})")
                else:
                    explanation.append(f"Recognized fact-checking source ({source})")
                break

        # Additional check for scientific article patterns
        path = parsed_url.path.lower()
        if any(x in path for x in ['/article/', '/research/', '/study/', '/paper/']):
            score += 0.1
            explanation.append("Contains scientific article indicators")

        # Cap at 1.0 and strip binary floating-point noise from the addition
        score = round(min(score, 1.0), 2)

        return {
            "fact_check_score": score,
            "explanation": " | ".join(explanation) if explanation else "Basic fact-check evaluation"
        }

    except Exception as e:
        # Best-effort contract: never raise, report the failure in the result
        return {
            "fact_check_score": 0.0,
            "explanation": f"Error evaluating URL: {str(e)}"
        }

In [60]:
def evaluate_relevance(url: str) -> Dict[str, Union[float, str]]:
    """
    Evaluates the relevance of a URL based on its structure and content indicators.

    Only the URL path is inspected. Three independent signals are checked and
    each one that fires adds 0.1 to the base score of 0.5:
      1. an article-style path segment (e.g. '/research/', '/doi/'),
      2. a date-like pattern,
      3. a research-related keyword anywhere in the path.

    Args:
        url (str): The URL to evaluate

    Returns:
        dict: Contains 'relevance_score' (float, rounded to 2 decimals) and
        'explanation' (str). When the evaluation itself raises, the score is
        0.0 and 'explanation' holds the error message.
    """
    try:
        # Parse the URL
        parsed_url = urlparse(url)
        path = parsed_url.path.lower()

        explanation = []

        # Signal 1: article indicators in URL (first match wins)
        article_indicators = ['/article/', '/research/', '/study/', '/paper/',
                              '/journal/', '/publication/', '/doi/']
        article_hit = next((ind for ind in article_indicators if ind in path), None)
        if article_hit is not None:
            explanation.append(f"Article indicator found ({article_hit.strip('/')})")

        # Signal 2: date patterns in URL
        date_patterns = [
            r'/20\d{2}/',              # Year pattern (2000-2099)
            r'/\d{4}/\d{2}/',          # Year/month pattern
            r'\d{4}\.\d{4,}',          # DOI-like date pattern
        ]
        if any(re.search(pattern, path) for pattern in date_patterns):
            explanation.append("Contains publication date indicator")

        # Signal 3: academic or scientific keywords (substring match, first wins)
        academic_indicators = [
            'research', 'study', 'analysis', 'report', 'findings',
            'investigation', 'experiment', 'trial', 'survey'
        ]
        academic_hit = next((ind for ind in academic_indicators if ind in path), None)
        if academic_hit is not None:
            explanation.append(f"Contains research indicator ({academic_hit})")

        # Every signal contributed exactly one explanation entry. Computing the
        # score once and rounding avoids the binary floating-point drift of
        # repeated "+= 0.1" (0.5 + 0.1 + 0.1 + 0.1 == 0.7999999999999999).
        score = round(min(1.0, 0.5 + 0.1 * len(explanation)), 2)

        return {
            "relevance_score": score,
            "explanation": " | ".join(explanation) if explanation else "Basic relevance evaluation"
        }

    except Exception as e:
        # Best-effort contract: never raise, report the failure in the result
        return {
            "relevance_score": 0.0,
            "explanation": f"Error evaluating relevance: {str(e)}"
        }

In [61]:
def evaluate_bias(url: str) -> Dict[str, Union[float, str]]:
    """
    Evaluates bias using a built-in table of ratings based on Media Bias/Fact Check (MBFC) data.

    The score starts at 0.5, is replaced by the table rating when the host (or
    a parent domain of it) is listed, and gains 0.1 (capped at 1.0) when the
    path contains a scientific/academic segment. Higher means less biased.

    Args:
        url (str): The URL to evaluate

    Returns:
        dict: Contains 'bias_score' (float, rounded to 2 decimals) and
        'explanation' (str). When the evaluation itself raises, the score is
        0.0 and 'explanation' holds the error message.
    """
    try:
        # Parse the URL. Without a scheme urlparse() treats the whole string
        # as a path and leaves the host empty, so prefix '//' in that case.
        parsed_url = urlparse(url if '://' in url else '//' + url)

        # .hostname (unlike .netloc) excludes userinfo and port and is already
        # lower-cased, so "apnews.com:443" still matches "apnews.com".
        domain = (parsed_url.hostname or '').rstrip('.')
        if domain.startswith('www.'):
            domain = domain[4:]

        # Open-source bias ratings based on MBFC data
        # Higher score = less biased/more neutral
        bias_ratings = {
            # Least Biased - Scientific Sources
            'nature.com': 0.95,
            'science.org': 0.95,
            'scientificamerican.com': 0.9,
            'sciencedaily.com': 0.9,

            # Least Biased - News Sources
            'reuters.com': 0.9,
            'apnews.com': 0.9,

            # Slight Bias
            'bbc.com': 0.8,
            'economist.com': 0.8,
            'bloomberg.com': 0.8,

            # Moderate Bias
            'wsj.com': 0.7,
            'nytimes.com': 0.7,
            'theguardian.com': 0.7,

            # Strong Bias
            'foxnews.com': 0.5,
            'huffpost.com': 0.5,

            # Very Strong Bias
            'breitbart.com': 0.3,
            'dailykos.com': 0.3,
        }

        # Initialize scoring
        score = 0.5  # Default score
        explanation = []

        # Check known bias ratings (exact host or any subdomain of a rated
        # source, e.g. "uk.reuters.com")
        for source, rating in bias_ratings.items():
            if domain == source or domain.endswith('.' + source):
                score = rating
                explanation.append("Known source bias rating from MBFC data")
                break

        # Check for scientific/academic indicators
        academic_indicators = ['/research/', '/study/', '/paper/', '/journal/',
                               '/proceedings/', '/article/', '/publication/']
        path = parsed_url.path.lower()
        if any(indicator in path for indicator in academic_indicators):
            score += 0.1  # Scientific content tends to be less biased
            explanation.append("Scientific/academic content indicator")

        # Cap at 1.0 and strip binary floating-point noise
        # (0.7 + 0.1 == 0.7999999999999999 without rounding)
        score = round(min(score, 1.0), 2)

        return {
            "bias_score": score,
            "explanation": " | ".join(explanation) if explanation else "Basic bias evaluation using MBFC data"
        }

    except Exception as e:
        # Best-effort contract: never raise, report the failure in the result
        return {
            "bias_score": 0.0,
            "explanation": f"Error evaluating bias: {str(e)}"
        }

In [62]:
def evaluate_citations(url: str) -> Dict[str, Union[float, str]]:
    """
    Evaluates citation count and reference quality from webpage content.

    The page is downloaded and three things are counted:
      * <cite> tags plus numbered markers such as "[12]" in the raw HTML,
      * <li> items inside div/section elements whose class mentions
        reference/bibliography/citations,
      * DOI links (doi.org/<prefix>/<suffix>).
    The larger of the first two counts selects a tier score (0.5 up to 0.9)
    and the presence of any DOI link adds 0.1, capped at 1.0.

    Args:
        url (str): The URL to evaluate

    Returns:
        dict: Contains 'citation_score' (float, rounded to 2 decimals),
        'explanation' (str) and 'citation_count' (int). When the download or
        the analysis fails - including a non-success HTTP status - the score
        is 0.0, the count is 0 and 'explanation' holds the error message.
    """
    try:
        # Fetch the webpage content
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        # A 4xx/5xx error page is not the document itself; route it to the
        # error result below instead of scoring the error page's markup.
        response.raise_for_status()
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')

        explanation = []

        # In-text citations: <cite> tags and numbered references [1], [2], ...
        citation_count = len(soup.find_all('cite')) + len(re.findall(r'\[\d+\]', html))

        # Reference-list entries: list items inside reference/bibliography sections
        ref_sections = soup.find_all(['div', 'section'],
                                     class_=re.compile(r'reference|bibliography|citations', re.I))
        reference_count = sum(len(section.find_all('li')) for section in ref_sections)

        # The two counts are different views of the same references, so take
        # the larger one instead of adding them up.
        total_citations = max(citation_count, reference_count)

        # (minimum citations, score), highest tier first; fewer than 5 keeps
        # the default score of 0.5
        score_tiers = ((50, 0.9), (30, 0.8), (15, 0.7), (5, 0.6))
        score = next(
            (tier_score for minimum, tier_score in score_tiers if total_citations >= minimum),
            0.5,
        )
        if total_citations > 0:
            explanation.append(f"Found {total_citations} citations/references")

        # Look for DOI references
        doi_refs = re.findall(r'doi\.org/\d+\.\d+/\S+', html)
        if doi_refs:
            score += 0.1  # Bonus for having DOI references
            explanation.append(f"Found {len(doi_refs)} DOI references")

        # Cap at 1.0 and strip binary floating-point noise
        # (0.7 + 0.1 == 0.7999999999999999 without rounding)
        score = round(min(score, 1.0), 2)

        return {
            "citation_score": score,
            "explanation": " | ".join(explanation) if explanation else "No citations found",
            "citation_count": total_citations
        }

    except Exception as e:
        # Best-effort contract: never raise, report the failure in the result
        return {
            "citation_score": 0.0,
            "explanation": f"Error analyzing citations: {str(e)}",
            "citation_count": 0
        }

In [63]:
def aggregate_scores(result_domain, result_relevance, result_fact_check, result_bias, result_citations):
    """
    Combine the five evaluator results into one weighted score.

    Each argument is the dict returned by the matching evaluator. A result
    that lacks its score key contributes 0.

    Returns:
        dict: 'final_score' (weighted sum rounded to 2 decimals) and
        'individual_scores' (component name -> raw score).
    """
    # One row per component: (name, evaluator result, key of its score, weight).
    # The weights add up to 1.0.
    components = (
        ('domain', result_domain, 'score', 0.2),
        ('relevance', result_relevance, 'relevance_score', 0.15),
        ('fact_check', result_fact_check, 'fact_check_score', 0.25),
        ('bias', result_bias, 'bias_score', 0.2),
        ('citations', result_citations, 'citation_score', 0.2),
    )

    # Pull the raw score out of every result
    individual = {
        name: result.get(score_key, 0)
        for name, result, score_key, _weight in components
    }

    # Weighted sum over the components, in table order
    weighted_total = sum(
        individual[name] * weight
        for name, _result, _score_key, weight in components
    )

    return {
        'final_score': round(weighted_total, 2),
        'individual_scores': individual,
    }

In [64]:
"""
Driver cell: run every evaluator on a sample URL, then print the individual results and the aggregated score.
"""
# Sample input: a Nature article
url = "https://www.nature.com/articles/s41586-020-2649-2"

# Run each evaluator on the sample URL (evaluate_citations downloads the page)
result_domain = evaluate_reference_credibility(url)
result_relevance = evaluate_relevance(url)
result_fact_check = evaluate_fact_check(url)
result_bias = evaluate_bias(url)
result_citations = evaluate_citations(url)

# Report the raw result dict of every evaluator, one per line
print(
    "Individual Scores:",
    f"Domain Score: {result_domain}",
    f"Relevance Score: {result_relevance}",
    f"Fact Check Score: {result_fact_check}",
    f"Bias Score: {result_bias}",
    f"Citation Score: {result_citations}",
    sep="\n",
)

# Combine the five results into the weighted score and report it
final_results = aggregate_scores(
    result_domain,
    result_relevance,
    result_fact_check,
    result_bias,
    result_citations,
)
print("\nFinal Aggregated Score:", final_results, sep="\n")

Individual Scores:
Domain Score: {'score': 0.9, 'explanation': 'Recognized trusted source (nature.com)'}
Relevance Score: {'relevance_score': 0.5, 'explanation': 'Basic relevance evaluation'}
Fact Check Score: {'fact_check_score': 0.95, 'explanation': 'Peer-reviewed scientific source (nature.com)'}
Bias Score: {'bias_score': 0.95, 'explanation': 'Known source bias rating from MBFC data'}
Citation Score: {'citation_score': 0.6, 'explanation': 'Found 2 citations/references | Found 3 DOI references', 'citation_count': 2}

Final Aggregated Score:
{'final_score': 0.8, 'individual_scores': {'domain': 0.9, 'relevance': 0.5, 'fact_check': 0.95, 'bias': 0.95, 'citations': 0.6}}
