In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read the Gemini API key from the environment; .get() yields None when API_KEY is unset.
API_KEY = os.environ.get('API_KEY')
from google import genai

# Shared Gemini client passed to the experiment functions in the cells below.
# NOTE(review): nothing checks that API_KEY was found — confirm how genai.Client
# behaves when api_key is None before relying on this on a fresh machine.
client = genai.Client(api_key=API_KEY)

## Ablation Studies

In [None]:
import pandas as pd
import time
from tqdm import tqdm
import csv
import os
import concurrent.futures
import re
import threading
from functools import lru_cache

# Regular expression to extract assessment value
ASSESSMENT_PATTERN = re.compile(r'<overall_assessment>([01]|REAL|FAKE)</overall_assessment>')

def single_model_detection(client, news_text, model_name):
    """
    Classify one news item with a single Gemini model.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...)``
            whose return value has a ``text`` attribute (the Gemini API client).
        news_text: The news text to analyze.
        model_name: Model identifier forwarded to the API
            (the study uses 'gemini-1.5-pro' and 'gemini-2.0-flash').

    Returns:
        Dictionary with:
            "verification": the model's raw reply, or a fallback ``<verification>``
                block (assessment 0) when the reply is empty, lacks the
                ``<verification>`` tag, or the API call raised.
            "processing_time": elapsed seconds for the whole call.
    """
    def _fallback(confidence, issue, reasoning):
        # Builds the same XML layout the hand-written fallback blocks used.
        # NOTE(review): the fallback always reports assessment 0 (FAKE), so a failed
        # call is scored as a FAKE prediction downstream — confirm this is intended.
        return (
            "<verification>\n"
            "  <overall_assessment>0</overall_assessment>\n"
            f"  <confidence_score>{confidence}</confidence_score>\n"
            f"  <key_issues>{issue}</key_issues>\n"
            f"  <reasoning>{reasoning}</reasoning>\n"
            "</verification>"
        )

    # perf_counter is monotonic, so the duration is immune to wall-clock adjustments.
    started = time.perf_counter()

    prompt = f"""
    Analyze this news content and determine if it's real (1) or fake (0). 
    
    News content: {news_text}
    
    Respond using exactly this format:
    <verification>
      <overall_assessment>1 for REAL or 0 for FAKE</overall_assessment>
      <confidence_score>0-1</confidence_score>
      <key_issues>Brief issues</key_issues>
      <reasoning>Brief reasoning</reasoning>
    </verification>
    """

    try:
        response = client.models.generate_content(
            model=model_name,
            contents=prompt
        )
        result = response.text

        # An empty/None reply is a malformed response, not an API error; check it
        # explicitly instead of letting `in None` raise into the except branch.
        if not result or "<verification>" not in result:
            result = _fallback(
                "0.5", "Malformed response", "Could not properly analyze the content"
            )

    except Exception as e:
        print(f"Error with {model_name}: {e}")
        result = _fallback("0.0", "API error", f"{model_name} encountered an error")

    return {
        "verification": result,
        "processing_time": time.perf_counter() - started
    }

def two_agent_detection(client, news_text):
    """
    Our original 2-agent framework: one model extracts claims, a second classifies.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...)``
            whose return value has a ``text`` attribute (the Gemini API client).
        news_text: The news text to analyze.

    Returns:
        Dictionary with:
            "facts_extracted": CSV text from Agent 1 (always starts with or contains
                the ``claim,source,confidence`` header; an error row on failure).
            "verification": Agent 2's raw reply, or a fallback ``<verification>``
                block (assessment 0) when the reply is empty/malformed or the call raised.
            "processing_time": elapsed seconds for both calls.
    """
    def _fallback(confidence, issue, reasoning):
        # Same XML layout as the hand-written fallback blocks.
        return (
            "<verification>\n"
            "  <overall_assessment>0</overall_assessment>\n"
            f"  <confidence_score>{confidence}</confidence_score>\n"
            f"  <key_issues>{issue}</key_issues>\n"
            f"  <reasoning>{reasoning}</reasoning>\n"
            "</verification>"
        )

    # perf_counter is monotonic, so the duration is immune to wall-clock adjustments.
    started = time.perf_counter()

    # Step 1: Use Agent 1 (Gemini 1.5 Pro) to extract facts
    facts_prompt = f"""
    Extract verifiable claims from this text as CSV with headers "claim,source,confidence":
    {news_text}
    """

    try:
        response1 = client.models.generate_content(
            model='gemini-1.5-pro',
            contents=facts_prompt
        )
        facts_csv = response1.text
        if not facts_csv:
            # Route an empty reply to the error row with an explicit message.
            raise ValueError("Agent 1 returned no text")

        # Ensure the header is present. When it is missing, the reply is flattened
        # onto a single line under a synthetic header (original behaviour kept).
        if "claim,source,confidence" not in facts_csv:
            facts_csv = "claim,source,confidence\n" + facts_csv.replace('\n', ' ').strip()

    except Exception as e:
        print(f"Error in Agent 1: {e}")
        facts_csv = "claim,source,confidence\nError processing input,agent1,0.0"

    # Step 2: Use Agent 2 (Gemini 2.0 Flash) to verify using the facts.
    # facts_csv is always a str here, so the compaction needs no exception guard.
    facts_formatted = facts_csv.replace("\n", "; ")

    verify_prompt = f"""
    Classify if this news is real (1) or fake (0) based on these facts. Return in XML:
    <verification>
      <overall_assessment>1 for REAL or 0 for FAKE</overall_assessment>
      <confidence_score>0-1</confidence_score>
      <key_issues>Brief issues</key_issues>
      <reasoning>Brief reasoning</reasoning>
    </verification>
    
    News: {news_text}
    Facts: {facts_formatted}
    """

    try:
        response2 = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=verify_prompt
        )
        verification = response2.text

        # Empty/None replies are malformed responses, not API errors.
        if not verification or "<verification>" not in verification:
            verification = _fallback(
                "0.5", "Malformed response", "Could not properly analyze the content"
            )

    except Exception as e:
        print(f"Error in Agent 2: {e}")
        verification = _fallback(
            "0.0", "Error processing input", "Agent 2 encountered an error"
        )

    return {
        "facts_extracted": facts_csv,
        "verification": verification,
        "processing_time": time.perf_counter() - started
    }

def extract_assessment(verification_xml):
    """
    Extract the overall assessment (1 or 0) from the verification XML.

    The value inside ``<overall_assessment>`` may be ``1``/``0`` or ``REAL``/``FAKE``;
    matching ignores letter case and whitespace around the value, so replies such as
    ``<overall_assessment> 1 </overall_assessment>`` or ``real`` are still parsed
    instead of being discarded.

    Args:
        verification_xml: Text expected to contain an ``<overall_assessment>`` element.

    Returns:
        1 for REAL, 0 for FAKE, or None when no assessment can be found or the
        input is not searchable text (the error is printed in that case).
    """
    try:
        # Self-contained pattern: a superset of what the strict module-level
        # ASSESSMENT_PATTERN accepts, so previously parsed replies parse the same.
        found = re.search(
            r'<overall_assessment>\s*([01]|REAL|FAKE)\s*</overall_assessment>',
            verification_xml,
            re.IGNORECASE,
        )
    except Exception as e:
        print(f"Error extracting assessment: {e}")
        return None

    if found is None:
        return None
    return 1 if found.group(1).upper() in ("1", "REAL") else 0

def parse_dataset_line(line):
    """
    Parse a line from the Uni-Fakkedit-55k dataset.

    A line is expected to look like ``...[TEXT] <text> [OBJECTS] <objects> [LABEL] <int>``
    with each marker occurring exactly once in its remaining segment.

    Returns:
        ``(combined_input, label)`` where ``combined_input`` is
        ``"<text>. Objects: <objects>"`` and ``label`` is an int, or None when a
        marker is missing/duplicated or parsing raises (the error is printed).
    """
    try:
        remainder = line
        heads = []
        # Peel the markers off left to right; each must split its segment in two.
        for marker in ("[TEXT]", "[OBJECTS]", "[LABEL]"):
            pieces = remainder.split(marker)
            if len(pieces) != 2:
                return None
            head, remainder = pieces
            heads.append(head)

        # heads = [prefix before [TEXT], text, objects]; remainder holds the label.
        _, raw_text, raw_objects = heads
        combined = f"{raw_text.strip()}. Objects: {raw_objects.strip()}"
        return (combined, int(remainder.strip()))
    except Exception as e:
        print(f"Error parsing line: {e}")
        return None

def process_entry(client, line):
    """
    Run one dataset line through all three approaches and collect the outcomes.

    Returns None when the line cannot be parsed; otherwise a dict holding the input
    text, the true label and, per approach (pro / flash / two_agent), the extracted
    prediction, whether it matched the label, the raw verification text and the
    processing time (plus the extracted facts for the two-agent approach).
    """
    parsed = parse_dataset_line(line)
    if parsed is None:
        return None
    news_input, gold_label = parsed

    def _judge(outcome):
        # An unparseable reply (None) is always scored as incorrect.
        guess = extract_assessment(outcome["verification"])
        return guess, (guess is not None and guess == gold_label)

    # Gemini 1.5 Pro on its own
    pro_outcome = single_model_detection(client, news_input, 'gemini-1.5-pro')
    pro_guess, pro_hit = _judge(pro_outcome)

    # Gemini 2.0 Flash on its own
    flash_outcome = single_model_detection(client, news_input, 'gemini-2.0-flash')
    flash_guess, flash_hit = _judge(flash_outcome)

    # Two-agent pipeline
    duo_outcome = two_agent_detection(client, news_input)
    duo_guess, duo_hit = _judge(duo_outcome)

    record = {'text': news_input, 'true_label': gold_label}
    record.update({
        'pro_prediction': pro_guess,
        'pro_correct': pro_hit,
        'pro_verification': pro_outcome["verification"],
        'pro_time': pro_outcome["processing_time"],
    })
    record.update({
        'flash_prediction': flash_guess,
        'flash_correct': flash_hit,
        'flash_verification': flash_outcome["verification"],
        'flash_time': flash_outcome["processing_time"],
    })
    record.update({
        'two_agent_prediction': duo_guess,
        'two_agent_correct': duo_hit,
        'two_agent_facts': duo_outcome["facts_extracted"],
        'two_agent_verification': duo_outcome["verification"],
        'two_agent_time': duo_outcome["processing_time"],
    })
    return record

def process_batch(client, batch_lines, lock, writer=None):
    """
    Process a batch of dataset lines and return the successfully processed entries.

    Lines for which ``process_entry`` returns None are skipped. When ``writer`` is
    given, each entry is also written with ``writer.writerow`` while holding ``lock``.
    """
    collected = []

    for raw_line in batch_lines:
        entry = process_entry(client, raw_line)
        if entry is None:
            continue
        collected.append(entry)

        if not writer:
            continue
        # The writer is shared between worker threads; serialize access to it.
        with lock:
            writer.writerow(entry)

    return collected

def run_ablation_study(client, dataset_path, output_path, num_samples=500, batch_size=5, max_workers=2):
    """
    Run an ablation study comparing Gemini 1.5 Pro, Gemini 2.0 Flash, and 2-Agent approach.

    Args:
        client: The Gemini API client, forwarded to the detection functions.
        dataset_path: Text file with one dataset entry per line.
        output_path: CSV file that receives one row per processed entry.
        num_samples: Maximum number of lines read from the start of the dataset.
        batch_size: Number of lines handed to each worker task.
        max_workers: Number of threads in the pool.

    Returns:
        Dictionary with the per-entry 'results' list plus accuracy and average
        processing time for each approach, and the absolute/relative accuracy
        improvement of the two-agent approach over the better single model.
    """
    from itertools import islice

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Read at most num_samples lines from the start of the dataset.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        lines = list(islice(f, num_samples))

    # Must list every key process_entry returns: DictWriter raises ValueError on
    # rows containing keys that are not in fieldnames.
    fieldnames = [
        'text', 'true_label',
        'pro_prediction', 'pro_correct', 'pro_verification', 'pro_time',
        'flash_prediction', 'flash_correct', 'flash_verification', 'flash_time',
        'two_agent_prediction', 'two_agent_correct', 'two_agent_facts',
        'two_agent_verification', 'two_agent_time'
    ]

    # Lock guarding the CSV writer, which is shared by all worker threads.
    lock = threading.Lock()
    results = []

    # The with-block closes the CSV even if a worker raises; utf-8 because the
    # text/verification columns hold free-form text.
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(process_batch, client, lines[i:i+batch_size], lock, writer)
                for i in range(0, len(lines), batch_size)
            ]

            # Collect results in completion order.
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing batches"):
                results.extend(future.result())

    # Calculate metrics
    total_processed = len(results)

    def _ratio(numerator):
        # Guard against division by zero when nothing was processed.
        return numerator / total_processed if total_processed > 0 else 0

    pro_correct = sum(1 for r in results if r['pro_correct'])
    pro_accuracy = _ratio(pro_correct)
    pro_avg_time = _ratio(sum(r['pro_time'] for r in results))

    flash_correct = sum(1 for r in results if r['flash_correct'])
    flash_accuracy = _ratio(flash_correct)
    flash_avg_time = _ratio(sum(r['flash_time'] for r in results))

    two_agent_correct = sum(1 for r in results if r['two_agent_correct'])
    two_agent_accuracy = _ratio(two_agent_correct)
    two_agent_avg_time = _ratio(sum(r['two_agent_time'] for r in results))

    # Print results
    print(f"\n=== ABLATION STUDY RESULTS ===")
    print(f"Total samples processed: {total_processed}")

    print(f"\n1. GEMINI 1.5 PRO")
    print(f"   Accuracy: {pro_accuracy:.4f} ({pro_correct}/{total_processed})")
    print(f"   Avg processing time: {pro_avg_time:.2f}s")

    print(f"\n2. GEMINI 2.0 FLASH")
    print(f"   Accuracy: {flash_accuracy:.4f} ({flash_correct}/{total_processed})")
    print(f"   Avg processing time: {flash_avg_time:.2f}s")

    print(f"\n3. TWO-AGENT APPROACH")
    print(f"   Accuracy: {two_agent_accuracy:.4f} ({two_agent_correct}/{total_processed})")
    print(f"   Avg processing time: {two_agent_avg_time:.2f}s")

    # Compare the two-agent approach with the better single model (ties go to Flash).
    best_single_accuracy = max(pro_accuracy, flash_accuracy)
    best_single_model = "Gemini 1.5 Pro" if pro_accuracy > flash_accuracy else "Gemini 2.0 Flash"

    accuracy_improvement = two_agent_accuracy - best_single_accuracy
    relative_improvement = (accuracy_improvement / best_single_accuracy) * 100 if best_single_accuracy > 0 else 0

    print(f"\n=== PERFORMANCE COMPARISON ===")
    print(f"Best single model: {best_single_model} ({best_single_accuracy:.4f})")
    print(f"Two-agent approach: {two_agent_accuracy:.4f}")
    print(f"Absolute improvement: {accuracy_improvement:.4f}")
    print(f"Relative improvement: {relative_improvement:.2f}%")

    print(f"\nResults saved to {output_path}")

    return {
        'results': results,
        'pro_accuracy': pro_accuracy,
        'pro_avg_time': pro_avg_time,
        'flash_accuracy': flash_accuracy,
        'flash_avg_time': flash_avg_time,
        'two_agent_accuracy': two_agent_accuracy,
        'two_agent_avg_time': two_agent_avg_time,
        'accuracy_improvement': accuracy_improvement,
        'relative_improvement': relative_improvement
    }

def main(client, dataset_path, output_path="results/ablation_study_results.csv", num_samples=500):
    """
    Entry point for the ablation study.

    Announces the run, delegates to ``run_ablation_study`` with the given client,
    dataset path, output path and sample count, prints the total wall-clock
    duration, and returns the metrics dictionary produced by the study.
    """
    print(f"Starting ablation study with {num_samples} samples from {dataset_path}")
    began = time.time()

    study_metrics = run_ablation_study(client, dataset_path, output_path, num_samples)

    elapsed = time.time() - began
    print(f"\nAblation study completed in {elapsed:.2f} seconds")

    return study_metrics

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dataset_path = r"C:\Users\CoolA\Code\dataset_2way_output.txt"

# Bind the return value: as the cell's last expression, a bare main(...) call would
# echo the entire metrics dict (including every per-sample result) into the output.
ablation_metrics = main(client, dataset_path)

In [4]:
import pandas as pd
import time
from tqdm import tqdm
import csv
import os
import concurrent.futures
import re
import threading
from functools import lru_cache

# Regular expression to extract assessment value
ASSESSMENT_PATTERN = re.compile(r'<overall_assessment>([01]|REAL|FAKE)</overall_assessment>')

def two_agent_detection(client, news_text):
    """
    Our original 2-agent framework: one model extracts claims, a second classifies.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...)``
            whose return value has a ``text`` attribute (the Gemini API client).
        news_text: The news text to analyze.

    Returns:
        Dictionary with:
            "facts_extracted": CSV text from Agent 1 (contains the
                ``claim,source,confidence`` header; an error row on failure).
            "verification": Agent 2's raw reply, or a fallback ``<verification>``
                block (assessment 0) when the reply is empty/malformed or the call raised.
            "processing_time": elapsed seconds for both calls.
    """
    def _fallback(confidence, issue, reasoning):
        # Same XML layout as the hand-written fallback blocks.
        return (
            "<verification>\n"
            "  <overall_assessment>0</overall_assessment>\n"
            f"  <confidence_score>{confidence}</confidence_score>\n"
            f"  <key_issues>{issue}</key_issues>\n"
            f"  <reasoning>{reasoning}</reasoning>\n"
            "</verification>"
        )

    # perf_counter is monotonic, so the duration is immune to wall-clock adjustments.
    started = time.perf_counter()

    # Step 1: Use Agent 1 (Gemini 1.5 Pro) to extract facts
    facts_prompt = f"""
    You are a fact-checking assistant. Given a news piece, extract a list of factual claims that can be verified.
    For each claim, provide:
    1. The claim
    2. The source of the claim (if mentioned in the text)
    3. A confidence score (0-1) on how verifiable this claim is based on specificity
    
    Format your output as CSV with headers: claim,source,confidence
    Do not include any other text, explanations, or formatting - ONLY the CSV data.
    
    News piece to analyze:
    {news_text}
    """

    try:
        response1 = client.models.generate_content(
            model='gemini-1.5-pro',
            contents=facts_prompt
        )
        facts_csv = response1.text
        if not facts_csv:
            # Route an empty reply to the error row with an explicit message.
            raise ValueError("Agent 1 returned no text")

        # Ensure the header is present. When it is missing, the reply is flattened
        # onto a single line under a synthetic header (original behaviour kept).
        if "claim,source,confidence" not in facts_csv:
            facts_csv = "claim,source,confidence\n" + facts_csv.replace('\n', ' ').strip()

    except Exception as e:
        print(f"Error in Agent 1: {e}")
        facts_csv = "claim,source,confidence\nError processing input,agent1,0.0"

    # Step 2: Use Agent 2 (Gemini 2.0 Flash) to verify using the facts.
    # facts_csv is always a str here, so the compaction needs no exception guard.
    facts_formatted = facts_csv.replace("\n", "; ")

    verify_prompt = f"""
    You are a news verification assistant. Analyze the provided news piece and fact-check results to determine if the news is likely fake or genuine.
    
    News piece:
    {news_text}
    
    Extracted facts and claims:
    {facts_formatted}
    
    Analyze the news piece with the following considerations:
    1. Do the extracted facts align with established knowledge?
    2. Are there logical inconsistencies in the text?
    3. Is the source reliable based on the extracted information?
    4. Does the writing use manipulative or emotional language?
    5. Are there any verifiable false claims?
    
    Provide your analysis in the following XML format, with no additional text before or after:
    <verification>
      <overall_assessment>1 for REAL or 0 for FAKE</overall_assessment>
      <confidence_score>0-1 score</confidence_score>
      <key_issues>List up to 3 key issues or red flags if any</key_issues>
      <reasoning>Brief explanation of your reasoning</reasoning>
    </verification>
    """

    try:
        response2 = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=verify_prompt
        )
        verification = response2.text

        # Empty/None replies are malformed responses, not API errors.
        if not verification or "<verification>" not in verification:
            verification = _fallback(
                "0.5", "Malformed response", "Could not properly analyze the content"
            )

    except Exception as e:
        print(f"Error in Agent 2: {e}")
        verification = _fallback(
            "0.0", "Error processing input", "Agent 2 encountered an error"
        )

    return {
        "facts_extracted": facts_csv,
        "verification": verification,
        "processing_time": time.perf_counter() - started
    }

def extract_assessment(verification_xml):
    """
    Extract the overall assessment (1 or 0) from the verification XML.

    The value inside ``<overall_assessment>`` may be ``1``/``0`` or ``REAL``/``FAKE``;
    matching ignores letter case and whitespace around the value, so replies such as
    ``<overall_assessment> 0 </overall_assessment>`` or ``Fake`` are still parsed
    instead of being discarded.

    Args:
        verification_xml: Text expected to contain an ``<overall_assessment>`` element.

    Returns:
        1 for REAL, 0 for FAKE, or None when no assessment can be found or the
        input is not searchable text (the error is printed in that case).
    """
    try:
        # Self-contained pattern: a superset of what the strict module-level
        # ASSESSMENT_PATTERN accepts, so previously parsed replies parse the same.
        hit = re.search(
            r'<overall_assessment>\s*([01]|REAL|FAKE)\s*</overall_assessment>',
            verification_xml,
            re.IGNORECASE,
        )
    except Exception as e:
        print(f"Error extracting assessment: {e}")
        return None

    if hit is None:
        return None
    return 1 if hit.group(1).upper() in ("1", "REAL") else 0

def parse_dataset_line(line):
    """
    Parse a line from the Uni-Fakkedit-55k dataset.

    Expected shape: ``...[TEXT] <text> [OBJECTS] <objects> [LABEL] <int>``.

    Returns:
        ``("<text>. Objects: <objects>", label_int)``, or None when any marker does
        not split its segment into exactly two parts or when parsing raises
        (the error is printed).
    """
    def _cut(segment, marker):
        # Two-way split on the marker; None unless it occurs exactly once.
        halves = segment.split(marker)
        return halves if len(halves) == 2 else None

    try:
        around_text = _cut(line, "[TEXT]")
        if around_text is None:
            return None

        around_objects = _cut(around_text[1], "[OBJECTS]")
        if around_objects is None:
            return None

        around_label = _cut(around_objects[1], "[LABEL]")
        if around_label is None:
            return None

        headline = around_objects[0].strip()
        detected = around_label[0].strip()
        label_token = around_label[1].strip()

        return (f"{headline}. Objects: {detected}", int(label_token))
    except Exception as e:
        print(f"Error parsing line: {e}")
        return None

def process_entry(client, line):
    """
    Run one dataset line through the two-agent approach only.

    Returns None when the line cannot be parsed; otherwise a dict with exactly the
    CSV columns: the true label, the extracted prediction, whether it matched the
    label (an unparseable reply counts as incorrect) and the processing time.
    """
    parsed = parse_dataset_line(line)
    if parsed is None:
        return None
    news_input, gold_label = parsed

    outcome = two_agent_detection(client, news_input)
    guess = extract_assessment(outcome["verification"])

    return {
        'true_label': gold_label,
        'two_agent_prediction': guess,
        'two_agent_correct': guess is not None and guess == gold_label,
        'two_agent_time': outcome["processing_time"],
    }

def process_batch(client, batch_lines, lock, writer=None):
    """
    Process a batch of dataset lines and return the entries that were produced.

    Entries for which ``process_entry`` returns None are dropped. If a ``writer``
    is supplied, every kept entry is written through it under ``lock``.
    """
    kept = []

    for raw_line in batch_lines:
        entry = process_entry(client, raw_line)
        if entry is None:
            continue
        kept.append(entry)

        if not writer:
            continue
        # The writer is shared between worker threads; serialize access to it.
        with lock:
            writer.writerow(entry)

    return kept

def run_ablation_study(client, dataset_path, output_path, num_samples=500, batch_size=5, max_workers=2):
    """
    Run a study using the 2-Agent approach only.

    Args:
        client: The Gemini API client, forwarded to the detection function.
        dataset_path: Text file with one dataset entry per line.
        output_path: CSV file that receives one row per processed entry.
        num_samples: Maximum number of lines read from the start of the dataset.
        batch_size: Number of lines handed to each worker task.
        max_workers: Number of threads in the pool.

    Returns:
        Dictionary with the per-entry 'results' list, 'two_agent_accuracy' and
        'two_agent_avg_time'.
    """
    from itertools import islice

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Read at most num_samples lines from the start of the dataset.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        lines = list(islice(f, num_samples))

    fieldnames = [
        'true_label', 'two_agent_prediction', 'two_agent_correct', 'two_agent_time'
    ]

    # Lock guarding the CSV writer, which is shared by all worker threads.
    lock = threading.Lock()
    results = []

    # The with-block closes the CSV even if a worker raises part-way through.
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(process_batch, client, lines[i:i+batch_size], lock, writer)
                for i in range(0, len(lines), batch_size)
            ]

            # Collect results in completion order.
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing batches"):
                results.extend(future.result())

    # Calculate metrics (guarding against an empty result set).
    total_processed = len(results)

    two_agent_correct = sum(1 for r in results if r['two_agent_correct'])
    two_agent_accuracy = two_agent_correct / total_processed if total_processed > 0 else 0
    two_agent_avg_time = sum(r['two_agent_time'] for r in results) / total_processed if total_processed > 0 else 0

    # Print results
    print(f"\n=== STUDY RESULTS ===")
    print(f"Total samples processed: {total_processed}")

    print(f"\nTWO-AGENT APPROACH")
    print(f"   Accuracy: {two_agent_accuracy:.4f} ({two_agent_correct}/{total_processed})")
    print(f"   Avg processing time: {two_agent_avg_time:.2f}s")

    print(f"\nResults saved to {output_path}")

    return {
        'results': results,
        'two_agent_accuracy': two_agent_accuracy,
        'two_agent_avg_time': two_agent_avg_time
    }

# The client is initialized separately, so we can directly use it
# NOTE(review): absolute, machine-specific dataset path — adjust before running elsewhere.
dataset_path = r"C:\Users\CoolA\Code\dataset_2way_output.txt"
# NOTE(review): the file is named "3_piece" although this cell runs the two-agent
# study — confirm the intended output name.
output_path="results/3_piece_study_results.csv"
# Upper bound on the number of dataset lines read by run_ablation_study.
num_samples=500

print(f"Starting study with {num_samples} samples from {dataset_path}")
# Wall-clock timestamp used to report the total duration of the run.
start_time = time.time()

# Run the study
metrics = run_ablation_study(client, dataset_path, output_path, num_samples)

end_time = time.time()
total_time = end_time - start_time

print(f"\nStudy completed in {total_time:.2f} seconds")

Starting study with 500 samples from C:\Users\CoolA\Code\dataset_2way_output.txt


Processing batches: 100%|██████████| 100/100 [26:05<00:00, 15.66s/it]


=== STUDY RESULTS ===
Total samples processed: 500

TWO-AGENT APPROACH
   Accuracy: 0.7180 (359/500)
   Avg processing time: 6.23s

Results saved to results/3_piece_study_results.csv

Study completed in 1565.97 seconds





In [2]:
import pandas as pd
import time
from tqdm import tqdm
import csv
import os
import concurrent.futures
import re
import threading
from functools import lru_cache

# Regular expression to extract assessment value
ASSESSMENT_PATTERN = re.compile(r'<(overall_assessment|final_assessment)>([01]|REAL|FAKE)</(overall_assessment|final_assessment)>')

def three_agent_detection(client, news_text):
    """
    Three-agent framework for fake news detection.

    The stages run in order: FactBase (gemini-1.5-pro) extracts evidence as JSON
    text, Verifier (gemini-2.0-flash) classifies using the news text plus that
    evidence, and Meta-Verifier (gemini-1.5-pro) issues the final verdict from the
    news text, the evidence and the Verifier's reply.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...)``
            whose return value has a ``text`` attribute (the Gemini API client).
        news_text: The news text to analyze.

    Returns:
        Dictionary with:
            "factbase_output": FactBase's raw reply, or a stub JSON document with an
                error red flag when the call raised or returned no text.
            "verification": Verifier's raw reply, or a fallback ``<verification>``
                block (assessment 0) when empty/malformed or the call raised.
            "meta_verification": Meta-Verifier's raw reply, or a fallback
                ``<final_verification>`` block (assessment 0) likewise.
            "processing_time": elapsed seconds for all three calls.
    """
    def _fallback(root, confidence, issue, reasoning):
        # Same XML layout as the hand-written fallback blocks, for either root tag.
        return (
            f"<{root}>\n"
            "  <overall_assessment>0</overall_assessment>\n"
            f"  <confidence_score>{confidence}</confidence_score>\n"
            f"  <key_issues>{issue}</key_issues>\n"
            f"  <reasoning>{reasoning}</reasoning>\n"
            f"</{root}>"
        )

    # perf_counter is monotonic, so the duration is immune to wall-clock adjustments.
    started = time.perf_counter()

    # Step 1: FactBase (Evidence Collector) using Gemini 1.5 Pro
    factbase_prompt = f"""
    You are FactBase, an expert fact extraction system. Your task is to systematically analyze news content and extract all relevant factual claims, entities, and contextual elements.

    Given a news piece, extract:

    1. CORE CLAIMS: The central factual assertions (who, what, when, where, how)
    2. ENTITIES: People, organizations, locations mentioned
    3. SOURCES: Attribution of information (direct quotes, referenced studies, etc.)
    4. STATISTICS: Any numerical data or statistics presented
    5. TEMPORAL ELEMENTS: Dates, timeframes, sequence of events
    6. LOGICAL STRUCTURE: How claims relate to one another

    For each element, assign:
    - Specificity score (0-1): How precise and verifiable the element is
    - Internal consistency score (0-1): How well it aligns with other elements in the text

    Format your output as structured JSON with these exact fields:
    {{
      "core_claims": [{{"claim": "text", "specificity": 0.0, "consistency": 0.0}}],
      "entities": [{{"entity": "name", "type": "person/org/location", "context": "brief context"}}],
      "sources": [{{"source": "name", "attribution": "how information is attributed", "verifiability": 0.0}}],
      "statistics": [{{"claim": "statistical claim", "specificity": 0.0}}],
      "temporal_framework": {{"timeframe": "overall timeframe", "consistency": 0.0}},
      "red_flags": ["list any immediate inconsistencies, exaggerations, or vague claims"]
    }}

    News to analyze:
    {news_text}
    """

    try:
        response1 = client.models.generate_content(
            model='gemini-1.5-pro',
            contents=factbase_prompt
        )
        factbase_output = response1.text
        if not factbase_output:
            # Without this check a None reply would be interpolated as the literal
            # "None" into both downstream prompts.
            raise ValueError("FactBase returned no text")
    except Exception as e:
        print(f"Error in FactBase: {e}")
        factbase_output = """{"core_claims": [], "entities": [], "sources": [], "statistics": [], "temporal_framework": {"timeframe": "unknown", "consistency": 0.0}, "red_flags": ["Error processing input"]}"""

    # Step 2: Verifier (Analysis Engine) using Gemini 2.0 Flash
    verify_prompt = f"""
    You are Verifier, an expert news verification system. Your task is to determine whether a news piece is genuine or fabricated, using systematic analysis of evidence and application of media literacy principles.

    First, review all information:

    1. Original news text:
    {news_text}

    2. Evidence extracted by FactBase:
    {factbase_output}

    Now, perform a comprehensive verification using these specific criteria:

    1. CLAIM SPECIFICITY: Genuine news typically contains specific, verifiable claims
       Score: 0-1 (higher = more specific claims)
       
    2. SOURCE CREDIBILITY: Genuine news cites specific, checkable sources
       Score: 0-1 (higher = more credible sourcing)
       
    3. INTERNAL CONSISTENCY: Facts should align logically; contradictions suggest fabrication
       Score: 0-1 (higher = more internally consistent)
       
    4. LANGUAGE PATTERNS: Analyze for sensationalism, emotional manipulation, or narrative framing
       Score: 0-1 (higher = more neutral language)
       
    5. CONTEXTUAL COMPLETENESS: Real news provides necessary context; fake news often omits key details
       Score: 0-1 (higher = more complete context)

    Calculate a weighted verification score using this formula:
    Final Score = (Claim_Specificity*0.25 + Source_Credibility*0.25 + Internal_Consistency*0.2 + Language_Patterns*0.15 + Contextual_Completeness*0.15)

    Respond using exactly this format:
    <verification>
      <overall_assessment>{{0 for FAKE, 1 for REAL - use 0.5 threshold on final score}}</overall_assessment>
      <confidence_score>{{final_score}}</confidence_score>
      <key_issues>{{3 most significant red flags or strongest verification points}}</key_issues>
      <reasoning>{{concise explanation focusing on the strongest evidence for your conclusion}}</reasoning>
    </verification>
    """

    try:
        response2 = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=verify_prompt
        )
        verification = response2.text

        # Empty/None replies are malformed responses, not API errors.
        if not verification or "<verification>" not in verification:
            verification = _fallback(
                "verification", "0.5", "Malformed response",
                "Could not properly analyze the content"
            )

    except Exception as e:
        print(f"Error in Verifier: {e}")
        verification = _fallback(
            "verification", "0.0", "Error processing input",
            "Verifier encountered an error"
        )

    # Step 3: Meta-Verifier (Ensemble Approach) using Gemini 1.5 Pro for deeper analysis
    meta_prompt = f"""
    You are Meta-Verifier, a specialized system designed to make final determinations about news authenticity. You've been given multiple assessments and must synthesize them into a final verdict.

    Review all information:

    1. Original news text:
    {news_text}

    2. FactBase evidence:
    {factbase_output}

    3. Primary verification:
    {verification}

    Now perform a higher-order analysis:

    1. LINGUISTIC RED FLAGS (detect for manipulative language, unusual patterns, or propaganda techniques)
       - Emotionally charged words
       - Black/white thinking
       - Unusual formatting or structure
       
    2. FACTUAL DENSITY ANALYSIS (evaluate the ratio of verifiable facts to opinion/commentary)

    3. SOURCE ANALYSIS (evaluate how information is attributed and sourced)

    4. COHERENCE ASSESSMENT (examine logical flow and contextual alignment)

    5. PLAUSIBILITY CHECK (evaluate if claims align with fundamental knowledge)

    Assign a final verification confidence, potentially overriding previous assessments if strong evidence exists.

    Respond using exactly this format with no additional text:
    <final_verification>
      <overall_assessment>{{0 for FAKE, 1 for REAL}}</overall_assessment>
      <confidence_score>{{0-1}}</confidence_score>
      <key_issues>{{1-3 most critical indicators}}</key_issues>
      <reasoning>{{concise explanation of final determination}}</reasoning>
    </final_verification>
    """

    try:
        response3 = client.models.generate_content(
            model='gemini-1.5-pro',  # Using 1.5 Pro for meta-verification for deeper analysis
            contents=meta_prompt
        )
        meta_verification = response3.text

        # Empty/None replies are malformed responses, not API errors.
        if not meta_verification or "<final_verification>" not in meta_verification:
            meta_verification = _fallback(
                "final_verification", "0.5", "Malformed response",
                "Could not properly perform meta-analysis"
            )

    except Exception as e:
        print(f"Error in Meta-Verifier: {e}")
        meta_verification = _fallback(
            "final_verification", "0.0", "Error processing input",
            "Meta-Verifier encountered an error"
        )

    return {
        "factbase_output": factbase_output,
        "verification": verification,
        "meta_verification": meta_verification,
        "processing_time": time.perf_counter() - started
    }

def extract_assessment(verification_xml):
    """
    Extract the overall assessment (1 or 0) from the verification XML.
    Works with both normal verification and meta-verification formats.

    Args:
        verification_xml: Response text expected to contain an
            <overall_assessment> (or <final_assessment>) element.

    Returns:
        1 for a REAL verdict, 0 for a FAKE verdict, or None when no verdict
        can be found or an exception occurs while extracting it (the
        exception is printed, not raised).
    """
    try:
        # Use regex pattern for faster extraction
        match = ASSESSMENT_PATTERN.search(verification_xml)
        if match:
            # The verdict was read from a hard-coded group 2, which raises
            # IndexError when ASSESSMENT_PATTERN declares a single capture
            # group. Clamp the index to the number of groups the pattern
            # actually has; patterns with two or more groups behave as before.
            assessment_str = match.group(min(2, match.re.groups))
            if assessment_str == "1" or assessment_str.upper() == "REAL":
                return 1
            elif assessment_str == "0" or assessment_str.upper() == "FAKE":
                return 0

        # Fallback to simple string check
        if "<overall_assessment>1</overall_assessment>" in verification_xml or "<final_assessment>1</final_assessment>" in verification_xml:
            return 1
        elif "<overall_assessment>0</overall_assessment>" in verification_xml or "<final_assessment>0</final_assessment>" in verification_xml:
            return 0

        return None
    except Exception as e:
        print(f"Error extracting assessment: {e}")
        return None

def parse_dataset_line(line):
    """
    Convert one tagged dataset line into a ``(model_input, label)`` tuple.

    The line is consumed left to right through the markers ``[TEXT]``,
    ``[OBJECTS]`` and ``[LABEL]``. Each marker must split the remaining
    text into exactly two pieces; otherwise the line is rejected.

    Returns:
        ``("<text>. Objects: <objects>", <int label>)`` on success, or None
        for a malformed line. Any exception (for example a non-integer
        label) is printed and also yields None.
    """
    try:
        remainder = line
        leading_fields = []
        for marker in ("[TEXT]", "[OBJECTS]", "[LABEL]"):
            pieces = remainder.split(marker)
            if len(pieces) != 2:
                return None
            before_marker, remainder = pieces
            leading_fields.append(before_marker.strip())

        # Whatever precedes [TEXT] is ignored; the other two fields are kept.
        _, text_part, objects_part = leading_fields
        label_part = remainder.strip()

        # Combine text and objects - keep it short
        return (f"{text_part}. Objects: {objects_part}", int(label_part))
    except Exception as e:
        print(f"Error parsing line: {e}")
        return None

def process_entry(client, line):
    """
    Process a single dataset entry with the three-agent approach.

    Args:
        client: API client, forwarded unchanged to three_agent_detection.
        line: One raw dataset line; parsed with parse_dataset_line.

    Returns:
        None when the line cannot be parsed. Otherwise a dict with the keys
        'true_label', 'three_agent_prediction', 'three_agent_correct',
        'three_agent_time' and 'three_agent_confidence' (the CSV columns).
    """
    parsed_data = parse_dataset_line(line)
    if parsed_data is None:
        return None

    combined_input, true_label = parsed_data

    # Test with three-agent approach
    three_agent_result = three_agent_detection(client, combined_input)
    meta_verification = three_agent_result["meta_verification"]
    three_agent_prediction = extract_assessment(meta_verification)
    # A missing prediction is always counted as incorrect.
    three_agent_correct = (three_agent_prediction == true_label) if three_agent_prediction is not None else False

    # Extract confidence score
    match = re.search(r'<confidence_score>([\d\.]+)</confidence_score>', meta_verification)
    try:
        confidence = float(match.group(1)) if match else 0.0
    except ValueError:
        # The character class also matches non-numbers such as "." or
        # "1.2.3"; treat those like a missing score instead of letting the
        # exception abort the whole batch.
        confidence = 0.0

    # Return only the fields we need for the CSV
    return {
        'true_label': true_label,
        'three_agent_prediction': three_agent_prediction,
        'three_agent_correct': three_agent_correct,
        'three_agent_time': three_agent_result["processing_time"],
        'three_agent_confidence': confidence
    }

def process_batch(client, batch_lines, lock, writer=None):
    """
    Run process_entry over every line of a batch, in order.

    Lines for which process_entry returns None are dropped. When a truthy
    ``writer`` is supplied, each kept result is written with
    ``writer.writerow`` while holding ``lock``, immediately after it is
    produced.

    Returns:
        List of the non-None results, in input order.
    """
    kept = []

    for raw_line in batch_lines:
        entry = process_entry(client, raw_line)
        if entry is None:
            continue

        kept.append(entry)

        # Persist right away so rows reach the CSV as the batch progresses.
        if not writer:
            continue
        with lock:
            writer.writerow(entry)

    return kept

def run_study(client, dataset_path, output_path, num_samples=500, batch_size=5, max_workers=2):
    """
    Run a study using the 3-Agent approach.

    Reads up to ``num_samples`` lines from ``dataset_path``, processes them
    in batches of ``batch_size`` on a thread pool with ``max_workers``
    threads, streams one CSV row per processed entry to ``output_path``,
    then prints and returns aggregate metrics.

    Args:
        client: API client forwarded to process_batch.
        dataset_path: Path of the UTF-8 text dataset, one entry per line.
        output_path: Destination CSV path; its parent directory is created
            when the path has one.
        num_samples: Maximum number of dataset lines to read.
        batch_size: Number of lines handed to each process_batch call.
        max_workers: Thread pool size.

    Returns:
        Dict with 'results' (list of per-entry dicts), 'three_agent_accuracy',
        'three_agent_avg_time' and 'three_agent_avg_confidence'. The three
        aggregates are 0 when nothing was processed.
    """
    # Local import: keeps this function self-contained within its cell.
    from itertools import islice

    # Ensure output directory exists. dirname() is '' for a bare file name,
    # and os.makedirs('') raises, so only create it when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Read the dataset lines (at most num_samples, fewer if the file is shorter)
    with open(dataset_path, 'r', encoding='utf-8') as f:
        lines = list(islice(f, max(num_samples, 0)))

    fieldnames = [
        'true_label', 'three_agent_prediction', 'three_agent_correct',
        'three_agent_time', 'three_agent_confidence'
    ]

    # The writer is shared by all worker threads; the lock serializes rows.
    lock = threading.Lock()
    results = []

    # The context manager closes the CSV even if a worker raises.
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Process batches with ThreadPoolExecutor for parallel processing
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(process_batch, client, lines[i:i + batch_size], lock, writer)
                for i in range(0, len(lines), batch_size)
            ]

            # Collect results as they complete (completion order, not file order)
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing batches"):
                results.extend(future.result())

    # Calculate metrics
    total_processed = len(results)

    three_agent_correct = sum(1 for r in results if r['three_agent_correct'])
    three_agent_accuracy = three_agent_correct / total_processed if total_processed > 0 else 0
    three_agent_avg_time = sum(r['three_agent_time'] for r in results) / total_processed if total_processed > 0 else 0
    three_agent_avg_confidence = sum(r['three_agent_confidence'] for r in results) / total_processed if total_processed > 0 else 0

    # Print results
    print("\n=== STUDY RESULTS ===")
    print(f"Total samples processed: {total_processed}")

    print("\nTHREE-AGENT APPROACH")
    print(f"   Accuracy: {three_agent_accuracy:.4f} ({three_agent_correct}/{total_processed})")
    print(f"   Avg processing time: {three_agent_avg_time:.2f}s")
    print(f"   Avg confidence: {three_agent_avg_confidence:.4f}")

    print(f"\nResults saved to {output_path}")

    return {
        'results': results,
        'three_agent_accuracy': three_agent_accuracy,
        'three_agent_avg_time': three_agent_avg_time,
        'three_agent_avg_confidence': three_agent_avg_confidence
    }

# The main execution code
# Runs the three-agent study once and reports the total wall-clock time.
# run_study is called with its default batch_size and max_workers.
if __name__ == "__main__":
    # The client is initialized separately, so we can directly use it
    # NOTE(review): dataset_path is a hard-coded absolute Windows path, so this
    # cell only runs on a machine that has the file at that location.
    dataset_path = r"C:\Users\CoolA\Code\dataset_2way_output.txt"
    output_path = "results/three_agent_study_results.csv"
    num_samples = 500

    print(f"Starting study with {num_samples} samples from {dataset_path}")
    start_time = time.time()

    # Run the study
    # `metrics` holds the dict returned by run_study (per-entry results plus aggregates).
    metrics = run_study(client, dataset_path, output_path, num_samples)

    end_time = time.time()
    total_time = end_time - start_time

    print(f"\nStudy completed in {total_time:.2f} seconds")

Starting study with 500 samples from C:\Users\CoolA\Code\dataset_2way_output.txt


Processing batches:  56%|█████▌    | 56/100 [33:51<33:10, 45.24s/it] 

Error in Meta-Verifier: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Processing batches:  86%|████████▌ | 86/100 [52:43<11:40, 50.02s/it]

Error in Meta-Verifier: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error in Verifier: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error in FactBase: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Processing batches: 100%|██████████| 100/100 [1:01:40<00:00, 37.01s/it]


=== STUDY RESULTS ===
Total samples processed: 500

THREE-AGENT APPROACH
   Accuracy: 0.5740 (287/500)
   Avg processing time: 14.78s
   Avg confidence: 0.3575

Results saved to results/three_agent_study_results.csv

Study completed in 3700.51 seconds





In [12]:
import pandas as pd

# Load the CSV file
# Summarizes a previously written results CSV. The columns read below are
# pro_correct/pro_time, flash_correct/flash_time and two_agent_correct/two_agent_time.
# NOTE(review): hard-coded absolute Windows path; presumably written by an
# earlier ablation cell -- confirm which run produced it.
file_path = r"C:\Users\CoolA\Code\results\final_results.csv"
df = pd.read_csv(file_path)

# Calculate total processed samples
total_processed = len(df)

# Compute accuracy and average processing time
# Accuracy is the sum of each *_correct column divided by the row count.
pro_correct = df["pro_correct"].sum()
pro_accuracy = pro_correct / total_processed
pro_avg_time = df["pro_time"].mean()

flash_correct = df["flash_correct"].sum()
flash_accuracy = flash_correct / total_processed
flash_avg_time = df["flash_time"].mean()

two_agent_correct = df["two_agent_correct"].sum()
two_agent_accuracy = two_agent_correct / total_processed
two_agent_avg_time = df["two_agent_time"].mean()

# Calculate performance improvement
# The two-agent result is compared against whichever single model scored higher.
best_single_accuracy = max(pro_accuracy, flash_accuracy)
# On an exact tie the label falls through to "Gemini 2.0 Flash" (strict > comparison).
best_single_model = "Gemini 1.5 Pro" if pro_accuracy > flash_accuracy else "Gemini 2.0 Flash"

accuracy_improvement = two_agent_accuracy - best_single_accuracy
# Relative improvement is a percentage of the best single-model accuracy.
relative_improvement = (accuracy_improvement / best_single_accuracy) * 100 if best_single_accuracy > 0 else 0

# Print results
print(f"\n=== ABLATION STUDY RESULTS ===")
print(f"Total samples processed: {total_processed}")

print(f"\n1. GEMINI 1.5 PRO")
print(f"   Accuracy: {pro_accuracy:.4f} ({pro_correct}/{total_processed})")
print(f"   Avg processing time: {pro_avg_time:.2f}s")

print(f"\n2. GEMINI 2.0 FLASH")
print(f"   Accuracy: {flash_accuracy:.4f} ({flash_correct}/{total_processed})")
print(f"   Avg processing time: {flash_avg_time:.2f}s")

print(f"\n3. TWO-AGENT APPROACH")
print(f"   Accuracy: {two_agent_accuracy:.4f} ({two_agent_correct}/{total_processed})")
print(f"   Avg processing time: {two_agent_avg_time:.2f}s")

print(f"\n=== PERFORMANCE COMPARISON ===")
print(f"Best single model: {best_single_model} ({best_single_accuracy:.4f})")
print(f"Two-agent approach: {two_agent_accuracy:.4f}")
print(f"Absolute improvement: {accuracy_improvement:.4f}")
print(f"Relative improvement: {relative_improvement:.2f}%")


=== ABLATION STUDY RESULTS ===
Total samples processed: 487

1. GEMINI 1.5 PRO
   Accuracy: 0.6879 (335/487)
   Avg processing time: 4.78s

2. GEMINI 2.0 FLASH
   Accuracy: 0.7269 (354/487)
   Avg processing time: 2.96s

3. TWO-AGENT APPROACH
   Accuracy: 0.6694 (326/487)
   Avg processing time: 6.74s

=== PERFORMANCE COMPARISON ===
Best single model: Gemini 2.0 Flash (0.7269)
Two-agent approach: 0.6694
Absolute improvement: -0.0575
Relative improvement: -7.91%


In [1]:
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizerFast, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm

# Class name mapping
# Maps integer class ids to display names. Only ids 0 and 1 are active; the
# entries for ids 2-5 are commented out, so lookups such as
# CLASS_NAMES.get(i, f'Class {i}') fall back to the generic name for them.
CLASS_NAMES = {
    0: "True",
    1: "False Content",
    #2: "Misleading Content",
    #3: "Manipulated Content",
    #4: "False Content",
    #5: "Imposter Content"
}

# Function to analyze class distribution
def analyze_class_distribution(labels):
    """
    Chart and print how many samples fall into each class.

    The classes shown are every id named in CLASS_NAMES plus any other id
    that occurs in ``labels``, so the pie chart and the printed summary
    always cover the same classes (no hard-coded class count).

    Args:
        labels: Iterable of integer class ids.

    Returns:
        collections.Counter mapping class id -> number of samples.

    Side effects:
        Saves the pie chart to 'class_distribution.png' and prints the
        per-class counts.
    """
    # Count instances of each class
    class_counts = Counter(labels)

    # Named classes first-class citizens even with zero samples; unnamed ids
    # only appear when they are actually present in the data.
    class_ids = sorted(set(CLASS_NAMES) | set(class_counts))

    # Create pie chart
    plt.figure(figsize=(10, 8))
    plt.pie([class_counts.get(i, 0) for i in class_ids],
            labels=[f"{CLASS_NAMES.get(i, f'Class {i}')}\n({class_counts.get(i, 0)} samples)" for i in class_ids],
            autopct='%1.1f%%')
    plt.title('Distribution of Classes in Dataset')
    plt.axis('equal')
    plt.savefig('class_distribution.png')
    plt.close()

    # Print class distribution
    print("\nClass Distribution:")
    for i in class_ids:
        print(f"Class {i} ({CLASS_NAMES.get(i, f'Class {i}')}): {class_counts.get(i, 0)} samples")

    return class_counts

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) and make cuDNN deterministic."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Compute metrics function
def compute_metrics(predictions, labels):
    """
    Compute accuracy plus macro, weighted and per-class precision and F1.

    Args:
        predictions: Predicted class ids.
        labels: Ground-truth class ids, aligned with ``predictions``.

    Returns:
        Dict with scalar 'accuracy', 'precision_macro', 'precision_weighted',
        'f1_macro', 'f1_weighted', and the lists 'precision_per_class' and
        'f1_per_class'.
    """
    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)

    # zero_division=0 is passed to every precision AND F1 call so that a
    # class that is never predicted scores 0 consistently, without the F1
    # calls emitting undefined-metric warnings the precision calls suppress.

    # Calculate macro-averaged metrics (treats all classes equally)
    precision_macro = precision_score(labels, predictions, average='macro', zero_division=0)
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)

    # Calculate weighted-averaged metrics (accounts for class imbalance)
    precision_weighted = precision_score(labels, predictions, average='weighted', zero_division=0)
    f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)

    # Calculate per-class metrics (average=None returns one value per class)
    precision_per_class = precision_score(labels, predictions, average=None, zero_division=0)
    f1_per_class = f1_score(labels, predictions, average=None, zero_division=0)

    return {
        "accuracy": accuracy,
        "precision_macro": precision_macro,
        "precision_weighted": precision_weighted,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "precision_per_class": precision_per_class.tolist(),
        "f1_per_class": f1_per_class.tolist()
    }

# Set seed for reproducibility
set_seed(42)

# Path to the dataset
# NOTE(review): hard-coded absolute Windows path; the cell only runs where this file exists.
data_path = r"C:\Users\CoolA\Code\extracted_500.txt"

# Load the dataset
# Builds two parallel lists: `texts` (model inputs) and `labels` (integer class ids).
texts = []
labels = []

with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        # Parse the line
        # Each marker must split its remaining text into exactly two pieces;
        # lines where a marker is missing or repeated are skipped silently.
        parts = line.split("[TEXT]")
        if len(parts) != 2:
            continue
        after_text = parts[1]

        text_objects_parts = after_text.split("[OBJECTS]")
        if len(text_objects_parts) != 2:
            continue
        text_part = text_objects_parts[0].strip()
        after_objects = text_objects_parts[1]

        objects_label_parts = after_objects.split("[LABEL]")
        if len(objects_label_parts) != 2:
            continue
        objects_part = objects_label_parts[0].strip()
        label_part = objects_label_parts[1].strip()

        # Not guarded: a non-integer label raises ValueError and stops the cell.
        label_int = int(label_part)

        # Convert the objects_part (comma-separated) into a Python list
        objects_list = [obj.strip() for obj in objects_part.split(",") if obj.strip()]

        # Build a short descriptive sentence for the objects
        if len(objects_list) > 0:
            object_sentence = f"The image contains: {', '.join(objects_list)}."
        else:
            object_sentence = "No objects detected."

        # Combine the main text with the object sentence
        combined_input = f"{text_part}. {object_sentence}"

        texts.append(combined_input)
        labels.append(label_int)

# Split into train and test (70/30)
# stratify=labels keeps the class proportions the same in both splits;
# random_state fixes the shuffle so the split is repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.3, random_state=42, stratify=labels
)

# Analyze class distribution
# Runs on the full label list (train + test), not on a single split.
print("Analyzing class distribution...")
analyze_class_distribution(labels)

# Load a BERT tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Tokenize the datasets
# Every sequence is truncated/padded to exactly max_length tokens, so each
# split tokenizes into fixed-width tensors.
max_length = 128

# Tokenize training data
train_encodings = tokenizer(
    train_texts, 
    truncation=True, 
    padding="max_length", 
    max_length=max_length,
    return_tensors="pt"
)

# Tokenize test data
test_encodings = tokenizer(
    test_texts, 
    truncation=True, 
    padding="max_length", 
    max_length=max_length,
    return_tensors="pt"
)

# Convert labels to tensors
# Rebinds train_labels/test_labels from Python lists to tensors.
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
# Field order (input_ids, attention_mask, labels) is the order in which the
# training and evaluation loops unpack each batch.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels
)

test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_labels
)

# Create DataLoaders 
# Only the training loader shuffles; the test loader keeps dataset order.
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Load model
# NOTE(review): num_labels=6 builds a six-way classification head, while
# CLASS_NAMES names only classes 0 and 1 and the recorded class distribution
# lists only those two -- confirm whether 6 is intended for this dataset.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)

# Setup GPU/CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# If using multiple GPUs
# When wrapped, the underlying model is reached through model.module (see the
# checkpoint save/load code).
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

model.to(device)

# Training parameters
num_epochs = 5
learning_rate = 2e-5
weight_decay = 0.01

# Optimizer with weight decay
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Learning rate scheduler
# T_max is the total number of optimizer steps because the scheduler is
# stepped once per batch in the training loop, not once per epoch.
total_steps = len(train_loader) * num_epochs
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

# Training loop
# best_f1 tracks the highest macro F1 seen so far; the matching weights are
# written to best_model_path.
best_f1 = 0.0
best_model_path = "./best_model_custom.pt"

# One pass per epoch: train on train_loader, evaluate on test_loader, and
# checkpoint whenever the macro F1 improves.
# NOTE(review): the checkpoint is selected using the same test_loader that the
# final evaluation reports on -- confirm whether a separate validation split
# is wanted.
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
    for batch in progress_bar:
        # Get batch data
        # Note: this rebinds the name `labels`, which earlier in this cell
        # held the full list of dataset labels, to the current batch tensor.
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Update progress bar
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
        
        # Backward pass
        loss.backward()
        
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        # Update parameters
        # The scheduler is advanced after every optimizer step (per batch).
        optimizer.step()
        scheduler.step()
        
        # Accumulate loss
        train_loss += loss.item()
    
    # Calculate average loss for the epoch
    # Mean of the per-batch losses (each batch weighted equally).
    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Average Training Loss: {avg_train_loss:.4f}")
    
    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []
    eval_loss = 0.0
    
    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Eval]")
        for batch in progress_bar:
            # Get batch data
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            # Get predictions
            # Predicted class = index of the largest logit per row.
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            
            # Store predictions and labels
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
            
            # Accumulate loss
            eval_loss += loss.item()
            
            # Update progress bar
            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
    
    # Calculate average evaluation loss
    avg_eval_loss = eval_loss / len(test_loader)
    
    # Compute metrics
    metrics = compute_metrics(all_preds, all_labels)
    
    # Print evaluation results
    print(f"Epoch {epoch+1} - Evaluation Results:")
    print(f"  Loss: {avg_eval_loss:.4f}")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Macro F1: {metrics['f1_macro']:.4f}")
    print(f"  Weighted F1: {metrics['f1_weighted']:.4f}")
    
    # Save the best model
    # Strict > means an epoch that only ties the best macro F1 is not saved.
    if metrics['f1_macro'] > best_f1:
        best_f1 = metrics['f1_macro']
        # Save the unwrapped model's weights so the checkpoint loads with or
        # without DataParallel.
        if isinstance(model, nn.DataParallel):
            torch.save(model.module.state_dict(), best_model_path)
        else:
            torch.save(model.state_dict(), best_model_path)
        print(f"  New best model saved with F1 Macro: {best_f1:.4f}")

# Load the best model for final evaluation
# If no checkpoint file exists, the model keeps the weights from the last epoch.
# NOTE(review): torch.load is called without map_location, so the checkpoint
# is restored to the device it was saved from -- confirm if loading elsewhere.
if os.path.exists(best_model_path):
    if isinstance(model, nn.DataParallel):
        model.module.load_state_dict(torch.load(best_model_path))
    else:
        model.load_state_dict(torch.load(best_model_path))
    print(f"Loaded best model from {best_model_path}")

# Final evaluation
# Re-runs the restored model over test_loader and collects predictions.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Evaluation"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        # No labels are passed here, so only logits are needed from the output.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Compute and print final metrics
final_metrics = compute_metrics(all_preds, all_labels)

print("\nFinal Test Set Metrics:")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"Macro-averaged Precision: {final_metrics['precision_macro']:.4f}")
print(f"Weighted-averaged Precision: {final_metrics['precision_weighted']:.4f}")
print(f"Macro-averaged F1: {final_metrics['f1_macro']:.4f}")
print(f"Weighted-averaged F1: {final_metrics['f1_weighted']:.4f}")

# Per-class values are printed by list position; positions with an entry in
# CLASS_NAMES also show the display name.
print("\nPer-class Precision:")
for i, p in enumerate(final_metrics['precision_per_class']):
    if i in CLASS_NAMES:
        print(f"Class {i} ({CLASS_NAMES[i]}): {p:.4f}")
    else:
        print(f"Class {i}: {p:.4f}")

print("\nPer-class F1:")
for i, f in enumerate(final_metrics['f1_per_class']):
    if i in CLASS_NAMES:
        print(f"Class {i} ({CLASS_NAMES[i]}): {f:.4f}")
    else:
        print(f"Class {i}: {f:.4f}")

Analyzing class distribution...

Class Distribution:
Class 0 (True): 219 samples
Class 1 (False Content): 281 samples


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


Epoch 1/5 [Train]: 100%|██████████| 44/44 [00:02<00:00, 18.51it/s, loss=0.8050]


Epoch 1 - Average Training Loss: 0.9985


Epoch 1/5 [Eval]: 100%|██████████| 19/19 [00:00<00:00, 66.84it/s, loss=0.7881]


Epoch 1 - Evaluation Results:
  Loss: 0.7569
  Accuracy: 0.5600
  Macro F1: 0.3590
  Weighted F1: 0.4021
  New best model saved with F1 Macro: 0.3590


Epoch 2/5 [Train]: 100%|██████████| 44/44 [00:02<00:00, 20.10it/s, loss=0.6426]


Epoch 2 - Average Training Loss: 0.7290


Epoch 2/5 [Eval]: 100%|██████████| 19/19 [00:00<00:00, 64.99it/s, loss=0.7687]


Epoch 2 - Evaluation Results:
  Loss: 0.6783
  Accuracy: 0.5867
  Macro F1: 0.4224
  Weighted F1: 0.4593
  New best model saved with F1 Macro: 0.4224


Epoch 3/5 [Train]: 100%|██████████| 44/44 [00:02<00:00, 20.20it/s, loss=0.5902]


Epoch 3 - Average Training Loss: 0.6067


Epoch 3/5 [Eval]: 100%|██████████| 19/19 [00:00<00:00, 65.27it/s, loss=0.5183]


Epoch 3 - Evaluation Results:
  Loss: 0.5760
  Accuracy: 0.7600
  Macro F1: 0.7600
  Weighted F1: 0.7600
  New best model saved with F1 Macro: 0.7600


Epoch 4/5 [Train]: 100%|██████████| 44/44 [00:02<00:00, 20.09it/s, loss=0.5220]


Epoch 4 - Average Training Loss: 0.4538


Epoch 4/5 [Eval]: 100%|██████████| 19/19 [00:00<00:00, 63.51it/s, loss=0.6905]


Epoch 4 - Evaluation Results:
  Loss: 0.5393
  Accuracy: 0.7400
  Macro F1: 0.7232
  Weighted F1: 0.7314


Epoch 5/5 [Train]: 100%|██████████| 44/44 [00:02<00:00, 20.03it/s, loss=0.2661]


Epoch 5 - Average Training Loss: 0.3522


Epoch 5/5 [Eval]: 100%|██████████| 19/19 [00:00<00:00, 64.66it/s, loss=0.7823]


Epoch 5 - Evaluation Results:
  Loss: 0.5577
  Accuracy: 0.7533
  Macro F1: 0.7312
  Weighted F1: 0.7405
Loaded best model from ./best_model_custom.pt


Final Evaluation: 100%|██████████| 19/19 [00:00<00:00, 69.01it/s]


Final Test Set Metrics:
Accuracy: 0.7600
Macro-averaged Precision: 0.7711
Weighted-averaged Precision: 0.7822
Macro-averaged F1: 0.7600
Weighted-averaged F1: 0.7600

Per-class Precision:
Class 0 (True): 0.6786
Class 1 (False Content): 0.8636

Per-class F1:
Class 0 (True): 0.7600
Class 1 (False Content): 0.7600





In [2]:
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm

# Class name mapping - updated for binary classification
# Keys are the integer class ids used as labels; values are the display names
# shown in the pie-chart labels and the printed class distribution.
CLASS_NAMES = {
    0: "True",
    1: "False Content"
}

# Function to analyze class distribution
def analyze_class_distribution(labels):
    """
    Chart and print the number of samples in each of the two classes.

    Saves a pie chart to 'class_distribution.png', prints one summary line
    per class, and returns the Counter of all observed labels.
    """
    class_counts = Counter(labels)

    # Gather everything needed for both the chart and the printout in one pass.
    wedge_sizes = []
    wedge_labels = []
    summary_lines = []
    for class_id in range(2):
        display_name = CLASS_NAMES.get(class_id, f'Class {class_id}')
        n_samples = class_counts.get(class_id, 0)
        wedge_sizes.append(n_samples)
        wedge_labels.append(f"{display_name}\n({n_samples} samples)")
        summary_lines.append(f"Class {class_id} ({display_name}): {n_samples} samples")

    # Pie chart, written to disk and closed without being displayed
    plt.figure(figsize=(10, 8))
    plt.pie(wedge_sizes, labels=wedge_labels, autopct='%1.1f%%')
    plt.title('Distribution of Classes in Dataset')
    plt.axis('equal')
    plt.savefig('class_distribution.png')
    plt.close()

    # Text summary
    print("\nClass Distribution:")
    for summary in summary_lines:
        print(summary)

    return class_counts

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Apply one seed to Python's, NumPy's and PyTorch's RNGs and pin cuDNN to deterministic mode."""
    # Same seed for every generator, in a fixed order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Deterministic kernels on, benchmark-driven kernel selection off.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

# Compute metrics function
def compute_metrics(predictions, labels):
    """
    Compute accuracy plus macro, weighted and per-class precision and F1.

    Args:
        predictions: Predicted class ids.
        labels: Ground-truth class ids, aligned with ``predictions``.

    Returns:
        Dict with scalar 'accuracy', 'precision_macro', 'precision_weighted',
        'f1_macro', 'f1_weighted', and the lists 'precision_per_class' and
        'f1_per_class'.
    """
    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)

    # Every precision and F1 call shares zero_division=0, so a class that is
    # never predicted scores 0 everywhere and the F1 calls no longer emit
    # undefined-metric warnings that the precision calls already suppress.
    shared = {"zero_division": 0}

    # Calculate macro-averaged metrics (treats all classes equally)
    precision_macro = precision_score(labels, predictions, average='macro', **shared)
    f1_macro = f1_score(labels, predictions, average='macro', **shared)

    # Calculate weighted-averaged metrics (accounts for class imbalance)
    precision_weighted = precision_score(labels, predictions, average='weighted', **shared)
    f1_weighted = f1_score(labels, predictions, average='weighted', **shared)

    # Calculate per-class metrics (average=None returns one value per class)
    precision_per_class = precision_score(labels, predictions, average=None, **shared)
    f1_per_class = f1_score(labels, predictions, average=None, **shared)

    return {
        "accuracy": accuracy,
        "precision_macro": precision_macro,
        "precision_weighted": precision_weighted,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "precision_per_class": precision_per_class.tolist(),
        "f1_per_class": f1_per_class.tolist()
    }

# Main function with model type parameter
def small_model_trainer(model_type="tinybert", data_path=r"C:\Users\CoolA\Code\extracted_500.txt"):
    """Fine-tune TinyBERT or DistilBERT as a binary classifier and report test metrics.

    The file at ``data_path`` is parsed into "text + object sentence" inputs
    with binary labels, split 70/30 (stratified, fixed seed), padded/truncated
    to 128 tokens and trained for 5 epochs with AdamW and a OneCycleLR
    schedule. After every epoch the model is scored on the test split, and the
    weights with the best macro F1 are written to
    ``./best_{model_type}_model.pt``; those weights are reloaded for the final
    report.

    NOTE(review): the test split is used both to pick the best epoch and to
    produce the final numbers, so the reported metrics are optimistically
    biased. A separate validation split would be needed to remove that.

    Args:
        model_type: ``"tinybert"`` or ``"distilbert"`` (case-insensitive).
        data_path: Path of the dataset text file. The default is the original
            author's local file, so pass this explicitly on other machines.

    Returns:
        The dict produced by ``compute_metrics`` for the final test evaluation.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Set seed for reproducibility
    set_seed(42)

    # Load the dataset
    texts, labels = _load_text_object_dataset(data_path)

    # Split into train and test (70/30)
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.3, random_state=42, stratify=labels
    )

    # Analyze class distribution
    print("Analyzing class distribution...")
    analyze_class_distribution(labels)

    # Load model and tokenizer based on model_type
    print(f"\nInitializing tokenizer and model for {model_type}...")
    model_key = model_type.lower()

    if model_key == "tinybert":
        model_name = "huawei-noah/TinyBERT_General_4L_312D"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,  # 2-way classification
            problem_type="single_label_classification"
        )
    elif model_key == "distilbert":
        model_name = "distilbert-base-uncased"
        tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2  # 2-way classification
        )
    else:
        raise ValueError(f"Unknown model type: {model_type}. Choose 'tinybert' or 'distilbert'.")

    print(f"Using {model_name}")

    # Tokenize the datasets
    max_length = 128

    def encode(batch_texts):
        # Every sample is padded/truncated to exactly max_length tokens.
        return tokenizer(
            batch_texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

    print("Tokenizing training data...")
    train_encodings = encode(train_texts)
    print("Tokenizing test data...")
    test_encodings = encode(test_texts)

    # Create PyTorch datasets: (input_ids, attention_mask, label) triples
    train_dataset = TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(train_labels)
    )
    test_dataset = TensorDataset(
        test_encodings['input_ids'],
        test_encodings['attention_mask'],
        torch.tensor(test_labels)
    )

    # Create DataLoaders
    batch_size = 16
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Setup GPU/CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # If using multiple GPUs
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = nn.DataParallel(model)

    model.to(device)
    # Checkpoints always hold the bare model's weights, never the wrapper's.
    core_model = model.module if isinstance(model, nn.DataParallel) else model

    # Training parameters
    num_epochs = 5
    learning_rate = 5e-5
    weight_decay = 0.01

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # OneCycleLR is stepped once per batch, hence steps_per_epoch below.
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs
    )

    # Start below any reachable score so the first epoch always writes a
    # checkpoint; a stale file from an earlier run can then never be the one
    # that is reloaded for the final evaluation.
    best_f1 = float("-inf")
    best_model_path = f"./best_{model_type}_model.pt"

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0.0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for batch in progress_bar:
            input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            # Under DataParallel the loss comes back as one value per GPU;
            # mean() reduces it to a scalar and is a no-op otherwise.
            loss = outputs.loss.mean()

            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

            loss.backward()

            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average Training Loss: {avg_train_loss:.4f}")

        # Evaluation
        epoch_preds, epoch_labels, avg_eval_loss = _evaluate_classifier(
            model, test_loader, device, f"Epoch {epoch+1}/{num_epochs} [Eval]"
        )
        metrics = compute_metrics(epoch_preds, epoch_labels)

        print(f"Epoch {epoch+1} - Evaluation Results:")
        print(f"  Loss: {avg_eval_loss:.4f}")
        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Macro F1: {metrics['f1_macro']:.4f}")
        print(f"  Weighted F1: {metrics['f1_weighted']:.4f}")

        # Save the best model
        if metrics['f1_macro'] > best_f1:
            best_f1 = metrics['f1_macro']
            torch.save(core_model.state_dict(), best_model_path)
            print(f"  New best model saved with F1 Macro: {best_f1:.4f}")

    # Load the best model for final evaluation
    if os.path.exists(best_model_path):
        # map_location keeps the load working when the current device differs
        # from the one the checkpoint was written on.
        core_model.load_state_dict(torch.load(best_model_path, map_location=device))
        print(f"Loaded best model from {best_model_path}")

    # Final evaluation (no loss needed, only predictions)
    final_preds, final_labels, _ = _evaluate_classifier(
        model, test_loader, device, "Final Evaluation", track_loss=False
    )
    final_metrics = compute_metrics(final_preds, final_labels)
    _print_final_metrics(final_metrics)

    # Return metrics for comparison
    return final_metrics


def _load_text_object_dataset(data_path):
    """Parse ``...[TEXT] text [OBJECTS] a, b [LABEL] n`` lines into (texts, binary labels)."""
    texts, labels = [], []

    with open(data_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue

            # Each marker must occur exactly once; other lines are skipped.
            parts = line.split("[TEXT]")
            if len(parts) != 2:
                continue
            text_and_rest = parts[1].split("[OBJECTS]")
            if len(text_and_rest) != 2:
                continue
            objects_and_label = text_and_rest[1].split("[LABEL]")
            if len(objects_and_label) != 2:
                continue

            text_part = text_and_rest[0].strip()
            objects_part = objects_and_label[0].strip()
            label_int = int(objects_and_label[1].strip())

            # Convert multi-class to binary: label 0 stays 0, anything else becomes 1.
            binary_label = 0 if label_int == 0 else 1

            objects_list = [obj.strip() for obj in objects_part.split(",") if obj.strip()]
            if objects_list:
                object_sentence = f"The image contains: {', '.join(objects_list)}."
            else:
                object_sentence = "No objects detected."

            texts.append(f"{text_part}. {object_sentence}")
            labels.append(binary_label)

    return texts, labels


def _evaluate_classifier(model, loader, device, desc, track_loss=True):
    """Run ``model`` over ``loader`` without gradients; return (preds, labels, avg_loss or None)."""
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        progress_bar = tqdm(loader, desc=desc)
        for batch in progress_bar:
            input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]

            if track_loss:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
                # mean() collapses the per-GPU losses DataParallel returns.
                loss = outputs.loss.mean()
                total_loss += loss.item()
                progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
            else:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch_labels.cpu().numpy())

    avg_loss = total_loss / len(loader) if track_loss else None
    return all_preds, all_labels, avg_loss


def _print_final_metrics(final_metrics):
    """Print the aggregate and per-class sections of the final test report."""
    print("\nFinal Test Set Metrics:")
    print(f"Accuracy: {final_metrics['accuracy']:.4f}")
    print(f"Macro-averaged Precision: {final_metrics['precision_macro']:.4f}")
    print(f"Weighted-averaged Precision: {final_metrics['precision_weighted']:.4f}")
    print(f"Macro-averaged F1: {final_metrics['f1_macro']:.4f}")
    print(f"Weighted-averaged F1: {final_metrics['f1_weighted']:.4f}")

    sections = (
        ("\nPer-class Precision:", 'precision_per_class'),
        ("\nPer-class F1:", 'f1_per_class'),
    )
    for heading, key in sections:
        print(heading)
        for i, value in enumerate(final_metrics[key]):
            if i in CLASS_NAMES:
                print(f"Class {i} ({CLASS_NAMES[i]}): {value:.4f}")
            else:
                print(f"Class {i}: {value:.4f}")

In [3]:
# Train and evaluate TinyBERT; the separator lines frame this run in the cell output.
print("\n" + "=" * 50, "Running TinyBERT training:", sep="\n")
small_model_trainer("tinybert")
print("\n" + "=" * 50)


Running TinyBERT training:
Analyzing class distribution...

Class Distribution:
Class 0 (True): 219 samples
Class 1 (False Content): 281 samples

Initializing tokenizer and model for tinybert...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using huawei-noah/TinyBERT_General_4L_312D
Tokenizing training data...
Tokenizing test data...
Using device: cuda


Epoch 1/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 77.36it/s, loss=0.7448]


Epoch 1 - Average Training Loss: 0.6928


Epoch 1/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 150.80it/s, loss=0.6969]


Epoch 1 - Evaluation Results:
  Loss: 0.6873
  Accuracy: 0.5600
  Macro F1: 0.3590
  Weighted F1: 0.4021
  New best model saved with F1 Macro: 0.3590


Epoch 2/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 73.82it/s, loss=0.7032]


Epoch 2 - Average Training Loss: 0.6890


Epoch 2/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 207.57it/s, loss=0.6874]


Epoch 2 - Evaluation Results:
  Loss: 0.6883
  Accuracy: 0.5600
  Macro F1: 0.3590
  Weighted F1: 0.4021


Epoch 3/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 77.50it/s, loss=0.6546]


Epoch 3 - Average Training Loss: 0.6770


Epoch 3/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 195.49it/s, loss=0.6580]


Epoch 3 - Evaluation Results:
  Loss: 0.6721
  Accuracy: 0.6200
  Macro F1: 0.6171
  Weighted F1: 0.6211
  New best model saved with F1 Macro: 0.6171


Epoch 4/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 78.78it/s, loss=0.6204]


Epoch 4 - Average Training Loss: 0.6269


Epoch 4/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 170.26it/s, loss=0.6248]


Epoch 4 - Evaluation Results:
  Loss: 0.6494
  Accuracy: 0.6533
  Macro F1: 0.6523
  Weighted F1: 0.6501
  New best model saved with F1 Macro: 0.6523


Epoch 5/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 79.59it/s, loss=0.6868]


Epoch 5 - Average Training Loss: 0.6037


Epoch 5/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 203.72it/s, loss=0.6115]


Epoch 5 - Evaluation Results:
  Loss: 0.6421
  Accuracy: 0.6333
  Macro F1: 0.6296
  Weighted F1: 0.6341
Loaded best model from ./best_tinybert_model.pt


Final Evaluation: 100%|██████████| 10/10 [00:00<00:00, 171.90it/s]


Final Test Set Metrics:
Accuracy: 0.6533
Macro-averaged Precision: 0.6760
Weighted-averaged Precision: 0.6880
Macro-averaged F1: 0.6523
Weighted-averaged F1: 0.6501

Per-class Precision:
Class 0 (True): 0.5761
Class 1 (False Content): 0.7759

Per-class F1:
Class 0 (True): 0.6709
Class 1 (False Content): 0.6338






In [4]:
# Train and evaluate DistilBERT; the separator lines frame this run in the cell output.
print("\n" + "=" * 50, "Running DistilBERT training:", sep="\n")
small_model_trainer("distilbert")
print("\n" + "=" * 50)


Running DistilBERT training:
Analyzing class distribution...

Class Distribution:
Class 0 (True): 219 samples
Class 1 (False Content): 281 samples

Initializing tokenizer and model for distilbert...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using distilbert-base-uncased
Tokenizing training data...
Tokenizing test data...
Using device: cuda


Epoch 1/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 22.72it/s, loss=0.6783]


Epoch 1 - Average Training Loss: 0.6888


Epoch 1/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 70.08it/s, loss=0.6970]


Epoch 1 - Evaluation Results:
  Loss: 0.6764
  Accuracy: 0.5733
  Macro F1: 0.4037
  Weighted F1: 0.4419
  New best model saved with F1 Macro: 0.4037


Epoch 2/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 24.04it/s, loss=0.7400]


Epoch 2 - Average Training Loss: 0.6410


Epoch 2/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 76.54it/s, loss=0.7613]


Epoch 2 - Evaluation Results:
  Loss: 0.6762
  Accuracy: 0.6133
  Macro F1: 0.4890
  Weighted F1: 0.5192
  New best model saved with F1 Macro: 0.4890


Epoch 3/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 24.67it/s, loss=0.4641]


Epoch 3 - Average Training Loss: 0.4185


Epoch 3/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 77.49it/s, loss=0.5062]


Epoch 3 - Evaluation Results:
  Loss: 0.5878
  Accuracy: 0.7200
  Macro F1: 0.6962
  Weighted F1: 0.7064
  New best model saved with F1 Macro: 0.6962


Epoch 4/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 24.03it/s, loss=0.0782]


Epoch 4 - Average Training Loss: 0.1689


Epoch 4/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 76.96it/s, loss=0.4713]


Epoch 4 - Evaluation Results:
  Loss: 0.7272
  Accuracy: 0.6867
  Macro F1: 0.6614
  Weighted F1: 0.6725


Epoch 5/5 [Train]: 100%|██████████| 22/22 [00:00<00:00, 24.66it/s, loss=0.0639]


Epoch 5 - Average Training Loss: 0.0957


Epoch 5/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 75.49it/s, loss=0.1399]


Epoch 5 - Evaluation Results:
  Loss: 0.6906
  Accuracy: 0.7200
  Macro F1: 0.7182
  Weighted F1: 0.7209
  New best model saved with F1 Macro: 0.7182
Loaded best model from ./best_distilbert_model.pt


Final Evaluation: 100%|██████████| 10/10 [00:00<00:00, 82.86it/s]


Final Test Set Metrics:
Accuracy: 0.7200
Macro-averaged Precision: 0.7179
Weighted-averaged Precision: 0.7241
Macro-averaged F1: 0.7182
Weighted-averaged F1: 0.7209

Per-class Precision:
Class 0 (True): 0.6667
Class 1 (False Content): 0.7692

Per-class F1:
Class 0 (True): 0.6957
Class 1 (False Content): 0.7407






In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm

# Class name mapping - updated for binary classification.
# Keys are the binary label ids used in this cell: small_model_trainer maps an
# original dataset label of 0 to 0 and every other label to 1. The values are
# the display names used in the pie chart and the printed reports.
CLASS_NAMES = {
    0: "True",
    1: "False Content"
}

# Function to analyze class distribution
def analyze_class_distribution(labels):
    """Chart and print how many samples fall into each of the two classes.

    Saves a pie chart to ``class_distribution.png`` in the working directory,
    prints one summary line per class and returns the raw counts.

    Args:
        labels: Iterable of hashable class ids. Only ids 0 and 1 are charted
            and printed; a missing id is shown with a count of 0.

    Returns:
        ``collections.Counter`` mapping every label value seen to its count.
    """
    class_counts = Counter(labels)

    # Resolve counts and display names once; both the chart and the printout use them.
    class_ids = range(2)
    sizes = [class_counts.get(class_id, 0) for class_id in class_ids]
    names = [CLASS_NAMES.get(class_id, f'Class {class_id}') for class_id in class_ids]

    # Pie chart, written to disk and closed without being displayed.
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.pie(
        sizes,
        labels=[f"{name}\n({size} samples)" for name, size in zip(names, sizes)],
        autopct='%1.1f%%',
    )
    ax.set_title('Distribution of Classes in Dataset')
    ax.axis('equal')
    fig.savefig('class_distribution.png')
    plt.close(fig)

    print("\nClass Distribution:")
    for class_id, name, size in zip(class_ids, names, sizes):
        print(f"Class {class_id} ({name}): {size} samples")

    return class_counts

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed every random number generator used by the training code.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus all CUDA devices), then sets the cuDNN backend flags
    to deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed handed to each generator (default 42).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: turn off the benchmark auto-tuner and request
    # deterministic algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Compute metrics function
def compute_metrics(predictions, labels):
    """Compute accuracy, precision and F1 summaries for a set of predictions.

    Precision and F1 are each reported three ways: macro-averaged (every
    class counts equally), weighted (classes weighted by their support) and
    per class.

    ``zero_division=0`` is passed to both precision and F1, so a class that
    is never predicted scores 0 for both metrics in the same way instead of
    F1 falling back to scikit-learn's warning default.

    Args:
        predictions: Sequence of predicted class ids.
        labels: Sequence of ground-truth class ids, same length as
            ``predictions``.

    Returns:
        dict with keys ``accuracy``, ``precision_macro``,
        ``precision_weighted``, ``f1_macro``, ``f1_weighted`` plus the lists
        ``precision_per_class`` and ``f1_per_class``.
    """
    def _score(metric, average):
        # Same undefined-metric policy for every precision/F1 variant.
        return metric(labels, predictions, average=average, zero_division=0)

    # average=None returns one score per class as a NumPy array.
    precision_per_class = _score(precision_score, None)
    f1_per_class = _score(f1_score, None)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision_macro": _score(precision_score, 'macro'),
        "precision_weighted": _score(precision_score, 'weighted'),
        "f1_macro": _score(f1_score, 'macro'),
        "f1_weighted": _score(f1_score, 'weighted'),
        "precision_per_class": precision_per_class.tolist(),
        "f1_per_class": f1_per_class.tolist()
    }

# Main function with model type parameter
def small_model_trainer(model_type="tinybert",
                        data_path=r"C:\Users\CoolA\Code\extracted_500.txt",
                        checkpoint_path=None):
    """Fine-tune a transformer as a binary classifier and report test metrics.

    The file at ``data_path`` is parsed into "text + object sentence" inputs
    with binary labels, split 70/30 (stratified, fixed seed), tokenized and
    trained with AdamW and a OneCycleLR schedule. Sequence length, batch size,
    epoch count and learning rate depend on ``model_type``. The model is
    scored on the test split after every epoch and once more at the end.

    Checkpointing is opt-in. With the default ``checkpoint_path=None`` nothing
    is written to disk and the final report uses the last-epoch weights. The
    previous version read an undefined ``best_model_path`` at that point,
    which raised ``NameError`` on a fresh kernel.

    NOTE(review): when checkpointing is enabled the test split both selects
    the best epoch and produces the final numbers, which biases the reported
    metrics upward. A separate validation split would be needed to avoid that.

    Args:
        model_type: ``"tinybert"``, ``"distilbert"``, ``"roberta"`` or
            ``"deberta"`` (case-insensitive).
        data_path: Path of the dataset text file. The default is the original
            author's local file, so pass this explicitly on other machines.
        checkpoint_path: Optional file path. When given, the weights with the
            best macro F1 are saved there during training and reloaded before
            the final evaluation.

    Returns:
        The dict produced by ``compute_metrics`` for the final test evaluation.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Set seed for reproducibility
    set_seed(42)

    # Load the dataset
    texts, labels = _load_text_object_dataset(data_path)

    # Split into train and test (70/30)
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.3, random_state=42, stratify=labels
    )

    # Analyze class distribution
    print("Analyzing class distribution...")
    analyze_class_distribution(labels)

    # Load model and tokenizer based on model_type
    print(f"\nInitializing tokenizer and model for {model_type}...")
    model_key = model_type.lower()

    if model_key == "tinybert":
        model_name = "huawei-noah/TinyBERT_General_4L_312D"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,  # 2-way classification
            problem_type="single_label_classification"
        )
    elif model_key == "distilbert":
        model_name = "distilbert-base-uncased"
        tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2  # 2-way classification
        )
    elif model_key == "roberta":
        model_name = "FacebookAI/xlm-roberta-base"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,  # 2-way classification
            problem_type="single_label_classification"
        )
    elif model_key == "deberta":
        model_name = "microsoft/deberta-v3-large"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,  # 2-way classification
            problem_type="single_label_classification"
        )
    else:
        raise ValueError(f"Unknown model type: {model_type}. Choose 'tinybert', 'distilbert', 'roberta', or 'deberta'.")

    print(f"Using {model_name}")

    # Per-model settings: the larger models get longer sequences, smaller
    # batches, fewer epochs and a lower peak learning rate.
    if model_key == "deberta":
        max_length, batch_size, num_epochs, learning_rate = 256, 8, 3, 2e-5
    elif model_key == "roberta":
        max_length, batch_size, num_epochs, learning_rate = 256, 12, 4, 3e-5
    else:
        max_length, batch_size, num_epochs, learning_rate = 128, 16, 5, 5e-5
    weight_decay = 0.01

    def encode(batch_texts):
        # Every sample is padded/truncated to exactly max_length tokens.
        return tokenizer(
            batch_texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

    print(f"Tokenizing training data with max_length={max_length}...")
    train_encodings = encode(train_texts)
    print("Tokenizing test data...")
    test_encodings = encode(test_texts)

    # Create PyTorch datasets: (input_ids, attention_mask, label) triples
    train_dataset = TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(train_labels)
    )
    test_dataset = TensorDataset(
        test_encodings['input_ids'],
        test_encodings['attention_mask'],
        torch.tensor(test_labels)
    )

    print(f"Using batch size of {batch_size} for {model_type}")
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Setup GPU/CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # If using multiple GPUs
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = nn.DataParallel(model)

    model.to(device)
    # Checkpoints always hold the bare model's weights, never the wrapper's.
    core_model = model.module if isinstance(model, nn.DataParallel) else model

    print(f"Training for {num_epochs} epochs with learning rate {learning_rate}")

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # OneCycleLR is stepped once per batch, hence steps_per_epoch below.
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs
    )

    # Start below any reachable score so that, when checkpointing is on, the
    # first epoch always writes the file and a stale one is never reloaded.
    best_f1 = float("-inf")

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0.0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for batch in progress_bar:
            input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            # Under DataParallel the loss comes back as one value per GPU;
            # mean() reduces it to a scalar and is a no-op otherwise.
            loss = outputs.loss.mean()

            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

            loss.backward()

            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average Training Loss: {avg_train_loss:.4f}")

        # Evaluation
        epoch_preds, epoch_labels, avg_eval_loss = _evaluate_classifier(
            model, test_loader, device, f"Epoch {epoch+1}/{num_epochs} [Eval]"
        )
        metrics = compute_metrics(epoch_preds, epoch_labels)

        print(f"Epoch {epoch+1} - Evaluation Results:")
        print(f"  Loss: {avg_eval_loss:.4f}")
        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Macro F1: {metrics['f1_macro']:.4f}")
        print(f"  Weighted F1: {metrics['f1_weighted']:.4f}")

        # Save the best model (only when the caller asked for checkpointing)
        if checkpoint_path is not None and metrics['f1_macro'] > best_f1:
            best_f1 = metrics['f1_macro']
            torch.save(core_model.state_dict(), checkpoint_path)
            print(f"  New best model saved with F1 Macro: {best_f1:.4f}")

    # Load the best model for final evaluation
    if checkpoint_path is not None and os.path.exists(checkpoint_path):
        # map_location keeps the load working when the current device differs
        # from the one the checkpoint was written on.
        core_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        print(f"Loaded best model from {checkpoint_path}")

    # Final evaluation (no loss needed, only predictions)
    final_preds, final_labels, _ = _evaluate_classifier(
        model, test_loader, device, "Final Evaluation", track_loss=False
    )
    final_metrics = compute_metrics(final_preds, final_labels)
    _print_final_metrics(final_metrics)

    # Return metrics for comparison
    return final_metrics


def _load_text_object_dataset(data_path):
    """Parse ``...[TEXT] text [OBJECTS] a, b [LABEL] n`` lines into (texts, binary labels)."""
    texts, labels = [], []

    with open(data_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue

            # Each marker must occur exactly once; other lines are skipped.
            parts = line.split("[TEXT]")
            if len(parts) != 2:
                continue
            text_and_rest = parts[1].split("[OBJECTS]")
            if len(text_and_rest) != 2:
                continue
            objects_and_label = text_and_rest[1].split("[LABEL]")
            if len(objects_and_label) != 2:
                continue

            text_part = text_and_rest[0].strip()
            objects_part = objects_and_label[0].strip()
            label_int = int(objects_and_label[1].strip())

            # Convert multi-class to binary: label 0 stays 0, anything else becomes 1.
            binary_label = 0 if label_int == 0 else 1

            objects_list = [obj.strip() for obj in objects_part.split(",") if obj.strip()]
            if objects_list:
                object_sentence = f"The image contains: {', '.join(objects_list)}."
            else:
                object_sentence = "No objects detected."

            texts.append(f"{text_part}. {object_sentence}")
            labels.append(binary_label)

    return texts, labels


def _evaluate_classifier(model, loader, device, desc, track_loss=True):
    """Run ``model`` over ``loader`` without gradients; return (preds, labels, avg_loss or None)."""
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        progress_bar = tqdm(loader, desc=desc)
        for batch in progress_bar:
            input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]

            if track_loss:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
                # mean() collapses the per-GPU losses DataParallel returns.
                loss = outputs.loss.mean()
                total_loss += loss.item()
                progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
            else:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch_labels.cpu().numpy())

    avg_loss = total_loss / len(loader) if track_loss else None
    return all_preds, all_labels, avg_loss


def _print_final_metrics(final_metrics):
    """Print the aggregate and per-class sections of the final test report."""
    print("\nFinal Test Set Metrics:")
    print(f"Accuracy: {final_metrics['accuracy']:.4f}")
    print(f"Macro-averaged Precision: {final_metrics['precision_macro']:.4f}")
    print(f"Weighted-averaged Precision: {final_metrics['precision_weighted']:.4f}")
    print(f"Macro-averaged F1: {final_metrics['f1_macro']:.4f}")
    print(f"Weighted-averaged F1: {final_metrics['f1_weighted']:.4f}")

    sections = (
        ("\nPer-class Precision:", 'precision_per_class'),
        ("\nPer-class F1:", 'f1_per_class'),
    )
    for heading, key in sections:
        print(heading)
        for i, value in enumerate(final_metrics[key]):
            if i in CLASS_NAMES:
                print(f"Class {i} ({CLASS_NAMES[i]}): {value:.4f}")
            else:
                print(f"Class {i}: {value:.4f}")

In [2]:
# Run the DeBERTa experiment, framed by separator rules in the cell output
print("\n" + "=" * 50, "Running DeBERTa training:", sep="\n")
small_model_trainer("deberta")
print("\n" + "=" * 50)


Running DeBERTa training:
Analyzing class distribution...

Class Distribution:
Class 0 (True): 219 samples
Class 1 (False Content): 281 samples

Initializing tokenizer and model for deberta...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using microsoft/deberta-v3-large
Tokenizing training data with max_length=256...
Tokenizing test data...
Using batch size of 8 for deberta
Using device: cuda
Training for 3 epochs with learning rate 2e-05


Epoch 1/3 [Train]: 100%|██████████| 44/44 [00:13<00:00,  3.19it/s, loss=0.6687]


Epoch 1 - Average Training Loss: 0.7007


Epoch 1/3 [Eval]: 100%|██████████| 19/19 [00:01<00:00, 12.17it/s, loss=0.6913]


Epoch 1 - Evaluation Results:
  Loss: 0.6812
  Accuracy: 0.5600
  Macro F1: 0.3590
  Weighted F1: 0.4021


Epoch 2/3 [Train]: 100%|██████████| 44/44 [00:13<00:00,  3.28it/s, loss=0.5027]


Epoch 2 - Average Training Loss: 0.6510


Epoch 2/3 [Eval]: 100%|██████████| 19/19 [00:01<00:00, 12.20it/s, loss=0.7040]


Epoch 2 - Evaluation Results:
  Loss: 0.6112
  Accuracy: 0.6000
  Macro F1: 0.4802
  Weighted F1: 0.5102


Epoch 3/3 [Train]: 100%|██████████| 44/44 [00:13<00:00,  3.34it/s, loss=0.4947]


Epoch 3 - Average Training Loss: 0.4351


Epoch 3/3 [Eval]: 100%|██████████| 19/19 [00:01<00:00, 12.47it/s, loss=0.6623]


Epoch 3 - Evaluation Results:
  Loss: 0.5421
  Accuracy: 0.7467
  Macro F1: 0.7228
  Weighted F1: 0.7326


Final Evaluation: 100%|██████████| 19/19 [00:01<00:00, 12.62it/s]


Final Test Set Metrics:
Accuracy: 0.7467
Macro-averaged Precision: 0.7795
Weighted-averaged Precision: 0.7711
Macro-averaged F1: 0.7228
Weighted-averaged F1: 0.7326

Per-class Precision:
Class 0 (True): 0.8500
Class 1 (False Content): 0.7091

Per-class F1:
Class 0 (True): 0.6415
Class 1 (False Content): 0.8041






In [2]:
# Run the "roberta" experiment, framed by separator rules in the cell output
print("\n" + "=" * 50, "Running RoBERTa training:", sep="\n")
small_model_trainer("roberta")
print("\n" + "=" * 50)


Running RoBERTa training:
Analyzing class distribution...

Class Distribution:
Class 0 (True): 219 samples
Class 1 (False Content): 281 samples

Initializing tokenizer and model for roberta...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using FacebookAI/xlm-roberta-base
Tokenizing training data with max_length=256...
Tokenizing test data...
Using batch size of 12 for roberta
Using device: cuda
Training for 4 epochs with learning rate 3e-05


Epoch 1/4 [Train]: 100%|██████████| 30/30 [00:03<00:00,  8.79it/s, loss=0.7512]


Epoch 1 - Average Training Loss: 0.7225


Epoch 1/4 [Eval]: 100%|██████████| 13/13 [00:00<00:00, 37.37it/s, loss=0.6944]


Epoch 1 - Evaluation Results:
  Loss: 0.6941
  Accuracy: 0.4400
  Macro F1: 0.3056
  Weighted F1: 0.2689


Epoch 2/4 [Train]: 100%|██████████| 30/30 [00:03<00:00,  9.06it/s, loss=0.6108]


Epoch 2 - Average Training Loss: 0.6867


Epoch 2/4 [Eval]: 100%|██████████| 13/13 [00:00<00:00, 36.24it/s, loss=0.6554]


Epoch 2 - Evaluation Results:
  Loss: 0.6811
  Accuracy: 0.5600
  Macro F1: 0.3590
  Weighted F1: 0.4021


Epoch 3/4 [Train]: 100%|██████████| 30/30 [00:03<00:00,  9.04it/s, loss=0.8294]


Epoch 3 - Average Training Loss: 0.6971


Epoch 3/4 [Eval]: 100%|██████████| 13/13 [00:00<00:00, 36.12it/s, loss=0.6959]


Epoch 3 - Evaluation Results:
  Loss: 0.6858
  Accuracy: 0.5600
  Macro F1: 0.3590
  Weighted F1: 0.4021


Epoch 4/4 [Train]: 100%|██████████| 30/30 [00:03<00:00,  9.05it/s, loss=0.7972]


Epoch 4 - Average Training Loss: 0.6858


Epoch 4/4 [Eval]: 100%|██████████| 13/13 [00:00<00:00, 36.76it/s, loss=0.6951]


Epoch 4 - Evaluation Results:
  Loss: 0.6861
  Accuracy: 0.5600
  Macro F1: 0.3590
  Weighted F1: 0.4021


Final Evaluation: 100%|██████████| 13/13 [00:00<00:00, 38.61it/s]



Final Test Set Metrics:
Accuracy: 0.5600
Macro-averaged Precision: 0.2800
Weighted-averaged Precision: 0.3136
Macro-averaged F1: 0.3590
Weighted-averaged F1: 0.4021

Per-class Precision:
Class 0 (True): 0.0000
Class 1 (False Content): 0.5600

Per-class F1:
Class 0 (True): 0.0000
Class 1 (False Content): 0.7179



In [1]:
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm

# Display names for the binary labels: 0 is genuine content, 1 is any kind of false content
CLASS_NAMES = {0: "True", 1: "False Content"}

# Function to analyze class distribution
def analyze_class_distribution(labels):
    """Tally the labels, save a pie chart of the two classes, and print the counts.

    Args:
        labels: Iterable of integer class labels.

    Returns:
        A ``Counter`` mapping each label value to its number of occurrences.

    Side effects:
        Writes ``class_distribution.png`` to the current working directory.
    """
    class_counts = Counter(labels)

    # One (id, display name, sample count) row per binary class
    rows = []
    for class_id in range(2):
        display_name = CLASS_NAMES.get(class_id, f'Class {class_id}')
        rows.append((class_id, display_name, class_counts.get(class_id, 0)))

    # Pie chart of the class shares, saved to disk and closed without displaying
    plt.figure(figsize=(10, 8))
    plt.pie(
        [count for _, _, count in rows],
        labels=[f"{display_name}\n({count} samples)" for _, display_name, count in rows],
        autopct='%1.1f%%',
    )
    plt.title('Distribution of Classes in Dataset')
    plt.axis('equal')
    plt.savefig('class_distribution.png')
    plt.close()

    # Text summary of the same counts
    print("\nClass Distribution:")
    for class_id, display_name, count in rows:
        print(f"Class {class_id} ({display_name}): {count} samples")

    return class_counts

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) and pin cuDNN settings.

    Args:
        seed: Integer seed handed to every random number generator.
    """
    # Same seeding order as before: random, numpy, torch CPU, torch CUDA
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Request deterministic cuDNN kernels and turn off the benchmark autotuner
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Compute metrics function
def compute_metrics(predictions, labels):
    """Compute accuracy plus macro, weighted and per-class precision and F1.

    Every precision and F1 call passes ``zero_division=0`` so a class that is
    never predicted scores 0 without emitting ``UndefinedMetricWarning``.

    Args:
        predictions: Sequence of predicted class ids.
        labels: Sequence of ground-truth class ids, aligned with ``predictions``.

    Returns:
        Dict with float entries ``accuracy``, ``precision_macro``,
        ``precision_weighted``, ``f1_macro``, ``f1_weighted`` and list entries
        ``precision_per_class`` and ``f1_per_class``.
    """
    accuracy = accuracy_score(labels, predictions)

    # Macro averages treat all classes equally
    precision_macro = precision_score(labels, predictions, average='macro', zero_division=0)
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)

    # Weighted averages account for class imbalance
    precision_weighted = precision_score(labels, predictions, average='weighted', zero_division=0)
    f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)

    # average=None returns one score per class
    precision_per_class = precision_score(labels, predictions, average=None, zero_division=0)
    f1_per_class = f1_score(labels, predictions, average=None, zero_division=0)

    return {
        "accuracy": accuracy,
        "precision_macro": precision_macro,
        "precision_weighted": precision_weighted,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "precision_per_class": precision_per_class.tolist(),
        "f1_per_class": f1_per_class.tolist()
    }

# Checkpoint and fine-tuning hyperparameters for every supported model_type key.
# Keeping them in one table stops the per-model branches from drifting apart.
# "deberta" and "roberta" use the settings shown in the recorded run outputs.
_MODEL_CONFIGS = {
    "tinybert": {
        "model_name": "huawei-noah/TinyBERT_General_4L_312D",
        "max_length": 128, "batch_size": 16, "num_epochs": 5, "learning_rate": 5e-5,
    },
    "distilbert": {
        "model_name": "distilbert-base-uncased",
        "max_length": 128, "batch_size": 16, "num_epochs": 5, "learning_rate": 5e-5,
    },
    "roberta-base": {
        "model_name": "FacebookAI/roberta-base",
        "max_length": 128, "batch_size": 16, "num_epochs": 5, "learning_rate": 5e-5,
    },
    "roberta-large": {
        "model_name": "FacebookAI/roberta-large",
        "max_length": 128, "batch_size": 16, "num_epochs": 4, "learning_rate": 3e-5,
    },
    "deberta": {
        "model_name": "microsoft/deberta-v3-large",
        "max_length": 256, "batch_size": 8, "num_epochs": 3, "learning_rate": 2e-5,
    },
    "roberta": {
        "model_name": "FacebookAI/xlm-roberta-base",
        "max_length": 256, "batch_size": 12, "num_epochs": 4, "learning_rate": 3e-5,
    },
}

# Weight decay shared by every model configuration
_WEIGHT_DECAY = 0.01


def _load_dataset(data_path):
    """Parse the `[TEXT] ... [OBJECTS] ... [LABEL] ...` file into (texts, binary_labels)."""
    texts = []
    labels = []

    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Lines that do not contain each marker exactly once are skipped
            parts = line.split("[TEXT]")
            if len(parts) != 2:
                continue

            text_objects_parts = parts[1].split("[OBJECTS]")
            if len(text_objects_parts) != 2:
                continue
            text_part = text_objects_parts[0].strip()

            objects_label_parts = text_objects_parts[1].split("[LABEL]")
            if len(objects_label_parts) != 2:
                continue
            objects_part = objects_label_parts[0].strip()
            label_int = int(objects_label_parts[1].strip())

            # Collapse the multi-class label to binary: 0 stays 0, anything else becomes 1
            binary_label = 0 if label_int == 0 else 1

            # Turn the comma-separated object list into a short descriptive sentence
            objects_list = [obj.strip() for obj in objects_part.split(",") if obj.strip()]
            if len(objects_list) > 0:
                object_sentence = f"The image contains: {', '.join(objects_list)}."
            else:
                object_sentence = "No objects detected."

            texts.append(f"{text_part}. {object_sentence}")
            labels.append(binary_label)

    return texts, labels


def _load_model_and_tokenizer(model_key, model_name):
    """Instantiate the tokenizer and 2-label classification model for a config key."""
    if model_key == "distilbert":
        tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2  # 2-way classification
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,  # 2-way classification
            problem_type="single_label_classification"
        )
    return tokenizer, model


def _evaluate(model, data_loader, device, desc, show_loss=False):
    """Run the model over data_loader without gradients; return (metrics, average loss)."""
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0

    with torch.no_grad():
        progress_bar = tqdm(data_loader, desc=desc)
        for batch in progress_bar:
            input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)

            # nn.DataParallel returns one loss per replica; mean() is a no-op on a scalar
            loss = outputs.loss.mean()

            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch_labels.cpu().numpy())

            total_loss += loss.item()
            if show_loss:
                progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return compute_metrics(all_preds, all_labels), total_loss / len(data_loader)


def _print_per_class(title, values):
    """Print one line per class score, using CLASS_NAMES for known class ids."""
    print(f"\n{title}:")
    for class_id, value in enumerate(values):
        if class_id in CLASS_NAMES:
            print(f"Class {class_id} ({CLASS_NAMES[class_id]}): {value:.4f}")
        else:
            print(f"Class {class_id}: {value:.4f}")


# Main function with model type parameter
def small_model_trainer(model_type="tinybert", data_path=r"C:\Users\CoolA\Code\extracted_500.txt"):
    """Fine-tune a small transformer for binary classification and report test metrics.

    The dataset is parsed, split 70/30 with stratification, tokenized, and used
    to train the model selected by ``model_type``. The test split is evaluated
    after every epoch for monitoring only; the metrics returned are those of the
    model as it stands after the last epoch. No checkpoint is written or read, so
    a leftover ``best_*_model.pt`` file can no longer replace the trained weights.

    Args:
        model_type: Case-insensitive key into ``_MODEL_CONFIGS``.
        data_path: Path to the dataset file; defaults to the original location.

    Returns:
        The metrics dict produced by ``compute_metrics`` on the test split.

    Raises:
        ValueError: If ``model_type`` is unknown or no sample could be parsed.
    """
    # Set seed for reproducibility
    set_seed(42)

    # Fail fast on an unknown model before doing any I/O
    model_key = model_type.lower()
    if model_key not in _MODEL_CONFIGS:
        choices = ", ".join(repr(key) for key in _MODEL_CONFIGS)
        raise ValueError(f"Unknown model type: {model_type}. Choose one of: {choices}.")
    config = _MODEL_CONFIGS[model_key]
    model_name = config["model_name"]
    max_length = config["max_length"]
    batch_size = config["batch_size"]
    num_epochs = config["num_epochs"]
    learning_rate = config["learning_rate"]

    # Load the dataset
    texts, labels = _load_dataset(data_path)
    if not texts:
        raise ValueError(f"No valid samples could be parsed from {data_path}")

    # Split into train and test (70/30)
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.3, random_state=42, stratify=labels
    )

    # Analyze class distribution
    print("Analyzing class distribution...")
    analyze_class_distribution(labels)

    # Load model and tokenizer based on model_type
    print(f"\nInitializing tokenizer and model for {model_type}...")
    tokenizer, model = _load_model_and_tokenizer(model_key, model_name)
    print(f"Using {model_name}")

    # Tokenize both splits, padded and truncated to the model's max_length
    print(f"Tokenizing training data with max_length={max_length}...")
    train_encodings = tokenizer(
        train_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )

    print("Tokenizing test data...")
    test_encodings = tokenizer(
        test_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )

    # Wrap ids, masks and labels into tensor datasets
    train_dataset = TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(train_labels)
    )
    test_dataset = TensorDataset(
        test_encodings['input_ids'],
        test_encodings['attention_mask'],
        torch.tensor(test_labels)
    )

    print(f"Using batch size of {batch_size} for {model_type}")
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Setup GPU/CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # If using multiple GPUs
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = nn.DataParallel(model)

    model.to(device)

    print(f"Training for {num_epochs} epochs with learning rate {learning_rate}")

    # Optimizer with weight decay
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=_WEIGHT_DECAY)

    # OneCycleLR is stepped once per batch, hence steps_per_epoch * epochs total steps
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs
    )

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0.0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for batch in progress_bar:
            input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            # nn.DataParallel returns one loss per replica; backward() needs a scalar
            loss = outputs.loss.mean()

            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

            loss.backward()

            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average Training Loss: {avg_train_loss:.4f}")

        # Per-epoch monitoring on the test split; it is not used for model selection
        metrics, avg_eval_loss = _evaluate(
            model, test_loader, device,
            desc=f"Epoch {epoch+1}/{num_epochs} [Eval]",
            show_loss=True,
        )

        print(f"Epoch {epoch+1} - Evaluation Results:")
        print(f"  Loss: {avg_eval_loss:.4f}")
        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Macro F1: {metrics['f1_macro']:.4f}")
        print(f"  Weighted F1: {metrics['f1_weighted']:.4f}")

    # Final evaluation of the model as trained in this call
    final_metrics, _ = _evaluate(model, test_loader, device, desc="Final Evaluation")

    print("\nFinal Test Set Metrics:")
    print(f"Accuracy: {final_metrics['accuracy']:.4f}")
    print(f"Macro-averaged Precision: {final_metrics['precision_macro']:.4f}")
    print(f"Weighted-averaged Precision: {final_metrics['precision_weighted']:.4f}")
    print(f"Macro-averaged F1: {final_metrics['f1_macro']:.4f}")
    print(f"Weighted-averaged F1: {final_metrics['f1_weighted']:.4f}")

    _print_per_class("Per-class Precision", final_metrics['precision_per_class'])
    _print_per_class("Per-class F1", final_metrics['f1_per_class'])

    # Return metrics for comparison
    return final_metrics

In [2]:
# Run the roberta-base experiment, framed by separator rules in the cell output
print("\n" + "=" * 50, "Running RoBERTa training:", sep="\n")
small_model_trainer("roberta-base")
print("\n" + "=" * 50)


Running RoBERTa training:
Analyzing class distribution...

Class Distribution:
Class 0 (True): 219 samples
Class 1 (False Content): 281 samples

Initializing tokenizer and model for roberta-base...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using FacebookAI/roberta-base
Tokenizing training data with max_length=128...
Tokenizing test data...
Using batch size of 16 for roberta-base
Using device: cuda
Training for 5 epochs with learning rate 5e-05


Epoch 1/5 [Train]: 100%|██████████| 22/22 [00:01<00:00, 11.47it/s, loss=0.6613]


Epoch 1 - Average Training Loss: 0.7042


Epoch 1/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 41.03it/s, loss=0.7064]


Epoch 1 - Evaluation Results:
  Loss: 0.6836
  Accuracy: 0.5600
  Macro F1: 0.3590
  Weighted F1: 0.4021


Epoch 2/5 [Train]: 100%|██████████| 22/22 [00:01<00:00, 12.64it/s, loss=0.7094]


Epoch 2 - Average Training Loss: 0.6820


Epoch 2/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 40.17it/s, loss=0.7619]


Epoch 2 - Evaluation Results:
  Loss: 0.6759
  Accuracy: 0.5933
  Macro F1: 0.4372
  Weighted F1: 0.4728


Epoch 3/5 [Train]: 100%|██████████| 22/22 [00:01<00:00, 12.40it/s, loss=0.7617]


Epoch 3 - Average Training Loss: 0.6551


Epoch 3/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 40.06it/s, loss=0.7287]


Epoch 3 - Evaluation Results:
  Loss: 0.6499
  Accuracy: 0.5667
  Macro F1: 0.3754
  Weighted F1: 0.4169


Epoch 4/5 [Train]: 100%|██████████| 22/22 [00:01<00:00, 12.66it/s, loss=0.3544]


Epoch 4 - Average Training Loss: 0.4864


Epoch 4/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 40.02it/s, loss=0.7456]


Epoch 4 - Evaluation Results:
  Loss: 0.5722
  Accuracy: 0.7267
  Macro F1: 0.7231
  Weighted F1: 0.7269


Epoch 5/5 [Train]: 100%|██████████| 22/22 [00:01<00:00, 12.70it/s, loss=0.2083]


Epoch 5 - Average Training Loss: 0.3130


Epoch 5/5 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 39.98it/s, loss=0.7906]


Epoch 5 - Evaluation Results:
  Loss: 0.6016
  Accuracy: 0.7333
  Macro F1: 0.7294
  Weighted F1: 0.7333


Final Evaluation: 100%|██████████| 10/10 [00:00<00:00, 42.04it/s]


Final Test Set Metrics:
Accuracy: 0.7333
Macro-averaged Precision: 0.7294
Weighted-averaged Precision: 0.7333
Macro-averaged F1: 0.7294
Weighted-averaged F1: 0.7333

Per-class Precision:
Class 0 (True): 0.6970
Class 1 (False Content): 0.7619

Per-class F1:
Class 0 (True): 0.6970
Class 1 (False Content): 0.7619






In [2]:
# Run the roberta-large experiment, framed by separator rules in the cell output
print("\n" + "=" * 50, "Running RoBERTa training:", sep="\n")
small_model_trainer("roberta-large")
print("\n" + "=" * 50)


Running RoBERTa training:
Analyzing class distribution...

Class Distribution:
Class 0 (True): 219 samples
Class 1 (False Content): 281 samples

Initializing tokenizer and model for roberta-large...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using FacebookAI/roberta-large
Tokenizing training data with max_length=128...
Tokenizing test data...
Using batch size of 16 for roberta-large
Using device: cuda
Training for 4 epochs with learning rate 3e-05


Epoch 1/4 [Train]: 100%|██████████| 22/22 [00:04<00:00,  5.08it/s, loss=0.7509]


Epoch 1 - Average Training Loss: 0.7336


Epoch 1/4 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 20.71it/s, loss=0.7008]


Epoch 1 - Evaluation Results:
  Loss: 0.7011
  Accuracy: 0.4400
  Macro F1: 0.3056
  Weighted F1: 0.2689


Epoch 2/4 [Train]: 100%|██████████| 22/22 [00:04<00:00,  5.37it/s, loss=0.7095]


Epoch 2 - Average Training Loss: 0.6975


Epoch 2/4 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 20.66it/s, loss=0.6986]


Epoch 2 - Evaluation Results:
  Loss: 0.6736
  Accuracy: 0.6800
  Macro F1: 0.6791
  Weighted F1: 0.6811


Epoch 3/4 [Train]: 100%|██████████| 22/22 [00:04<00:00,  5.39it/s, loss=0.5043]


Epoch 3 - Average Training Loss: 0.5913


Epoch 3/4 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 20.70it/s, loss=0.9110]


Epoch 3 - Evaluation Results:
  Loss: 0.6307
  Accuracy: 0.6667
  Macro F1: 0.6579
  Weighted F1: 0.6645


Epoch 4/4 [Train]: 100%|██████████| 22/22 [00:04<00:00,  5.39it/s, loss=0.1319]


Epoch 4 - Average Training Loss: 0.2968


Epoch 4/4 [Eval]: 100%|██████████| 10/10 [00:00<00:00, 20.57it/s, loss=1.1453]


Epoch 4 - Evaluation Results:
  Loss: 0.7634
  Accuracy: 0.6733
  Macro F1: 0.6640
  Weighted F1: 0.6707


Final Evaluation: 100%|██████████| 10/10 [00:00<00:00, 20.93it/s]


Final Test Set Metrics:
Accuracy: 0.6733
Macro-averaged Precision: 0.6682
Weighted-averaged Precision: 0.6711
Macro-averaged F1: 0.6640
Weighted-averaged F1: 0.6707

Per-class Precision:
Class 0 (True): 0.6441
Class 1 (False Content): 0.6923

Per-class F1:
Class 0 (True): 0.6080
Class 1 (False Content): 0.7200






In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the DeBERTa classifier and its tokenizer from the same checkpoint name
model_name = "microsoft/deberta-v3-large"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    problem_type="single_label_classification",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Element count over all parameter tensors
total_params = sum(map(lambda param: param.numel(), model.parameters()))

# Element count over the tensors that receive gradients
trainable_params = sum(
    param.numel() for param in filter(lambda param: param.requires_grad, model.parameters())
)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total parameters: 435063810
Trainable parameters: 435063810


# Results

| Model              | Training Time | VRAM   | Accuracy | Model Size |
|--------------------|--------------|--------|----------|------------|
| 2-agent framework | N/A          | N/A     | 72.86%   | N/A        |
| 3-agent framework | N/A          | N/A     | 57.40%   | N/A        |
| Zero-shot Gemini 1.5 Pro | N/A          | N/A     | 68.79%   | N/A        |
| Zero-shot Gemini 2.0 Flash | N/A          | N/A     | 72.69%   | N/A        |
| DistilBERT        | 43.5s        | 1.1 GB  | 72.00%   | 66M        |
| TinyBERT          | 4.6s         | 0.2 GB  | 65.33%   | 14.5M      |
| BERT             | 18.6s        | 2.5 GB  | **76.00%**   | 110M       |
| DeBERTa V3 Large | 50.1s        | 15.4 GB | 74.67%   | 435M       |
| RoBERTa-Base     | 12.6s        | 4.3 GB  | 73.33%   | 125M       |
| RoBERTa-Large    | 21.6s        | 7.9 GB  | 67.33%   | 355M       |
