# SysCRED - Système Neuro-Symbolique de Vérification de Crédibilité

**PhD Thesis Prototype** - Dominique S. Loyer  
*Citation Key: loyerModelingHybridSystem2025*

Ce notebook intègre:
- Moteur de recherche TREC (BM25, QLD, TF-IDF)
- Analyse NLP avec Transformers (GPU/TPU)
- Ontologie RDF pour l'explicabilité

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DominiqueLoyer/syscred/blob/main/syscred_colab.ipynb)
[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/DominiqueLoyer/syscred/blob/main/syscred_kaggle.ipynb)

---

## 1. Configuration de l'Environnement Colab

In [None]:
# === Vérification GPU/TPU (Colab) ===
import torch
import subprocess
import sys
import os

# Suppress TensorFlow warnings; WANDB_DISABLED presumably turns off
# Weights & Biases reporting (NOTE(review): confirm against the wandb docs).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["WANDB_DISABLED"] = "true"

print("=" * 60)
print("SysCRED - Google Colab Environment Check")
print("=" * 60)

# Check for GPU
if torch.cuda.is_available():
    print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    DEVICE = 'cuda'
else:
    print("✗ No GPU - using CPU")
    print("  → Runtime > Change runtime type > GPU pour activer")
    DEVICE = 'cpu'

# Check for TPU (Colab). This stays a best-effort probe: a missing torch_xla
# package just means "no TPU", while any other failure is reported instead of
# being swallowed silently. DEVICE is only overridden once the XLA device has
# actually been obtained, so the success message can no longer be printed for
# a TPU that turned out to be unusable.
try:
    import torch_xla.core.xla_model as xm
    _tpu_device = xm.xla_device()
except ImportError:
    pass
except Exception as exc:
    print(f"✗ TPU check failed ({type(exc).__name__}: {exc})")
else:
    print("✓ TPU available")
    DEVICE = _tpu_device

print(f"\nDevice: {DEVICE}")

# Colab-specific: the google.colab module is already loaded when running in Colab
IN_COLAB = 'google.colab' in sys.modules
print(f"Running in Colab: {IN_COLAB}")

In [None]:
# === Dependency installation ===
# The leading "!" runs the line as a shell command from the notebook; -q keeps pip quiet.
!pip install transformers[torch] datasets accelerate evaluate -q
!pip install pyserini rdflib pytrec_eval nltk beautifulsoup4 python-whois -q

# NLTK resources (imported only after the pip install above)
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)

print("✓ Dépendances installées")

In [None]:
# === (Optional) Mount Google Drive to persist the results ===
# Default output location, used unless the Drive lines below are uncommented.
OUTPUT_DIR = '/content/syscred_results'

# Uncomment the three lines below to save the results to Drive instead.
# They come AFTER the default assignment on purpose, so that the Drive path
# overrides it rather than being overwritten by it.
# from google.colab import drive
# drive.mount('/content/drive')
# OUTPUT_DIR = '/content/drive/MyDrive/SysCRED_Results'

# exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✓ Output directory: {OUTPUT_DIR}")

## 2. Modules SysCRED

In [None]:
# === IR Engine (extrait de TREC_AP88-90) ===
import re
import math
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

@dataclass
class SearchResult:
    """Record describing one search result: document id, score, rank and an optional snippet."""
    doc_id: str  # identifier of the document
    score: float  # score attached to the document (the scoring model is not fixed by this class)
    rank: int  # position in the result list — NOTE(review): 0- or 1-based is not visible here
    snippet: Optional[str] = None  # optional text excerpt; defaults to None

class IREngine:
    """IR helpers: text preprocessing, BM25 scoring and PRF query expansion.

    Only BM25 scoring is implemented in this class; no QLD or TF-IDF method
    is defined here.

    Citation: loyerEvaluationModelesRecherche2025
    """

    # BM25 free parameters, used as k1 and b in calculate_bm25.
    BM25_K1 = 0.9
    BM25_B = 0.4

    def __init__(self, use_stemming: bool = True):
        """Load the English stopword list and, optionally, a Porter stemmer.

        Args:
            use_stemming: When False, tokens are left unstemmed.
        """
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = PorterStemmer() if use_stemming else None
        # Placeholder attribute; no method of this class assigns or reads it.
        self.searcher = None

    def preprocess(self, text: str) -> str:
        """Normalize text into a space-separated string of index terms.

        Lowercases, tokenizes with NLTK's word_tokenize, keeps purely
        alphabetic tokens that are not stopwords, and stems them when a
        stemmer is configured.

        Args:
            text: Raw text. Any non-string value yields "".

        Returns:
            The surviving tokens joined by single spaces.
        """
        if not isinstance(text, str):
            return ""
        tokens = word_tokenize(text.lower())
        filtered = [t for t in tokens if t.isalpha() and t not in self.stopwords]
        if self.stemmer is not None:
            filtered = [self.stemmer.stem(t) for t in filtered]
        return ' '.join(filtered)

    def calculate_bm25(self, query_terms: List[str], doc_terms: List[str],
                       doc_length: int, avg_doc_length: float,
                       doc_freq: Dict[str, int], corpus_size: int) -> float:
        """Compute the BM25 score of one document for a query.

        Args:
            query_terms: Query terms; a repeated term contributes once per occurrence.
            doc_terms: Terms of the document being scored.
            doc_length: Length of the document.
            avg_doc_length: Average document length in the corpus. A value
                <= 0 (e.g. an empty corpus) disables length normalization
                instead of raising ZeroDivisionError.
            doc_freq: Document frequency per term; unknown terms count as 1.
            corpus_size: Number of documents in the corpus.

        Returns:
            The summed score over query terms present in the document
            (0.0 when none is present).
        """
        doc_counts = Counter(doc_terms)

        # The length normalization does not depend on the term, so compute it once.
        if avg_doc_length > 0:
            length_norm = 1 - self.BM25_B + self.BM25_B * doc_length / avg_doc_length
        else:
            # Degenerate statistic: treat the document as average-length.
            length_norm = 1.0

        score = 0.0
        for term in query_terms:
            tf = doc_counts.get(term, 0)
            if not tf:
                continue
            df = doc_freq.get(term, 1)
            # The "+ 1" inside the log keeps the IDF positive.
            idf = math.log((corpus_size - df + 0.5) / (df + 0.5) + 1)
            num = tf * (self.BM25_K1 + 1)
            den = tf + self.BM25_K1 * length_norm
            score += idf * (num / den)
        return score

    def pseudo_relevance_feedback(self, query: str, top_docs: List[str], n_terms: int = 10) -> str:
        """Expand a query with the most frequent terms of the top documents (PRF).

        Args:
            query: Original query text.
            top_docs: Raw texts of the documents used as feedback.
            n_terms: Maximum number of expansion terms to append.

        Returns:
            The original query followed by the preprocessed expansion terms,
            space-separated. The query is returned unchanged (no trailing
            space) when there is nothing to add.
        """
        # A set gives O(1) membership tests while scanning every document token.
        query_tokens = set(self.preprocess(query).split())
        expansion = Counter(
            token
            for doc in top_docs
            for token in self.preprocess(doc).split()
            if token not in query_tokens
        )
        expansion_terms = [t for t, _ in expansion.most_common(n_terms)]
        return ' '.join([query, *expansion_terms])

# Smoke test: build an engine with the default settings and preprocess a sample sentence
ir = IREngine()
print("Test preprocess:", ir.preprocess("Information Retrieval systems help find documents"))

In [None]:
# === SEO Analyzer ===
class SEOAnalyzer:
    """SEO-style signals: simplified TF-IDF term weights and a heuristic PageRank estimate.

    NOTE(review): BM25_K1, BM25_B and avg_doc_length are declared but not
    used by any method of this class; they are kept for callers that may
    read them.
    """

    BM25_K1 = 1.5
    BM25_B = 0.75

    # Small English + French function-word list removed by tokenize().
    STOPWORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
                 'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'et', 'ou'}

    # Runs of two or more letters. [^\W\d_] matches any Unicode letter, so
    # accented words ("vérité") are kept instead of being dropped as they
    # were with an ASCII-only [a-zA-Z] class.
    _TOKEN_RE = re.compile(r'\b[^\W\d_]{2,}\b')

    def __init__(self):
        """Set the fixed corpus statistics assumed by the estimators."""
        self.avg_doc_length = 500
        self.corpus_size = 1000

    def tokenize(self, text: str) -> List[str]:
        """Split text into lowercase word tokens, dropping stopwords.

        Args:
            text: Raw text; None or "" yields an empty list.

        Returns:
            Tokens of at least two letters, in order of appearance.
        """
        if not text:
            return []
        return [t for t in self._TOKEN_RE.findall(text.lower())
                if t not in self.STOPWORDS]

    def calculate_tf_idf(self, text: str) -> Dict[str, float]:
        """Weight each term of text by its relative frequency times a proxy IDF.

        Args:
            text: Raw text to analyse.

        Returns:
            Mapping term -> weight; empty when the text has no usable token.
        """
        tokens = self.tokenize(text)
        if not tokens:
            return {}
        total = len(tokens)
        # NOTE(review): the "IDF" uses the term's character length in place of
        # a document frequency (no corpus is available here); longer words get
        # a lower weight. Kept as-is because changing it would alter results.
        return {
            term: (count / total) * math.log(self.corpus_size / (1 + len(term)))
            for term, count in Counter(tokens).items()
        }

    def estimate_pagerank(self, domain_age_days: Optional[int] = None,
                          source_reputation: Optional[str] = None) -> float:
        """Estimate a PageRank-like score from domain age and source reputation.

        Args:
            domain_age_days: Age of the domain in days; None or 0 adds nothing.
            source_reputation: 'High' or 'Medium' add a bonus; anything else adds nothing.

        Returns:
            base + d * contribution with d = 0.85 and base = 1 - d, capped at 1.0.
        """
        d = 0.85
        base = 1 - d
        age = domain_age_days or 0  # None is treated like 0 days

        contrib = 0
        if age > 365 * 5:
            contrib += 0.3
        elif age > 365 * 2:
            contrib += 0.2

        if source_reputation == 'High':
            contrib += 0.3
        elif source_reputation == 'Medium':
            contrib += 0.15

        return min(1.0, base + d * contrib)

# Smoke test: print the three highest-weighted terms of a sample phrase
seo = SEOAnalyzer()
print("TF-IDF top terms:", sorted(seo.calculate_tf_idf("credibility verification system").items(), 
                                  key=lambda x: x[1], reverse=True)[:3])

## 3. NLP avec Transformers (GPU/TPU)

In [None]:
# === Load the NLP models ===
from transformers import pipeline

print("Chargement des modèles NLP sur", DEVICE, "...")

# Both pipelines run on CUDA device 0 when DEVICE is the string 'cuda';
# any other DEVICE value falls back to -1 (CPU).

# Sentiment classifier (DistilBERT fine-tuned on SST-2)
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0 if DEVICE == 'cuda' else -1,
)

# Named-entity recogniser (BERT-large fine-tuned on CoNLL-03);
# aggregation_strategy="simple" is requested so that results carry an
# 'entity_group' key, which the later cells read.
ner_model = pipeline(
    task="ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
    device=0 if DEVICE == 'cuda' else -1,
)

print("✓ Modèles chargés")

In [None]:
# === NLP smoke test ===
test_text = "According to researchers at Harvard University, the new study published in Nature shows significant results."

print("Text:", test_text)
# The pipeline output is indexed with [0] to get the prediction for the single input
print("\nSentiment:", sentiment_analyzer(test_text)[0])
print("\nEntities:")
# Each entity is read through its 'word', 'entity_group' and 'score' keys
for ent in ner_model(test_text):
    print(f"  - {ent['word']}: {ent['entity_group']} ({ent['score']:.2f})")

## 4. Système de Vérification de Crédibilité

In [None]:
# === SysCRED - Système complet ===
from datetime import datetime

class SysCRED:
    """Neuro-symbolic credibility verification system.

    Combines six component scores into one weighted credibility score and
    maps it to a HIGH / MEDIUM / LOW level.
    """

    # Weight of each component in the overall score (the weights sum to 1.0).
    WEIGHTS = {
        'source_reputation': 0.25,
        'domain_age': 0.10,
        'sentiment_neutrality': 0.15,
        'entity_presence': 0.15,
        'coherence': 0.15,
        'fact_check': 0.20
    }

    def __init__(self):
        """Create the IR/SEO helpers and bind the module-level NLP pipelines.

        Requires the globals sentiment_analyzer and ner_model to exist.
        """
        self.ir_engine = IREngine()
        self.seo_analyzer = SEOAnalyzer()
        self.sentiment = sentiment_analyzer
        self.ner = ner_model

    @staticmethod
    def _domain_age_score(domain_age_days: Optional[int]) -> float:
        """Map a domain age in days to a score; None counts as 0 days."""
        age = domain_age_days or 0
        if age > 365 * 5:
            return 1.0
        if age > 365 * 2:
            return 0.7
        if age > 365:
            return 0.5
        return 0.3

    @staticmethod
    def _level_for(overall: float) -> str:
        """Map an overall score to its 'HIGH' / 'MEDIUM' / 'LOW' label."""
        if overall >= 0.7:
            return 'HIGH'
        if overall >= 0.4:
            return 'MEDIUM'
        return 'LOW'

    def verify(self, text: str, source_reputation: str = 'Unknown',
               domain_age_days: Optional[int] = 0) -> Dict:
        """Assess the credibility of a text.

        Args:
            text: Text to assess. Only the first 512 characters are sent to
                the sentiment and NER models; the whole text is used for the
                coherence score.
            source_reputation: 'High', 'Medium', 'Low' or 'Unknown'; any
                other value is scored like 'Unknown'.
            domain_age_days: Age of the source domain in days. None is
                accepted and treated as 0 (unknown age) rather than raising
                TypeError, matching SEOAnalyzer.estimate_pagerank.

        Returns:
            Dict with keys 'score' (weighted score rounded to 3 decimals),
            'level', 'components' (rounded per-component scores), 'sentiment'
            (raw sentiment prediction), 'entities' (first 5 detected
            entities) and 'timestamp' (ISO-8601 string of the local time).
        """
        scores = {}

        # 1. Source reputation
        rep_map = {'High': 1.0, 'Medium': 0.6, 'Low': 0.3, 'Unknown': 0.5}
        scores['source_reputation'] = rep_map.get(source_reputation, 0.5)

        # 2. Domain age
        scores['domain_age'] = self._domain_age_score(domain_age_days)

        # 3. Sentiment neutrality: a classifier confidence near 0.5 counts as
        # neutral (score 1), a confidence near 1.0 as extreme (score 0).
        sent = self.sentiment(text[:512])[0]
        confidence = sent['score']
        scores['sentiment_neutrality'] = 1 - abs(confidence - 0.5) * 2

        # 4. Entity presence: each organisation/person adds 0.2, capped at 1.0.
        entities = self.ner(text[:512])
        credible_entities = sum(1 for e in entities if e['entity_group'] in ['ORG', 'PER'])
        scores['entity_presence'] = min(1.0, credible_entities * 0.2)

        # 5. Coherence proxy: share of distinct terms among the preprocessed
        # terms; max(1, ...) avoids dividing by zero on empty text.
        terms = self.ir_engine.preprocess(text).split()
        scores['coherence'] = len(set(terms)) / max(1, len(terms))

        # 6. Fact check (placeholder - would use an external API)
        scores['fact_check'] = 0.5  # Neutral by default

        overall = sum(scores[k] * self.WEIGHTS[k] for k in self.WEIGHTS)

        return {
            'score': round(overall, 3),
            'level': self._level_for(overall),
            'components': {k: round(v, 3) for k, v in scores.items()},
            'sentiment': sent,
            'entities': entities[:5],
            'timestamp': datetime.now().isoformat()
        }

# Instantiate the verifier (its constructor reads the sentiment_analyzer and ner_model globals)
syscred = SysCRED()
print("✓ SysCRED initialized")

In [None]:
# === SysCRED test run ===
# Two contrasting samples: a sourced, neutral statement from a reputable,
# ten-year-old domain, and a sensational one from an unknown, month-old domain.
test_texts = [
    {
        'text': "According to a study published by Harvard University in the journal Science, researchers have discovered a new method for detecting misinformation.",
        'source': 'High',
        'age': 3650
    },
    {
        'text': "SHOCKING!!! You won't BELIEVE what scientists found! This changes EVERYTHING!!!",
        'source': 'Unknown',
        'age': 30
    }
]

print("=" * 60)
print("SysCRED - Tests de Vérification")
print("=" * 60)

# Store last result for use in next cell
last_result = None

for i, test in enumerate(test_texts, 1):
    print(f"\n--- Test {i} ---")
    # Only the first 80 characters of the text are echoed
    print(f"Text: {test['text'][:80]}...")
    
    last_result = syscred.verify(
        text=test['text'],
        source_reputation=test['source'],
        domain_age_days=test['age']
    )
    
    print(f"\nScore: {last_result['score']} ({last_result['level']})")
    print("Components:")
    for k, v in last_result['components'].items():
        print(f"  {k}: {v}")

## 5. Ontologie RDF (Explicabilité)

In [None]:
# === Sauvegarde des résultats en RDF ===
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, RDFS, XSD

CRED = Namespace("http://example.org/credibility#")

def save_to_ontology(result: Dict, text_id: str) -> Graph:
    """Convert a verification result into RDF triples.

    Args:
        result: Verification result providing the keys 'score', 'level',
            'timestamp' and 'components' (mapping component name -> score).
        text_id: Identifier of the evaluated text. It is percent-encoded
            before being embedded in URIs, so identifiers containing spaces
            or reserved characters still give URIs that can be serialized.
            Identifiers made only of letters, digits and "_.-~" are left
            unchanged.

    Returns:
        A new Graph holding one CredibilityEvaluation node (score, level,
        timestamp) linked through hasComponent to one node per component.

    Raises:
        KeyError: If result lacks one of the required keys.
    """
    # Local import keeps this function self-contained within its cell.
    from urllib.parse import quote

    g = Graph()
    g.bind('cred', CRED)

    safe_id = quote(str(text_id), safe='')
    eval_uri = CRED[f"Evaluation_{safe_id}"]

    g.add((eval_uri, RDF.type, CRED.CredibilityEvaluation))
    g.add((eval_uri, CRED.hasScore, Literal(result['score'], datatype=XSD.float)))
    g.add((eval_uri, CRED.hasLevel, Literal(result['level'], datatype=XSD.string)))
    g.add((eval_uri, CRED.timestamp, Literal(result['timestamp'], datatype=XSD.dateTime)))

    for comp_name, comp_value in result['components'].items():
        # The URI uses the escaped name; the human-readable name is kept as a literal.
        comp_uri = CRED[f"{safe_id}_{quote(str(comp_name), safe='')}"]
        g.add((eval_uri, CRED.hasComponent, comp_uri))
        g.add((comp_uri, CRED.componentName, Literal(comp_name)))
        g.add((comp_uri, CRED.componentScore, Literal(comp_value, datatype=XSD.float)))

    return g

# Fall back to a freshly computed result when last_result is missing or None
# (for instance when the SysCRED test cell was not run).
if globals().get('last_result') is None:
    print("Creating test result for RDF demo...")
    last_result = syscred.verify(
        "A study by MIT researchers shows promising results.",
        source_reputation='High',
        domain_age_days=2000,
    )

# Build the graph and preview the first 500 characters of its Turtle form
g = save_to_ontology(last_result, "test_001")
print(f"Triplets RDF générés: {len(g)}")
print(g.serialize(format='turtle')[:500])

# Write the same graph to the output directory
rdf_file = f"{OUTPUT_DIR}/syscred_results.ttl"
g.serialize(format='turtle', destination=rdf_file)
print(f"\n✓ Saved to: {rdf_file}")

---

## Notes

- **GPU**: Runtime > Change runtime type > GPU
- **Google Drive**: Décommenter les lignes de montage de Drive dans la cellule « (Optionnel) Monter Google Drive » (section 1) pour sauvegarder sur Drive
- **Citation**: loyerModelingHybridSystem2025, loyerEvaluationModelesRecherche2025

### Synchronisation
- **GitHub**: https://github.com/DominiqueLoyer/syscred
- **Kaggle**: Même notebook disponible sur Kaggle
- **Colab**: Ouvrir directement depuis GitHub avec le badge ci-dessus