In [2]:
import spacy
from spacy import displacy

def load_nlp_model(model_name: str = 'fr_core_news_md') -> spacy.Language:
	"""Load a spaCy pipeline, downloading it first when loading fails with OSError.

	Args:
		model_name: Name of the spaCy pipeline package to load. Defaults to
			the French pipeline used throughout this notebook.

	Returns:
		The loaded spaCy pipeline.

	Note:
		Only the first load attempt is guarded; any error raised by the
		download or by the second load attempt propagates to the caller.
	"""
	try:
		return spacy.load(model_name)
	except OSError:
		# First load failed: fetch the package once, then retry the load.
		from spacy.cli import download
		download(model_name)
		return spacy.load(model_name)
nlp = load_nlp_model()


In [3]:
# Four-sentence sample paragraph, written as adjacent literals for readability.
text = (
    "Les pénicillines sont des antibiotiques de la classe des bêta-lactamines. "
    "La grippe ou influenza est une maladie infectieuse et contagieuse fréquente. "
    "Les plaquettes jouent un rôle essentiel dans la coagulation sanguine. "
    "La pénicilline détruit la paroi cellulaire des bactéries, ce qui entraîne leur mort."
)

doc = nlp(text)

# Draw one dependency diagram per sentence found by the pipeline.
for sent in doc.sents:
    displacy.render(sent, style="dep")

In [10]:
# Single sentence containing both an "ou" and an "et" coordination.
text = (
	"La grippe ou influenza est une maladie "
	"infectieuse et contagieuse fréquente."
)

doc = nlp(text)
# Render the dependency diagram of the whole document.
displacy.render(doc, style="dep")

In [16]:
# For every token that is not a determiner, print the token followed by its
# non-determiner children paired with their dependency label.
for token in doc:
	if token.pos_ != "DET":
		deps = [(kid, kid.dep_) for kid in token.children if not kid.pos_ == "DET"]
		print(token, deps)

grippe [(influenza, 'conj')]
ou []
influenza [(ou, 'cc')]
est []
maladie [(grippe, 'nsubj'), (est, 'cop'), (infectieuse, 'amod'), (fréquente, 'amod'), (., 'punct')]
infectieuse [(contagieuse, 'conj')]
et []
contagieuse [(et, 'cc')]
fréquente []
. []


In [21]:
text = """Les pénicillines sont des antibiotiques de la classe des bêta-lactamines."""

doc = nlp(text)

for sent in doc.sents:
	print(sent)
	# Walk the tokens of the current sentence only: looping over `doc` here
	# would print every token of the document again under each sentence.
	for token in sent:
		if token.pos_ == "DET":
			continue
		# One single-entry dict per child: {dependency label: (head token, child token)}.
		deps = [{child.dep_ : (token, child)} for child in token.children if child.pos_ != "DET"]
		print("\t", deps)


Les pénicillines sont des antibiotiques de la classe des bêta-lactamines.
	 []
	 []
	 [{'nsubj': (antibiotiques, pénicillines)}, {'cop': (antibiotiques, sont)}, {'nmod': (antibiotiques, classe)}, {'punct': (antibiotiques, .)}]
	 []
	 [{'case': (classe, de)}, {'nmod': (classe, bêta-lactamines)}]
	 []
	 [{'case': (bêta-lactamines, des)}]
	 []


On arrive à détecter des relations importantes via cette structure.

Nous allons maintenant créer des recognizers pour chacune de ces relations.

In [288]:
from __future__ import annotations
from dataclasses import dataclass, field 
import re
from rich import print  as rprint
from spacy.tokens.token import Token
from typing import Dict, List, Optional

@dataclass
class Dependency:
	"""Node of a dependency tree.

	Attributes:
		token: The token wrapped by this node; ``None`` until one is given.
		head: The parent node; ``None`` while no parent is linked.
		children: Child nodes, grouped in lists under a string key.
	"""
	token: Token = field(default=None)
	head: Dependency = field(default=None)
	# default_factory gives every node its own dict instead of a shared one.
	children: dict[str, list[Dependency]] = field(default_factory=dict)

	
@dataclass
class Relation:
	"""A relation between a subject (``sujet``) and an object (``objet``).

	``start`` and ``end`` each pack three offsets into a single string of the
	form ``"<sujet>;<pattern>;<objet>"``; use :meth:`set_start_and_end` to
	write them and :meth:`get_start_end` to read one pair back.
	"""
	pattern:		str
	relation_type:	str
	sujet:			str
	objet:			str
	start:			str = '' # only content source
	end:			str = '' # only content source
	source:			str = '' # infobox or content

	def set_start_and_end(self, sujet : tuple[int,int], pattern: tuple[int,int], objet: tuple[int,int]) -> None:
		"""Store the (start, end) pair of the subject, the pattern and the object.

		The packed strings are overwritten rather than appended to, so calling
		this method again replaces the previous offsets instead of corrupting them.
		"""
		self.start = f"{sujet[0]};{pattern[0]};{objet[0]}"
		self.end   = f"{sujet[1]};{pattern[1]};{objet[1]}"

	def get_start_end(self, attribute:str) -> tuple[int,int]:
		"""Return the (start, end) pair of ``"sujet"``, ``"pattern"`` or ``"objet"``.

		Args:
			attribute: One of ``"sujet"``, ``"pattern"``, ``"objet"``.

		Returns:
			The two offsets as integers.

		Raises:
			ValueError: if ``attribute`` is not one of the three names, or if
				``start``/``end`` do not hold three ``;``-separated integers
				(for instance when they were never set).
		"""
		names = ("sujet", "pattern", "objet")
		if attribute not in names:
			raise ValueError(f"unknown attribute {attribute!r}, expected one of {names}")

		starts = self.start.split(';')
		ends = self.end.split(';')
		if len(starts) != len(names) or len(ends) != len(names):
			raise ValueError("start/end offsets are not set on this relation")

		position = names.index(attribute)
		# int() raises ValueError too if a stored field is not numeric.
		return (int(starts[position]), int(ends[position]))

@dataclass
class BasicToken:
	"""Minimal token made of a text and an ``idx`` position."""
	text: str
	idx: int

	def __len__(self) -> int:
		"""Return the number of characters in ``text``."""
		size = len(self.text)
		return size

@dataclass
class CompositeToken:
	"""A main token merged with its modifier tokens into one multi-word unit.

	Every part (main token or modifier) only needs ``idx``, ``text`` and
	``lemma_`` attributes, so a modifier may itself be a ``CompositeToken``.
	"""
	main_token		: Token
	modifier_tokens	: List[Token]

	# Cache for the joined text; filled on first access to ``text``.
	_composite_word : str			= ""

	def _ordered_tokens(self) -> list:
		"""Return the main token and its modifiers sorted by ``idx``."""
		return sorted([self.main_token] + self.modifier_tokens, key=lambda t: t.idx)

	def _compute_text(self):
		"""Compute the full text of the composite token (parts in ``idx`` order)."""
		return " ".join([t.text for t in self._ordered_tokens()])

	@property
	def idx(self):
		"""Return the smallest ``idx`` among all parts.

		``text`` begins with the earliest part, so the start position must be
		that part's ``idx`` and not the main token's: a modifier can sit before
		the main token.
		"""
		return min(t.idx for t in self._ordered_tokens())

	@property
	def text(self):
		"""Return the composite text, computing and caching it on first use."""
		if len(self._composite_word) > 0:
			return self._composite_word

		self._composite_word = self._compute_text()
		return self._composite_word

	@property
	def lemma_(self):
		"""Return the composite lemma, in the same ``idx`` order as ``text``."""
		return " ".join([t.lemma_ for t in self._ordered_tokens()])

	@property
	def pos_(self):
		"""Return the part of speech of the main token."""
		return self.main_token.pos_

	@property
	def tag_(self):
		"""Return the tag of the main token."""
		return self.main_token.tag_

	def __len__(self):
		return len(self.text)

	def __str__(self):
		return self.text
	

In [312]:
class BaseRelationExtractor:
	"""Helpers shared by the relation extractors.

	Subclasses override ``relation_name`` and implement :meth:`extract`.
	"""
	relation_name = "undefined"
	relation_source = "content"

	def _get_composite_words(self, tree: Dependency) -> CompositeToken | Token | BasicToken:
		"""Resolve a node into the token (simple or composite) that represents it.

		A ``BasicToken`` is returned untouched. A node that has ``amod`` or
		``nmod`` children and no ``cop`` child becomes a ``CompositeToken``
		built recursively from those children; any other node yields its own token.
		"""
		if isinstance(tree, BasicToken):
			return tree

		has_modifier = 'amod' in tree.children or 'nmod' in tree.children
		if not has_modifier or 'cop' in tree.children:
			# Nothing to merge: the plain token stands for the node.
			return tree.token

		# Resolve 'amod' children first, then 'nmod', recursing for nested modifiers.
		modifiers = [
			self._get_composite_words(node)
			for label in ('amod', 'nmod')
			if label in tree.children
			for node in tree.children[label]
		]
		return CompositeToken(tree.token, modifiers)

	def create_relation(self, sujet: Dependency, pattern: Dependency, objet: Dependency, relation_type: str) -> Relation:
		"""Build a ``Relation`` with its texts and character positions.

		Each of the three arguments is first resolved with
		:meth:`_get_composite_words`; the relation's source is ``relation_source``.
		"""
		subject_tok = self._get_composite_words(sujet)
		object_tok = self._get_composite_words(objet)
		pattern_tok = self._get_composite_words(pattern)

		relation = Relation(
			sujet=subject_tok.text,
			objet=object_tok.text,
			pattern=pattern_tok.text,
			relation_type=relation_type,
			source=self.relation_source,
		)

		subject_span = self._get_position(subject_tok)
		pattern_span = self._get_position(pattern_tok)
		object_span = self._get_position(object_tok)
		relation.set_start_and_end(sujet=subject_span, pattern=pattern_span, objet=object_span)

		return relation

	def _get_position(self, token: CompositeToken | Token) -> tuple[int,int]:
		"""Return ``(idx, idx + len(text))`` for the given token."""
		begin = token.idx
		finish = token.idx + len(token.text)
		return (begin, finish)

	def _check_children_keys(self, keys: set, tree: Dependency) -> bool:
		"""Tell whether every key in ``keys`` is a key of ``tree.children``."""
		return all(key in tree.children for key in keys)

	def extract(self, tree: Dependency, known_relations : list[Relation]) -> list[Relation] | None:
		"""Extract the relations this extractor handles from the given node.

		The base implementation always raises ``NotImplementedError``.
		"""
		raise NotImplementedError


class GenericExtractor(BaseRelationExtractor):
	"""Emit an ``r_isa`` relation for a node with both ``nsubj`` and ``cop`` children."""
	relation_name = "r_isa"

	def extract(self, tree: Dependency, known_relations : list[Relation]) -> list[Relation] | None:
		"""Return one relation (first ``nsubj`` -> node, via the first ``cop``) or ``None``."""
		if not self._check_children_keys({'nsubj', 'cop'}, tree):
			return None

		first_subject = tree.children['nsubj'][0]
		copula = tree.children['cop'][0]
		# The node itself plays the object role.
		return [self.create_relation(first_subject, copula, tree, self.relation_name)]

class SynonymeExtractor(BaseRelationExtractor):
	"""Emit an ``r_syn`` relation for a node coordinated to its first conjunct by "ou"."""
	relation_name = "r_syn"

	def extract(self, tree: Dependency, known_relations : list[Relation]) -> list[Relation] | None:
		"""Return one ``r_syn`` relation, or ``None`` when the pattern does not match.

		The pattern matches when the node has a ``conj`` child, and the first
		such child has a ``cc`` child whose text is "ou" (case-insensitive).
		The conjunct is the subject, the node is the object.
		"""
		if not self._check_children_keys({'conj'}, tree):
			return None

		objet = tree
		sujet = tree.children['conj'][0]

		# Nothing guarantees the conjunct has a 'cc' child; indexing the key
		# directly raised KeyError in that case.
		cc_children = sujet.children.get('cc')
		if not cc_children:
			return None

		pattern = cc_children[0]
		if pattern.token.text.lower() != "ou":
			return None

		rel = self.create_relation(sujet, pattern, objet, self.relation_name)
		return [rel]

class HeritageExtractor(BaseRelationExtractor):
	"""Copy already-known relations onto the first conjunct of an "et" coordination."""

	def extract(self, tree: Dependency, known_relations : list[Relation]) -> list[Relation] | None:
		"""Return the relations inherited by the node's first conjunct.

		Applies only to a node that has a head, has a ``conj`` child, and whose
		first ``conj`` child has a ``cc`` child reading "et" (case-insensitive).
		Every relation in ``known_relations`` whose subject or object text
		equals the node's token text is duplicated with the conjunct taking the
		node's place; the other side and the pattern are rebuilt as
		``BasicToken`` objects from the stored text and start offset.

		Returns:
			The list of new relations (possibly empty), or ``None`` when the
			pattern does not match.
		"""
		if tree.head is None:
			return None
		if not self._check_children_keys({'conj'}, tree):
			return None

		parent = tree
		et_child_dep = tree.children['conj'][0]

		# Nothing guarantees the conjunct has a 'cc' child; indexing the key
		# directly raised KeyError in that case.
		cc_children = et_child_dep.children.get('cc')
		if not cc_children or cc_children[0].token.text.lower() != "et":
			return None

		parent_text = parent.token.text
		relations = []
		for infered_rel in known_relations:
			parent_is_sujet = infered_rel.sujet == parent_text
			parent_is_objet = infered_rel.objet == parent_text
			if not (parent_is_sujet or parent_is_objet):
				continue

			# The conjunct replaces whichever side matched the parent's text.
			sujet = et_child_dep if parent_is_sujet else BasicToken(infered_rel.sujet, int(infered_rel.get_start_end("sujet")[0]))
			objet = et_child_dep if parent_is_objet else BasicToken(infered_rel.objet, int(infered_rel.get_start_end("objet")[0]))
			pattern = BasicToken(infered_rel.pattern, int(infered_rel.get_start_end("pattern")[0]))

			relations.append(self.create_relation(sujet, pattern, objet, infered_rel.relation_type))

		return relations


class CaracteristicExtractor(BaseRelationExtractor):
	"""Emit one ``r_caract`` relation per ``amod`` child of a copular node."""
	relation_name = "r_caract"

	def extract(self, tree: Dependency, known_relations : list[Relation]) -> list[Relation] | None:
		"""Return one relation per ``amod`` child, or ``None`` when the pattern is absent.

		The node must have ``amod``, ``nsubj`` and ``cop`` children. The first
		``nsubj`` child is the subject, the first ``cop`` child is the pattern,
		and each ``amod`` child in turn is the object.
		"""
		if not self._check_children_keys({'amod', 'nsubj', 'cop'}, tree):
			return None

		subject_node = tree.children['nsubj'][0]
		copula_node = tree.children['cop'][0]

		return [
			self.create_relation(subject_node, copula_node, adjective_node, self.relation_name)
			for adjective_node in tree.children['amod']
		]

# a faire
class AgentActionExtractor(BaseRelationExtractor):
	"""
	r_agent-1 :
	What can the agent do? A cat eats, an antibiotic destroys, ... Subject > Action.

	Dependency labels of interest: nsubj:pass and obl:agent

	NOTE(review): the body below matches on amod/nsubj/cop and does not use
	the labels listed above yet.
	"""
	relation_name = "r_agent-1"

	def extract(self, tree: Dependency, known_relations : list[Relation]) -> list[Relation] | None:
		"""Return one relation per ``amod`` child of a node that also has ``nsubj`` and ``cop``."""
		if not self._check_children_keys({'amod', 'nsubj', 'cop'}, tree):
			return None

		agent_node = tree.children['nsubj'][0]
		copula_node = tree.children['cop'][0]

		found = []
		for modifier_node in tree.children['amod']:
			found.append(self.create_relation(agent_node, copula_node, modifier_node, self.relation_name))
		return found

class AgainstExtractor(BaseRelationExtractor):
	"""Emit an ``r_against`` relation for a verb with ``nsubj`` and ``obj`` children."""
	relation_name = "r_against"

	def extract(self, tree: Dependency, known_relations : list[Relation]) -> list[Relation] | None:
		"""Return one relation (first ``nsubj`` -> first ``obj``, via the verb) or ``None``."""
		is_verb = tree.token.pos_ == "VERB"
		if not (is_verb and self._check_children_keys({'nsubj', 'obj'}, tree)):
			return None

		actor = tree.children['nsubj'][0]
		target = tree.children['obj'][0]
		# The verb node itself serves as the pattern.
		return [self.create_relation(actor, tree, target, self.relation_name)]
	
class RoleTelicExtractor(BaseRelationExtractor):
	"""
	r_telic_role :
	The telic role states the function of the noun or verb.

		pénicillines sont utilisées dans le traitement d'infections bactériennes.
		pénicillines r_telic_role traitement infections bactériennes
	"""
	relation_name = "r_telic_role"

	def extract(self, tree: Dependency, known_relations : list[Relation]) -> list[Relation] | None:
		"""Return one relation for a VERB/NOUN node with ``nsubj:pass`` and ``obl:mod`` children.

		The first ``nsubj:pass`` child is the subject, the first ``obl:mod``
		child is the object and the node itself is the pattern.
		"""
		allowed_pos = {"VERB", "NOUN"}
		if tree.token.pos_ not in allowed_pos:
			return None
		if not self._check_children_keys({'nsubj:pass', 'obl:mod'}, tree):
			return None

		passive_subject = tree.children['nsubj:pass'][0]
		oblique = tree.children['obl:mod'][0]
		return [self.create_relation(passive_subject, tree, oblique, self.relation_name)]

In [285]:

class ContentAnalyzer:
	"""Run every relation extractor over the dependency tree of each sentence."""

	def __init__(self, nlp_model):
		"""Store the NLP pipeline and set up the extractors.

		Args:
			nlp_model: Callable pipeline; ``nlp_model(text)`` must return an
				object exposing ``sents``.
		"""
		self.nlp = nlp_model
		# Tokens with these parts of speech are left out of the dependency tree.
		self.rejected_pos = ("PUNCT", "DET")
		# Order matters: each extractor sees the relations found by the ones
		# before it, and HeritageExtractor relies on those known relations.
		self.extractors: list[BaseRelationExtractor] = [GenericExtractor(), SynonymeExtractor(), CaracteristicExtractor(), HeritageExtractor(), RoleTelicExtractor(), AgainstExtractor() ]

	@staticmethod
	def _extract_marked_entities(text):
		"""Return the contents of every ``[...]`` group that holds no nested bracket.

		Declared static: the original signature had no ``self``, so calling it
		on an instance raised TypeError.
		"""
		return re.findall(r"\[([^\[\]]+)\]", text)

	def build_dependency_tree(self, sent) -> Optional[Dependency]:
		"""Build a dependency tree for one sentence.

		Returns:
			The node of the token whose ``dep_`` is "ROOT", or ``None`` when no
			kept token carries that label (e.g. an empty sentence).
		"""
		# First pass: one Dependency node per kept token.
		token_to_dep = {
			token: Dependency(token=token)
			for token in sent
			if token.pos_ not in self.rejected_pos
		}
		root = None

		# Second pass: link heads and children between the kept nodes.
		for token, dep_obj in token_to_dep.items():
			if token.dep_ == "ROOT":
				root = dep_obj

			if token.head != token and token.head in token_to_dep:
				dep_obj.head = token_to_dep[token.head]

			for child in token.children:
				# Rejected tokens have no node, so this lookup also filters them out.
				child_dep = token_to_dep.get(child)
				if child_dep is not None:
					dep_obj.children.setdefault(child.dep_, []).append(child_dep)

		return root

	def walk_tree(self, tree: Dependency, known_relations=None) -> list[Relation]:
		"""Apply every extractor to ``tree`` and, depth-first, to its descendants.

		Args:
			tree: Node to start from; ``None`` yields an empty list.
			known_relations: List shared across the whole walk and extended in
				place with every relation found. A new list is created only
				when ``None`` is passed.

		Returns:
			All relations found in this subtree.
		"""
		if tree is None:
			return []
		# Test against None, not truthiness: `known_relations or []` replaced a
		# still-empty shared list with a new one, so sibling subtrees never saw
		# each other's relations.
		if known_relations is None:
			known_relations = []

		results = []
		for extractor in self.extractors:
			rels = extractor.extract(tree, known_relations)
			if rels:
				results.extend(rels)
				known_relations.extend(rels)

		for children in tree.children.values():
			for child in children:
				results.extend(self.walk_tree(child, known_relations))

		return results

	def analyse_content(self, content: str, verbose = False) -> list[Relation]:
		"""Extract the relations of every sentence in ``content``.

		Args:
			content: Raw text to analyse.
			verbose: When true, print each sentence, its tree root and its relations.

		Returns:
			The relations of all sentences, in sentence order.
		"""
		if verbose:
			# Escape the sentence so its own brackets are not read as rich markup.
			from rich.markup import escape

		doc = self.nlp(content)
		relations = []
		for sent in doc.sents:
			sent_root = self.build_dependency_tree(sent)
			sent_relations = self.walk_tree(sent_root)
			relations.extend(sent_relations)
			if verbose:
				# The markup tags need rich's print; builtin print shows them literally.
				rprint(f"[green bold]{escape(str(sent))}[/green bold]")
				print(sent_root)
				for relation in sent_relations:
					print(f"{relation.sujet} → {relation.relation_type} → {relation.objet}")
		return relations

In [313]:

analyzer = ContentAnalyzer(nlp)
# text = """La grippe ou influenza est une maladie infectieuse et contagieuse fréquente."""
# A space is needed after each full stop: the text previously read "mort.Les"
# with none, which risks the last two sentences being treated as one.
text = """Les pénicillines et benzathine sont des antibiotiques de la classe des bêta-lactamines. La grippe ou influenza est une maladie infectieuse et contagieuse fréquente. La pénicilline détruit la paroi cellulaire des bactéries, ce qui entraîne leur mort. Les pénicillines sont utilisées dans le traitement d'infections bactériennes. """
relations = analyzer.analyse_content(text, verbose = True)