In [77]:
import re

def find_context(text, article):
    """Return the text surrounding a mention of ``article`` in ``text``.

    Up to 80 characters on each side of the mention are captured (``.`` does
    not cross line breaks), joined with a single space and lower-cased, so the
    result can be searched for a code name such as "code du travail".

    Parameters
    ----------
    text : str
        Raw text to search.
    article : str
        Article mention exactly as written in ``text`` (e.g. "L. 3121-33").

    Returns
    -------
    str
        Lower-cased "<before> <after>" context, or an empty string when the
        mention cannot be found (a message is printed in that case).
    """
    # {0,80} rather than {1,80}: a mention sitting at the very start or end of
    # the text (or of a line) has no neighbouring character and must still match.
    before = r"(?P<before>.{0,80})"
    after = r"(?P<after>.{0,80})"
    # The mention is data, not a pattern: "." in "L. 3121" must match literally.
    regex_article = re.compile(before + re.escape(article) + after)
    match = regex_article.search(text)
    if match is None:
        print("article menntion not found")
        # An empty context keeps callers that test ``code in context`` working.
        return ""
    return " ".join((match.group("before"), match.group("after"))).lower()
        
def find_articles_position(text):
    """Find article mentions in a raw string.

    A mention is a letter R, L or D (either case) at a word boundary,
    optionally followed by a whitespace, a dot, or a dot and a whitespace,
    then 3 to 4 digits and up to two "-N"/"-NN" suffixes
    (e.g. "L3121-33", "l. 783-2", "R 4121-1-2").

    Parameters
    ----------
    text : str
        Raw text to scan.

    Returns
    -------
    list
        ``[articles, positions]``: a tuple of the matched strings and a tuple
        of their ``(start, end)`` spans, in order of appearance. Both tuples
        are empty when nothing matches, so the result can always be unpacked
        into two values.
    """
    # Raw string: "\s" / "\d" are invalid escape sequences in a normal literal.
    # The leading \b stops the final letter of a word from being read as an
    # article prefix (e.g. "quand 2019" must not yield "d 2019").
    regex_article = re.compile(
        r"\b([RLDrld](?:\s|\.|\.\s)?\d{3,4}(?:-\d{1,2})?(?:-\d{1,2})?)"
    )
    matches = [(found.group(), found.span()) for found in regex_article.finditer(text)]
    if not matches:
        # zip(*[]) would give an empty list that callers cannot unpack in two.
        return [(), ()]
    return list(zip(*matches))
    
def find_articles_code(text, code_strings = ["code du travail"]):
    """Detect article mentions and the code names found in their surroundings.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    code_strings : list of str
        Lower-case code names to look for in the context of each mention.
        The default list is only read, never mutated.

    Returns
    -------
    tuple
        ``(references, (articles, positions))`` where ``references`` is a list
        of ``(normalized_article, codes_found)`` pairs, ``articles`` the raw
        mentions and ``positions`` their ``(start, end)`` spans. When the text
        holds no mention the result is ``([], ((), ()))``.

    Notes
    -----
    The context of a mention is looked up by searching its text again, not by
    its position, so two identical mentions share the same context.
    """
    found = find_articles_position(text)
    # Nothing detected: the finder result cannot be unpacked into two sequences.
    if not found or not found[0]:
        return [], ((), ())
    articles, positions = found

    # find_context may yield None when it cannot locate the mention; fall back
    # to an empty context so the code detection does not fail on it.
    contextes = [find_context(text, art) or "" for art in articles]
    codes = [detect_code(context, code_strings) for context in contextes]
    articles_normalized = [normalize_digit(art) for art in articles]

    return list(zip(articles_normalized, codes)), (articles, positions)

def detect_code(context, code_strings):
    """Return the set of code names from ``code_strings`` present in ``context``.

    Parameters
    ----------
    context : str or None
        Text surrounding an article mention. ``None`` (no context available)
        is treated as "no code detected".
    code_strings : iterable of str
        Code names to look for; matching is a plain, case-sensitive substring
        test.

    Returns
    -------
    set of str
        The code names found, possibly empty.
    """
    # ``code in None`` raises TypeError; a missing context simply has no code.
    if context is None:
        return set()
    return {code for code in code_strings if code in context}

In [78]:
def replace_space(text):
    """Strip every whitespace character (non-breaking spaces and line breaks included)."""
    whitespace = re.compile(r"\xa0|\n|\s")
    return whitespace.sub("", text)

def replace_lower(text):
    """Return ``text`` in upper case (despite the name, lower-case letters are replaced)."""
    uppercased = text.upper()
    return uppercased

def replace_point(text):
    """Remove every "." from an article mention.

    Parameters
    ----------
    text : str
        Article mention, e.g. "L.783-2".

    Returns
    -------
    str
        The same text without any dot, e.g. "L783-2".
    """
    # A literal replacement needs no regex, and it drops the non-raw "\."
    # pattern, which is an invalid escape sequence in a normal string literal.
    return text.replace(".", "")

def normalize_digit(text):
    """Turn a loosely written article reference into its Legifrance-ready form.

    Whitespace is stripped first, then letters are upper-cased, then dots are
    dropped, e.g. "l. 783-2" --> "L783-2".
    """
    return replace_point(replace_lower(replace_space(text)))
 
def build_url(legiart, cid_texte="LEGITEXT000006072050", date_texte="20191231"):
    """Turn a LEGIARTI id into a Legifrance article URL.

    Parameters
    ----------
    legiart : str
        Article identifier, e.g. "LEGIARTI000006900785".
    cid_texte : str
        Value of the ``cidTexte`` query parameter. The default is the value
        that used to be hard-coded (presumably the id of the Code du travail —
        confirm before reusing for another code).
    date_texte : str
        Value of the ``dateTexte`` query parameter (looks like YYYYMMDD —
        confirm). The default is the previously hard-coded value.

    Returns
    -------
    str
        The ``affichCodeArticle.do`` URL for the article.
    """
    return (
        "https://www.legifrance.gouv.fr/affichCodeArticle.do"
        "?idArticle=" + legiart
        + "&cidTexte=" + cid_texte
        + "&dateTexte=" + date_texte
    )

In [99]:
# Sample paragraph used to try the linker: it holds several article mentions
# (one of them written "L. L3121-31") and names two different codes.
text = """Une pause de 20 minutes est obligatoire au bout de six heures de travail échues.
     Cette obligation est énoncée aux l’article L3121-33 et L. L3121-31 du Code du travail:
    mais toutefois cela contredit l'article  L3187-1 du code de la sécurité sociale"""

In [8]:
import json
# Mapping from a normalized article reference (e.g. "L1121-1") to its LEGIARTI id.
# The encoding is given explicitly: JSON files are UTF-8, while open() would
# otherwise fall back to the platform default encoding.
with open("../data/mapping-articles-cdtn.json", "r", encoding="utf-8") as f:
    code_json = json.load(f)
    
def find_article_id(article):
    """Look up ``article`` in ``code_json`` and return the matching Legifrance URL.

    Returns None when the mapping has no (non-empty) id for the article: the
    article is then either obsolete or not part of the detected code.
    """
    legi_id = code_json.get(article)
    return build_url(legi_id) if legi_id else None

In [101]:
def make_href_mark(url, text):
    """Build the HTML anchor ``<a href="url">text</a>``.

    Both pieces are HTML-escaped so that a quote, "&" or "<" coming from the
    URL or from the linked text cannot break out of the attribute or inject
    markup.

    Parameters
    ----------
    url : str
        Link target, placed in the ``href`` attribute.
    text : str
        Visible link text.

    Returns
    -------
    str
        The anchor element as a string.
    """
    from html import escape  # local import: the cell stays self-contained

    return '<a href="' + escape(url, quote=True) + '">' + escape(text) + '</a>'

def put_links(text):
    """Wrap every article mention of ``text`` that has a known id in an HTML link.

    Mentions whose normalized form has no URL are left untouched. The link
    text is the normalized article reference, not the original mention.
    """
    references, (mentions, spans) = find_articles_code(text)

    # Only the normalized article is kept; change this line to add new codes.
    normalized = [reference[0] for reference in references]
    urls = [find_article_id(name) for name in normalized]

    linked_text, shift = text, 0
    for url, name, mention, span in zip(urls, normalized, mentions, spans):
        if not url:
            continue
        anchor = make_href_mark(url, name)
        # ``shift`` carries the length change caused by the links already inserted.
        linked_text, shift = add_single_markup(linked_text, (anchor, mention, span), shift)

    return linked_text

In [102]:
def add_single_markup(text, markup, offset):
    """Replace one article mention by its markup and return the updated offset.

    ``markup`` is ``(markup_text, original_mention, (start, end))`` where the
    span refers to the text before any replacement; ``offset`` is the
    cumulative length change of the replacements already applied.

    Returns ``(new_text, new_offset)``.
    """
    anchor, mention, (begin, stop) = markup
    begin += offset
    stop += offset
    patched = "".join((text[:begin], anchor, text[stop:]))
    new_offset = offset + len(anchor) - len(mention)
    return patched, new_offset

In [103]:
from IPython.display import HTML
# Render the linked text as HTML in the cell output.
HTML(put_links(text))

In [15]:
# Show the text from character offset 4275 to its end.
text[4275:]

'r 122 jours minimum pour ouvrir des droits .\n\nVoir toute la réglementation\xa0Unedic : www.unedic.org\n\net en\xa0particulier la fiche 5 - page 80\n\nEXTRAIT : \nCette condition n?est pas non plus opposable aux salariés qui ne justifient pas de 91 jours ou 455 heures de travail depuis la date de la dernière ouverture de droits ou la dernière date à laquelle les allocations leur ont été refusées.\n\nde même que le réglement Général Unedic\xa0: www.unedic.org\n\nExtrait : e)\xa0n\'avoir pas quitté volontairement, sauf cas prévus par un accord d\'application, leur dernière activité professionnelle salariée, ou une activité professionnelle salariée autre que la dernière dès lors que, depuis le départ volontaire, il ne peut être justifié d\'une période d\'affiliation d\'au moins 91\xa0jours ou d\'une période de travail d\'au moins 455\xa0heures\xa0;\n\nBonne lecture \n SUITE MILOU\n\nEtant entendu que si votre CDD ne fait que 120 jours , Vous\xa0pourrez bénéficier du chomage calculé sur 

In [94]:
# NOTE: the second value returned by find_articles_code is the pair
# (articles, positions), so `positions` holds both here, not only the spans.
articles_references, positions = find_articles_code(text)

In [73]:
# Raw finder result: matched mentions and their (start, end) spans.
res = find_articles_position(text)

In [117]:
articles_references

[('R5424-2', {'code du travail'}),
 ('R5424-2', {'code du travail'}),
 ('L5422-2', set()),
 ('L5424-1', set()),
 ('L5312-1', set()),
 ('L5427-1', set()),
 ('L5424-1', set())]

In [95]:
articles_references

[('R5424-2', {'code du travail'}),
 ('R5424-2', {'code du travail'}),
 ('L5422-2', set()),
 ('L5424-1', set()),
 ('L5312-1', set()),
 ('L5427-1', set()),
 ('L5424-1', set())]

In [97]:
t = "abc"

In [100]:
# Strings do not support item assignment: this raises the TypeError shown below.
t[1:2] = "aer"

TypeError: 'str' object does not support item assignment

In [111]:
# Quick check of make_href_mark with placeholder values.
link_text = "yo"
url = "url"
make_href_mark(url, link_text)

'<a href="url">yo</a>'

In [109]:
re

In [1]:
# Reload edited project modules automatically so changes are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
from sequellecode import SequellText
# Built from the same mapping file as code_json in an earlier cell
# (presumably loaded as the article -> LEGIARTI id mapping — confirm in sequellecode).
st = SequellText("../data/mapping-articles-cdtn.json")

In [14]:
# Text made of a bare mention with nothing around it: prints "article menntion
# not found" and then fails with the TypeError shown below (presumably the
# context lookup returned None — confirm in sequellecode).
st.put_links("L1121-1")

article menntion not found


TypeError: argument of type 'NoneType' is not iterable

In [23]:
# zip stops at the shortest input, so the unpaired "b" is dropped.
list(zip(*[['a', "b"], ['a',]]))

[('a', 'a')]

In [11]:
code_json["L1121-1"]

'LEGIARTI000006900785'

In [4]:
# `iter` is the builtin function itself, not an iterable: raises the TypeError shown below.
list(iter)

TypeError: 'builtin_function_or_method' object is not iterable

In [8]:
import re

In [15]:
# Article pattern written as a raw string; checks that a bare mention
# ("L1121-1" alone, no surrounding text) is still matched.
regex_article = re.compile(r"((?:R|L|D|l|r|d)(?:\s|\.|\.\s)?\d{3,4}(?:-\d{1,2})?(?:-\d{1,2})?)")
v = re.finditer( regex_article, "L1121-1")

In [16]:
list(v)

[<_sre.SRE_Match object; span=(0, 7), match='L1121-1'>]

In [7]:
list(v)

[]

In [31]:
def find_context(text, article):
    """Return the text surrounding a mention of ``article`` in ``text``.

    NOTE(review): this redefines ``find_context`` from an earlier cell; in a
    live kernel the definition executed last is the one in effect.

    Up to 80 characters on each side of the mention are captured (``.`` does
    not cross line breaks), joined with a single space and lower-cased. A
    mention with nothing around it therefore yields " ".

    Parameters
    ----------
    text : str
        Raw text to search.
    article : str
        Article mention exactly as written in ``text`` (e.g. "L. 3121-33").

    Returns
    -------
    str
        Lower-cased "<before> <after>" context, or an empty string when the
        mention cannot be found (a message is printed in that case).
    """
    # {0,80} lets either side be empty, like the optional groups it replaces.
    before = r"(?P<before>.{0,80})"
    after = r"(?P<after>.{0,80})"
    # The mention is data, not a pattern: "." in "L. 3121" must match literally.
    regex_article = re.compile(before + re.escape(article) + after)
    match = regex_article.search(text)

    if match is None:
        print("yolo")
        # An empty context keeps callers that test ``code in context`` working.
        return ""
    return " ".join((match.group("before"), match.group("after"))).lower()

In [33]:
# Mention with no surrounding text: both optional groups are empty, so the
# joined context is a single space.
find_context("L1121-1", "L1121-1")

' '

In [1]:
from juritagger.juritagger import JuriMatcher

In [2]:
# Keyword options forwarded to JuriMatcher
# (presumably the name of the spaCy model to load — confirm in juritagger).
opts = {
        "spacy_model" : "fr_core_news_md"
        }
jm = JuriMatcher(**opts)

In [4]:
# Tagging attempt on a short paragraph; currently fails with the spaCy
# "conflicting doc.ents" ValueError (E098) shown below.
text = """L'indemnisation du salarié dépend du nombre de jours de fermeture
de l'entreprise et du nombre de jours de congés JUR acquis par le salarié."""

matches, doc = jm.tag_doc(text, mode = "class")
matches

ValueError: [E098] Trying to set conflicting doc.ents: '(3, 4, 'PERS')' and '(3, 4, 'PERS')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.

In [1]:
import spacy

AttributeError: module 'cymem.cymem' has no attribute 'PyMalloc'

In [2]:
from sklearn.metrics import recall_score

In [3]:
recall_score?

In [4]:

def ranking_precision_score(y_true, y_score, k=10):
    """Precision at rank k.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels). Exactly two distinct values are
        expected; the larger one is taken as the positive label.
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    precision @k : float

    Raises
    ------
    ValueError
        If ``y_true`` holds more than two distinct labels.
    IndexError
        If ``y_true`` holds a single distinct label (there is then no second
        level to use as the positive label).
    """
    # numpy is not imported by any visible cell of this session; importing it
    # here keeps the function usable on a fresh kernel.
    import numpy as np

    # Accept plain lists: element-wise comparisons below need an array.
    y_true = np.asarray(y_true)
    unique_y = np.unique(y_true)

    if len(unique_y) > 2:
        raise ValueError("Only supported for two relevance levels.")

    # np.unique sorts its output, so index 1 is the larger of the two labels.
    pos_label = unique_y[1]
    n_pos = np.sum(y_true == pos_label)

    # Indices of the samples sorted by decreasing score; keep the top k.
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    n_relevant = np.sum(y_true == pos_label)

    # Divide by min(n_pos, k) such that the best achievable score is always 1.0.
    return float(n_relevant) / min(n_pos, k)

In [None]:
# Toy inputs. NOTE: y_true holds three distinct labels (0, 1, 2), which
# ranking_precision_score rejects with "Only supported for two relevance levels."
y_pred = [2, 3, 1, 0, 0]
y_true = [1, 2, 1, 0, 0]

In [5]:
import pandas as pd

In [13]:
# `sheet_name` is the current keyword; the old `sheetname` spelling was
# deprecated and later removed from pandas.
# NOTE(review): absolute, user-specific path — the cell only runs on the
# author's machine; consider moving the file under the project data directory.
a = pd.read_excel("/Users/armand/Downloads/partie_L_ancien_nouveau (1).xlsx", sheet_name="Sheet12")

In [14]:
a

Unnamed: 0,Texte,Ancienne référence,Nouvelle
0,,,référence
1,,art. L. 122-14-8,L. 1231-5
2,,art. L. 122-14-9,non repris
3,,art. L. 122-14-10,non repris
4,,art. L. 122-14-11,L. 1237-1
5,,art. L. 122-14-11,L. 1224-4
6,,art. L. 122-14-11,L. 1231-6
7,,art. L. 122-14-11,L. 1232-5
8,,art. L. 122-14-11,L. 1232-6
9,,art. L. 122-14-11,L. 1233-14
