# NLP+

Fichier pris en entrée :
- clean_{date}.json -> provient du notebook **nettoyeur**

Fichiers de sortie produits par le notebook :
- org.txt -> point de référence pour comparer la qualité du nettoyage
- NLP_{type_pipeline}_{date}.html -> **fichier HTML des résultats de la NER**


In [23]:
import pandas as pd

import spacy
from spacy import displacy
from collections import Counter

# Pipelines au choix small, medium, large (du - au + précis)
# import fr_core_news_sm
import fr_core_news_md
# import fr_core_news_lg
nlp = fr_core_news_md.load()

from pprint import pprint

from datetime import date
import time

import json

In [24]:
# User-tunable settings for this run.
# Cleaned corpus used as input.
source_json = "clean_11-Dec-22.json"
# Name of the spaCy pipeline; keep it in sync with the import/load done earlier.
pipeline = 'fr_core_news_md'
# Number of documents to process.
nb_a_traiter = 20

In [25]:
# Load the cleaned corpus from the JSON file (one row per top-level key).
df = pd.read_json(source_json, orient='index')

# Run date, used in the report header and in the output file name.
# NOTE(review): this rebinds `date`, shadowing the `datetime.date` class
# imported earlier; the name is kept because later cells may rely on it.
# `%b` is the locale's abbreviated month name.
date = time.strftime("%d-%b-%y")

# HTML legend describing entity labels.
# NOTE(review): this legend lists the OntoNotes-style labels of spaCy's English
# pipelines; the French fr_core_news_* pipelines document only LOC, MISC, ORG
# and PER -- confirm and adapt the legend if needed.
labels_description = """
<p><strong>PERSON:</strong> People, including fictional.&emsp;<strong>NORP:</strong> Nationalities or religious or political groups.&emsp;
<strong>FAC:</strong> Buildings, airports, highways, bridges, etc.&emsp;<strong>ORG:</strong> Companies, agencies, institutions, etc.&emsp;
<strong>GPE:</strong> Countries, cities, states.&emsp;<strong>LOC:</strong> Non-GPE locations, mountain ranges, bodies of water.&emsp;
<strong>PRODUCT:</strong> Objects, vehicles, foods, etc. (Not services.)&emsp;<strong>EVENT:</strong> Named hurricanes, battles, wars, sports events, etc.&emsp;
<strong>WORK_OF_ART:</strong> Titles of books, songs, etc.&emsp;<strong>LAW:</strong> Named documents made into laws.&emsp;<strong>LANGUAGE:</strong> Any named language.&emsp;
<strong>DATE:</strong> Absolute or relative dates or periods.&emsp;<strong>TIME:</strong> Times smaller than a day.&emsp;<strong>PERCENT:</strong> Percentage, including "%".&emsp;
<strong>MONEY:</strong> Monetary values, including unit.&emsp;<strong>QUANTITY:</strong> Measurements, as of weight or distance.&emsp;<strong>ORDINAL:</strong> "first", "second", etc.&emsp;
<strong>CARDINAL:</strong> Numerals that do not fall under another type.</p>
"""

# Global report header: run metadata followed by the label legend.
heading = f"""
    <p><strong>date:</strong> {date}</p>
    <p><strong>source:</strong> {source_json}</p>
    <p><strong>pipeline:</strong> {pipeline}</p>
    <p><strong>quantity processed:</strong> {nb_a_traiter}</p>
    <hr>
    {labels_description}
    """

# Output file name: NLP_<last two characters of the pipeline name>_<date>.
nom_fichier = f"NLP_{pipeline[-2:]}_{date}"

# (Re)create a clean output file containing only the header. The encoding is
# explicit so the file does not depend on the platform's default encoding.
with open(f"{nom_fichier}.html", "w", encoding="utf-8") as fichier:
    fichier.write(heading)

In [26]:
# Custom highlight colours for the entity rendering, keyed by entity label.
# ORDINAL and CARDINAL intentionally share the same colour.
colors = dict(
    PER="#97C7E8",
    ORG="#A4DBA4",
    GPE="#F2937C",
    LOC="#AE9DF2",
    EVENT="#E8BC76",
    WORK_OF_ART="#DB99DB",
    MISC="#F2A99D",
    DATE="#A7F2BD",
    ORDINAL="#E8D3A2",
    CARDINAL="#E8D3A2",
)

In [27]:
# Build the HTML report: one header + one displaCy entity rendering per document.
# The output file is opened once, in append mode so the global header already
# written to it is preserved, and with an explicit encoding so accented text
# does not depend on the platform's default encoding.
with open(f"{nom_fichier}.html", 'a', encoding="utf-8") as fichier:
    # Use range(len(df.index)) instead to process every row.
    for i in range(nb_a_traiter):
        title = df.loc[i, 'area_title']
        text = df.loc[i, 'area_text']

        doc = nlp(text)

        # Number of sentences in the document (counted without materialising
        # the tokens of every sentence).
        sentence_tokens = sum(1 for _ in doc.sents)

        # Alternative: set doc.user_data["title"] = f"index = {i} | " + title
        # to have the title written inside the displaCy rendering itself.

        # displaCy entity rendering, returned as an HTML string.
        # NOTE(review): page=True appears to wrap each rendering in a complete
        # HTML page, so the output file holds one page per document -- confirm
        # this is intended.
        html = displacy.render(doc, style="ent", jupyter=False, page=True, options={"colors": colors})

        # Per-document header.
        headings = f"""
    <hr>
    <p><strong>index:</strong> {i}</p>
    <p><strong>title:</strong> {title}</p>
    <p><strong>number of sentences:</strong> {sentence_tokens}</p>
    """

        # Write the per-document header, then the rendering.
        fichier.write(headings)
        fichier.write(html)

In [28]:
# Sentence count of the most recently parsed document (`doc`):
# collect the token texts of each sentence, then print how many sentences there are.
sentence_tokens = [
    [word.text for word in sentence]
    for sentence in doc.sents
]
print(len(sentence_tokens))

44


In [29]:
# Parse a single document (row label 301) and list the persons it mentions.
text = df.loc[301, "area_text"]
doc = nlp(text)

# Unique (text, label) pairs for the entities labelled "PER";
# dict.fromkeys drops duplicates while keeping first-occurrence order.
pers = list(
    dict.fromkeys(
        (entity.text, entity.label_)
        for entity in doc.ents
        if entity.label_ == "PER"
    )
)

# Bare expression: shown as the cell output when run in the notebook.
pers


[('A. Meillet', 'PER'),
 ('M. Meillet', 'PER'),
 ('M. Brugmann', 'PER'),
 ('J. Vendhyes', 'PER')]

In [33]:
# Load the author list. The file is decoded explicitly as UTF-8 so that
# accented names do not depend on the platform's default encoding.
with open("data/liste_auteurs_JSON.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Walk down the JSON tree to the result rows: they are stored under
# 'results' -> 'bindings', hence the variable name.
bindings = data['results']['bindings']

# Master list: one [document URI, person URI, name] row per binding.
# Every binding is expected to carry 'Document', 'Creator' and 'Name' entries,
# each with a 'value' field; a missing one raises KeyError.
total = [
    [
        binding['Document']['value'],
        binding['Creator']['value'],
        binding['Name']['value'],
    ]
    for binding in bindings
]

# Conversion to a DataFrame.
colonnes = ['URI_doc', 'URI_person', 'name']
df_JSON_total = pd.DataFrame(total, columns=colonnes)
