## Preprocessing the JSON file


In [3]:
import json

# Preprocess the JSON file for NLP
def json_to_text(json_file):
    """Flatten a JSON export of articles into one plain-text string.

    Args:
        json_file: Path to a UTF-8 encoded JSON file. Iterating the decoded
            document must yield mappings that each provide the keys ``id``,
            ``slug``, ``title``, ``type``, ``dossierLabel``, ``summary``,
            ``readingTime``, ``publishedFrom`` and
            ``redactedByTeamRedactionInfo``.

    Returns:
        A single string with one labelled, multi-line section per article.
        Sections are separated by a ``---`` line surrounded by blank lines.
        An empty article list yields an empty string.

    Raises:
        KeyError: If an article lacks one of the keys listed above.
    """
    def render(article):
        # One "Label: value" line per field, each ending with a newline.
        return (
            f"ID: {article['id']}\n"
            f"Slug: {article['slug']}\n"
            f"Title: {article['title']}\n"
            f"Type: {article['type']}\n"
            f"Dossier: {article['dossierLabel']}\n"
            f"Summary: {article['summary']}\n"
            f"Reading Time: {article['readingTime']} min\n"
            f"Published: {article['publishedFrom']}\n"
            f"Redaction Info: {article['redactedByTeamRedactionInfo']}\n"
        )

    with open(json_file, 'r', encoding='utf-8') as handle:
        articles = json.load(handle)

    # Concatenate the rendered articles into the final corpus string.
    return "\n\n---\n\n".join(render(entry) for entry in articles)

# Build the plain-text corpus from the raw JSON export.
global_text = json_to_text("data.json")

# Persist the corpus to disk; the text-cleaning cell reads it back from this file.
with open("global_text.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write(global_text)


## Text preprocessing: basic cleaning and tokenization

In [None]:
import spacy
import re
from bs4 import BeautifulSoup
from collections import Counter

# Parameters of the cleaning pipeline, gathered in one place so they are easy to tune.
SPACY_MODEL = "fr_core_news_sm"
MIN_TOKEN_COUNT = 2      # inclusive: tokens that occur only once are dropped
MAX_TOKEN_COUNT = 999    # inclusive: tokens that occur 1000 times or more are dropped

# Load the spaCy model. The download only runs when the model is not installed
# yet, so re-running this cell does not hit the network every time.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

# Raise spaCy's limit on input length (in characters) so the whole corpus
# can be processed in a single nlp() call.
nlp.max_length = 5000000

# Load the text
with open("global_text.txt", "r", encoding="utf-8") as file:
    text = file.read()

# 1. Convert to lowercase
text = text.lower()

# 2. Remove HTML tags
text = BeautifulSoup(text, "html.parser").get_text()

# 3. Remove URLs and email addresses
# (`http\S+` already covers https links, so no separate alternative is needed.)
text = re.sub(r'http\S+|www\S+', '', text)
text = re.sub(r'\S+@\S+', '', text)

# 4. Remove punctuation, digits and special characters
# The previous ASCII-only class [^a-zA-Z\s] also deleted every accented letter,
# which mangles French words ("été" -> "t"). `\w` is Unicode-aware for str
# patterns, so accented letters are kept. Apostrophes are kept so that elided
# forms ("l'article") are not fused into one word; tokens still containing an
# apostrophe are discarded by the `is_alpha` filter in step 5. Removed
# characters become a space so that words joined by punctuation
# (e.g. hyphens) are not glued together.
text = re.sub(r"[^\w\s'’]|[\d_]", ' ', text)

# 5. NLP processing with spaCy for tokenization, stop word removal, lemmatization
doc = nlp(text)
tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

# 6. Remove rare or too frequent words
word_freq = Counter(tokens)
tokens = [
    token for token in tokens
    if MIN_TOKEN_COUNT <= word_freq[token] <= MAX_TOKEN_COUNT
]

# Reconstruct the cleaned text
cleaned_text = ' '.join(tokens)

print(cleaned_text)



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Part-of-speech tagging, lemmatization, and chunking

In [None]:
# Run the spaCy pipeline over the cleaned corpus produced by the previous cell.
doc = nlp(cleaned_text)

# Step 1: Part-of-Speech Tagging
# Print the surface form, coarse POS and fine-grained tag of each token.
for token in doc:
    print(f"Text: {token.text}, POS: {token.pos_}, Tag: {token.tag_}")

# Step 2: Lemmatization
# Collect the lemma of every token that is not a stop word.
lemmas = []
for token in doc:
    if token.is_stop:
        continue
    lemmas.append(token.lemma_)
print("Lemmas:", lemmas)

# Step 3: Chunking (Noun Phrase Extraction)
# Keep only the text of each noun chunk reported by spaCy.
noun_chunks = [span.text for span in doc.noun_chunks]
print("Noun Chunks:", noun_chunks)

## Word embeddings