# Setup & imports


In [4]:
# Activation du rechargement automatique (utile en dev local)
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from pathlib import Path
import os 
import sys

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Project layout: the project root is taken as the parent of the current working
# directory, so this cell expects to be run from a direct sub-folder of the root.
base_dir = Path().resolve().parent
inputs_dir = base_dir / "data"
notebook_dir = base_dir / "notebooks"
src_dir = base_dir / "src"

# Create each project folder if missing and make it importable.
# The loop variable is not called `dir`: at notebook top level that name would
# shadow the builtin `dir()` for the rest of the kernel session.
for project_dir in (base_dir, inputs_dir, notebook_dir, src_dir):
    project_dir.mkdir(parents=True, exist_ok=True)
    project_path = str(project_dir.resolve())
    # Guard against duplicates so re-running the cell does not keep growing sys.path.
    if project_path not in sys.path:
        sys.path.insert(0, project_path)
    

In [6]:


# Modules personnalisés (chemins selon ton projet)
from preprocessing.text_cleaner import clean_pipeline
from preprocessing.labeling import (
    generate_toxic_labels,
    analyze_label_distribution,
    preview_labeled_samples,
    add_toxic_source,
    create_balanced_subset
)
from preprocessing.feature_engineering import enrich_text_features



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\beedi.goua_square-ma\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\beedi.goua_square-ma\AppData\Roaming\Python\Python311\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\beedi.goua_square-ma\AppData\Roaming\Python\Python311\site-packages\ipykernel\ker

# Chargement des données issues de la phase 1 (EDA)

In [7]:
# Locate the phase-1 (EDA) export relative to the project root, which is taken
# as the parent of the current working directory.
base_dir = Path().resolve().parent
input_path = base_dir.joinpath("data", "processed", "merged_reviews_eda.csv")

# Read the CSV and report how many rows were loaded.
df = pd.read_csv(input_path)
print(f"Données chargées : {len(df)} lignes")

# Last expression: display the first two rows as the cell output.
df.head(2)


Données chargées : 15000 lignes


Unnamed: 0,sentiment,title,text,source,id_review,n_sentences,n_words,n_chars,avg_word_len,n_unique_words,lexical_density,potential_toxic,flag_badwords
0,1,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,amazon,R_000000,3,75,394,5.253333,57,0.76,True,True
1,1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,amazon,R_000001,5,91,470,5.164835,69,0.758242,False,False


#### Remarque : 
Ces données sont le résultat de l’analyse exploratoire de la phase 1. Elles contiennent des colonnes comme n_words, potential_toxic, flag_badwords, etc.

## Nettoyage textuel

In [8]:
# Run the text-cleaning pipeline on the `text` column, passing word-count bounds
# of 5 and 500, then report how many rows are left.
cleaning_options = {"text_col": "text", "min_words": 5, "max_words": 500}
df_clean = clean_pipeline(df, **cleaning_options)
print(f"Données nettoyées : {len(df_clean)} lignes restantes après filtrage")


Données nettoyées : 14957 lignes restantes après filtrage


#### Commentaires :

Texte nettoyé disponible dans text_clean

Filtrage automatique des textes trop courts ou trop longs

Ponctuation, HTML, emojis, emails, chiffres → supprimés

Lemmatisation avec SpaCy

Stopwords supprimés avec NLTK

# Génération de l’étiquette label_toxic

In [9]:
# Derive the toxicity label from the two phase-1 flag columns, then attach the
# explanatory column describing where the toxicity flag came from.
df_labeled = add_toxic_source(
    generate_toxic_labels(
        df_clean,
        toxic_col="potential_toxic",
        badword_col="flag_badwords",
    )
)



INFO:preprocessing.labeling:Étiquetage terminé : 3648 textes toxiques sur 14957


In [10]:
# Report the label distribution of the labeled data, then preview a few
# labeled texts (both helpers are called on the same labeled frame).
analyze_label_distribution(df_labeled)
preview_labeled_samples(df_labeled)



Distribution des étiquettes :
 - Classe 0 : 11309 avis (75.61%)
 - Classe 1 : 3648 avis (24.39%)

Vérifie s’il y a un déséquilibre significatif avant la modélisation (phase 2.4).

Exemples de textes étiquetés (5 par classe) :

--- Classe 0 ---
→ Great movie and so glad to see this released to (Blu-Ray). Keep anything of John Wayne's great movies released on (blu-ray) coming as soon as they Ava...
→ This book has it's place--and that place is in an American Literature course at a university. It was entirely too technical to be enjoyable. I was ass...
→ Orwell's tale of a society where not only behaviour, but also thought is controlled, seems to grow ever more relevant as the years pass and new techno...
→ This was the first novel I've read by Shari MacDonald. It was great. She ranks right up there with Lori Wick and Lori Copeland. This is a must read....
→ but I had to make the time to finish the book to make sure that I wasn't missing anything. At times I felt there was something else

#### Commentaires :

label_toxic = 1 si le texte est potentiellement toxique ou contient des injures

Distribution des classes affichée

Des exemples de chaque classe sont visualisés

toxic_source donne la source du flag (injure, toxicité, ou les deux)

# Enrichissement linguistique

In [11]:
# Add the linguistic features, computed from the cleaned text column.
df_enriched = enrich_text_features(df_labeled, text_col="text_clean")

# Feature columns to spot-check in the preview below.
preview_columns = [
    "flesch_score",
    "fk_grade",
    "sentiment_polarity",
    "sentiment_subjectivity",
    "capital_ratio",
    "nb_exclamations",
    "nb_questions",
    "has_url",
    "has_email",
    "has_phone",
    "has_repeated_chars",
    "has_emoji",
]

# Last expression: display the first three rows of the selected features.
df_enriched[preview_columns].head(3)



Unnamed: 0,flesch_score,fk_grade,sentiment_polarity,sentiment_subjectivity,capital_ratio,nb_exclamations,nb_questions,has_url,has_email,has_phone,has_repeated_chars,has_emoji
0,37.22,18.005,-0.075,0.51,0.0,0,0,False,False,False,False,False
1,18.8775,22.550833,0.15,0.522449,0.007194,0,0,False,False,False,False,False
2,-4.997105,32.836842,0.196719,0.572899,0.0,0,0,False,False,False,False,False


# Rééquilibrage des classes

In [12]:
# Build a class-balanced subset of the enriched data, keyed on the `label_toxic` column.
# NOTE(review): if create_balanced_subset samples rows randomly, confirm it uses a
# fixed seed so a fresh "Restart & Run All" reproduces the same subset — this is
# not visible from the notebook.
df_balanced = create_balanced_subset(df_enriched, label_col="label_toxic")


INFO:preprocessing.labeling:Jeu équilibré créé : 7296 lignes (classe 0: 3648, classe 1: 3648)


In [None]:
# Destination folder for the processed datasets (created if it does not exist).
output_dir = base_dir / "data" / "processed"
output_dir.mkdir(parents=True, exist_ok=True)

# Write each dataset to its own CSV file, without the index column.
for file_name, frame in (
    ("clean_reviews.csv", df_clean),
    ("enriched_reviews.csv", df_enriched),
    ("balanced_reviews.csv", df_balanced),
):
    frame.to_csv(output_dir / file_name, index=False)

print("Jeux sauvegardés : clean / enriched / balanced")
