In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import sys
import os

# Make the parent directory ("..") and the current directory (".") importable.
# Each absolute path is appended only when it is not already on sys.path, so
# re-running this cell does not accumulate duplicate entries.
for _extra_path in (os.path.abspath(".."), os.path.abspath(".")):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

def remove_emails(text: str) -> str:
    """Replace every email-like token in ``text`` with a single space.

    A match is a run of non-whitespace characters that contains an ``@``
    with at least one non-whitespace character on each side of it.

    Args:
        text: Raw input text.

    Returns:
        The text with each match replaced by one space, or ``""`` when
        ``text`` is not a string.
    """
    if isinstance(text, str):
        return re.sub(r"\S+@\S+", " ", text)
    return ""

def remove_urls(text: str) -> str:
    """Replace URLs in ``text`` with a single space.

    A URL is a token that begins, on a word boundary, with ``http://``,
    ``https://`` or ``www.`` (in any letter case) and extends to the next
    whitespace character. Requiring the word boundary and the ``://`` or
    ``.`` separator keeps ordinary words that merely contain "www" or start
    with "http" (for example "httpd") untouched.

    Args:
        text: Raw input text.

    Returns:
        The text with each URL replaced by one space, or ``""`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    return re.sub(r"\b(?:https?://|www\.)\S+", " ", text, flags=re.IGNORECASE)

def remove_numbers(text: str) -> str:
    """Replace each run of consecutive digits in ``text`` with one space.

    This covers any digit sequence (phone numbers, dates, plain figures),
    including digits that are glued to letters.

    Args:
        text: Raw input text.

    Returns:
        The text with every digit run replaced by a single space, or ``""``
        when ``text`` is not a string.
    """
    return re.sub(r"\d+", " ", text) if isinstance(text, str) else ""

def basic_preprocess(text: str) -> str:
    """Run the light cleaning pipeline on ``text``.

    Steps, in order:
    1. remove emails (``remove_emails``)
    2. remove URLs (``remove_urls``)
    3. remove digit runs (``remove_numbers``)
    4. collapse every whitespace run to one space and trim both ends

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text
    for strip_step in (remove_emails, remove_urls, remove_numbers):
        cleaned = strip_step(cleaned)

    # The removals leave gaps behind; squeeze them and trim the edges.
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()

# Debug output: print the interpreter's module search path.
print(sys.path)

ModuleNotFoundError: No module named 'nltk'

In [6]:
# Quick manual check of basic_preprocess on a string that contains an email,
# a URL and a number.
sample = "Email : test@email.com et mon site : https://google.com  24 ans"
print(basic_preprocess(sample))

Email : et mon site : ans


In [7]:
# Output directory for processed data (relative path, resolved against the
# current working directory).
processed_dir = "../data/processed"

# Create the directory if it does not exist; no error if it is already there.
os.makedirs(processed_dir, exist_ok=True)


In [9]:
# Required NLTK downloads (only needed once).
# NOTE(review): this cell relies on `nltk`, `stopwords` and `WordNetLemmatizer`
# having been imported successfully by the import cell.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

# Union of NLTK's English and French stopword lists; read by lemmatize_text.
stop_words = set(stopwords.words("english")) | set(stopwords.words("french"))
# Module-level WordNet lemmatizer instance; read by lemmatize_text.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text: str) -> str:
    """Lowercase, strip punctuation, filter tokens and lemmatize ``text``.

    Processing order:
    1. lowercase the text
    2. replace every character that is neither a word character nor
       whitespace with a space
    3. split on whitespace
    4. drop tokens found in the module-level ``stop_words`` set and tokens
       of two characters or fewer
    5. pass each remaining token through the module-level ``lemmatizer``

    Args:
        text: Input text.

    Returns:
        The surviving lemmas joined by single spaces, or ``""`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_punct = re.sub(r"[^\w\s]", " ", lowered)

    lemmas = []
    for token in without_punct.split():
        # Skip stopwords and very short tokens before lemmatizing.
        if token in stop_words or len(token) <= 2:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)

def full_preprocess(text: str) -> str:
    """Run the complete cleaning pipeline on ``text``.

    Stages, in order: ``remove_emails``, ``remove_urls``, ``remove_numbers``
    and ``lemmatize_text`` (lemmatization plus stopword removal). Whitespace
    runs are then collapsed to one space and both ends are trimmed.

    Args:
        text: Raw input text.

    Returns:
        The fully processed text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    stages = (remove_emails, remove_urls, remove_numbers, lemmatize_text)
    result = text
    for stage in stages:
        result = stage(result)

    # Final whitespace normalization.
    return re.sub(r"\s+", " ", result).strip()

NameError: name 'nltk' is not defined

In [None]:
# Write the full cleaned dataset to CSV (without the index column).
# NOTE(review): `df` is not defined in any cell visible here — confirm it is
# created before this cell runs.
df.to_csv(os.path.join(processed_dir, "cvs_clean.csv"), index=False)
print("✅ cvs_clean.csv sauvegardé dans data/processed/")

# Split into train / test (80/20).
from sklearn.model_selection import train_test_split

# random_state is fixed so the split is the same on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Write both splits next to the full cleaned file.
train_df.to_csv(os.path.join(processed_dir, "cvs_train.csv"), index=False)
test_df.to_csv(os.path.join(processed_dir, "cvs_test.csv"), index=False)

print("✅ cvs_train.csv et cvs_test.csv sauvegardés dans data/processed/")
