In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

path = "./archive/medium_articles.csv"

df = pd.read_csv(path, delimiter=",", quotechar='"')

# Télécharger les ressources nécessaires
nltk.download('punkt')  # Pour la tokenisation (word_tokenize)
nltk.download('stopwords')  # Pour les stopwords
nltk.download('wordnet')  # Pour la lemmatisation (WordNetLemmatizer)
nltk.download('averaged_perceptron_tagger')  # Optionnel, pour la lemmatisation avancée

nltk.data.path.append('C:/Users/scien/AppData/Roaming/nltk_data')

# Initialisation des outils de NLP
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):  # Vérifie que la valeur est bien une chaîne de caractères
        # 1️⃣ Suppression de la ponctuation et des balises HTML
        text = re.sub(r'<.*?>', '', text)  # Enlever les balises HTML
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Garde uniquement les lettres et espaces

        # 2️⃣ Mise en minuscules
        text = text.lower()

        # 3️⃣ Tokenization
        tokens = word_tokenize(text)

        # 4️⃣ Suppression des stopwords
        tokens = [word for word in tokens if word not in stop_words]

        # 5️⃣ Stemming
        #tokens = [stemmer.stem(word) for word in tokens]

        # 6️⃣ Lemmatisation
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

        return ' '.join(tokens)  # Reconstruction du texte
    return text  # Retourne tel quel si ce n'est pas une string

# Appliquer le prétraitement sur la colonne "text"
df['clean_text'] = df['text'].apply(preprocess_text)

# Afficher un échantillon des textes nettoyés
print(df[['clean_text']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bapti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bapti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bapti\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bapti\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


                                          clean_text
0  photo josh riemer unsplash merry christmas hap...
1  brain coronavirus guide curious troubling impa...
2  mind nose smell training change brain six week...
3  passionate synergy science technology provide ...
4  youve heard havent phineas gage railroad worke...


In [3]:
# Sauvegarde du nouveau dataset avec la colonne 'clean_text'
df.to_csv("./archive/lemmetised_clean_data.csv", index=False)
