In [50]:
import pandas as pd

# Relative directory used by later cells as the NLTK download target and data path.
# NOTE: the variable is spelled `ntlk_dir` (not `nltk_dir`); later cells reference
# this exact spelling, so it must be renamed everywhere at once if it is ever fixed.
ntlk_dir = 'nltk_data'
# Load the comments CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv('data/chatgpt-reddit-comments.csv')

In [51]:
import re
from bs4 import BeautifulSoup

class TextCleaner:
	"""
	Clean and normalize raw text through a fixed sequence of small steps.

	Each step is exposed as its own method so it can be used in isolation;
	`clean_text` chains all of them. Every method returns an empty string
	when given a non-string value (e.g. None or NaN from a DataFrame column).
	"""

	# Patterns are compiled once at class creation instead of on every call.
	# URLs: explicit http(s) scheme, or a scheme-less "www." host. The word
	# boundary keeps words that merely contain "www." (e.g. "awww.") intact.
	_URL_RE = re.compile(r"https?://\S+|\bwww\.\S+", re.IGNORECASE)
	_MENTION_HASHTAG_RE = re.compile(r"@\w+|#\w+")
	# "_" counts as a word character for \w, so it has to be listed explicitly.
	_PUNCTUATION_RE = re.compile(r"[^\w\s]|_")
	_DIGITS_RE = re.compile(r"\d+")
	_WHITESPACE_RE = re.compile(r"\s+")

	def __init__(self):
		"""The cleaner is stateless; nothing to configure."""

	def remove_html(self, text):
		"""
		Strip HTML markup and return only the text content.

		Args:
			text (str): Text that may contain HTML tags

		Returns:
			str: Text as extracted by BeautifulSoup's html.parser, or "" for non-strings
		"""
		if not isinstance(text, str):
			return ""
		return BeautifulSoup(text, "html.parser").get_text()

	def convert_to_lowercase(self, text):
		"""Return the text lowercased, or "" for non-strings."""
		if not isinstance(text, str):
			return ""
		return text.lower()

	def remove_urls(self, text):
		"""
		Remove URLs from text.

		Matches "http://..." / "https://..." (case-insensitive) as well as
		scheme-less "www...." addresses, each up to the next whitespace.
		Ordinary words that merely start with "http" (e.g. "httpd") are kept.

		Args:
			text (str): Input text

		Returns:
			str: Text with URLs deleted (surrounding whitespace is left as is),
			or "" for non-strings
		"""
		if not isinstance(text, str):
			return ""
		return self._URL_RE.sub("", text)

	def remove_mentions_hashtags(self, text):
		"""Remove mentions (@username) and hashtags (#hashtag); "" for non-strings."""
		if not isinstance(text, str):
			return ""
		return self._MENTION_HASHTAG_RE.sub("", text)

	def remove_punctuation(self, text):
		"""
		Remove punctuation from text.

		Deletes every character that is neither a word character nor
		whitespace, plus the underscore (which the regex class \\w would
		otherwise keep).

		Args:
			text (str): Input text

		Returns:
			str: Text without punctuation, or "" for non-strings
		"""
		if not isinstance(text, str):
			return ""
		return self._PUNCTUATION_RE.sub("", text)

	def remove_digits(self, text):
		"""Remove all digit runs from text; "" for non-strings."""
		if not isinstance(text, str):
			return ""
		return self._DIGITS_RE.sub("", text)

	def normalize_whitespace(self, text):
		"""Collapse whitespace runs to single spaces and trim both ends; "" for non-strings."""
		if not isinstance(text, str):
			return ""
		return self._WHITESPACE_RE.sub(" ", text).strip()

	def clean_text(self, text):
		"""
		Apply all cleaning steps to the text.

		Order: HTML removal, lowercasing, URL removal, mention/hashtag
		removal, punctuation removal, digit removal, whitespace normalization.
		URLs and mentions/hashtags are removed before punctuation because
		their patterns rely on characters ("://", ".", "@", "#") that the
		punctuation step deletes.

		Args:
			text (str): The text to clean

		Returns:
			str: The cleaned text, or "" for non-strings
		"""
		if not isinstance(text, str):
			return ""

		# Apply all cleaning steps in sequence
		text = self.remove_html(text)
		text = self.convert_to_lowercase(text)
		text = self.remove_urls(text)
		text = self.remove_mentions_hashtags(text)
		text = self.remove_punctuation(text)
		text = self.remove_digits(text)
		text = self.normalize_whitespace(text)

		return text

# Shared module-level TextCleaner used by the function-style wrapper below
cleaner = TextCleaner()

def clean_text(text):
	"""
	Function-style entry point kept for backward compatibility.

	Delegates to the shared module-level `cleaner` and returns whatever
	its `clean_text` method returns for the given value.
	"""
	cleaned = cleaner.clean_text(text)
	return cleaned


In [52]:
import nltk
from nltk.tokenize import word_tokenize
import os

# Register the local data directory (as an absolute path) in NLTK's resource search path
nltk.data.path.append(os.path.abspath(ntlk_dir))

# Fetch the 'punkt_tab' resource into the local data directory.
# This line can be commented out once the download has succeeded.
nltk.download('punkt_tab', download_dir=ntlk_dir)

class TextTokenizer:
	"""
	Word-level tokenizer for already-cleaned text, delegating to NLTK's word_tokenize.
	"""

	def __init__(self):
		"""The tokenizer keeps no state; nothing to configure."""

	def tokenize(self, text):
		"""
		Split a string into word tokens.

		Args:
			text (str): The cleaned input text

		Returns:
			List[str]: Tokens produced by word_tokenize, or an empty list
			when `text` is not a string
		"""
		return word_tokenize(text) if isinstance(text, str) else []

[nltk_data] Downloading package punkt_tab to nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [53]:
import nltk
from nltk.corpus import stopwords

# Fetch the 'stopwords' corpus into the local data directory.
# This line can be commented out once the download has succeeded.
nltk.download('stopwords', download_dir=ntlk_dir)

class StopwordRemover:
	"""
	Filters stopwords out of a token list using NLTK's stopword corpus.
	"""

	def __init__(self, language="english"):
		"""
		Load the stopword list for one language.

		Args:
			language (str): Language name passed to stopwords.words (default is 'english')
		"""
		language_words = stopwords.words(language)
		# Stored as a set so membership checks do not scan the whole list
		self.stop_words = set(language_words)

	def remove_stopwords(self, tokens):
		"""
		Drop every token whose lowercased form is a stopword.

		Args:
			tokens (List[str]): List of word tokens

		Returns:
			List[str]: The remaining tokens in their original order and casing,
			or an empty list when `tokens` is not a list
		"""
		if isinstance(tokens, list):
			kept = []
			for token in tokens:
				if token.lower() in self.stop_words:
					continue
				kept.append(token)
			return kept
		return []


[nltk_data] Downloading package stopwords to nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the resources used by the lemmatizer cell into the local data directory.
# These lines can be commented out once the downloads have succeeded.
nltk.download('wordnet', download_dir=ntlk_dir)
nltk.download('omw-1.4', download_dir=ntlk_dir)
nltk.download('averaged_perceptron_tagger_eng', download_dir=ntlk_dir)

class TextLemmatizer:
	"""
	POS-aware lemmatizer built on NLTK's WordNetLemmatizer and nltk.pos_tag.
	"""

	def __init__(self):
		self.lemmatizer = WordNetLemmatizer()

	def get_wordnet_pos(self, treebank_tag):
		"""
		Map a Treebank POS tag to the matching WordNet POS constant.

		Tags starting with J, V, N or R map to adjective, verb, noun and
		adverb respectively; any other tag falls back to noun.
		"""
		prefix_table = (
			('J', wordnet.ADJ),
			('V', wordnet.VERB),
			('N', wordnet.NOUN),
			('R', wordnet.ADV),
		)
		for prefix, wordnet_pos in prefix_table:
			if treebank_tag.startswith(prefix):
				return wordnet_pos
		return wordnet.NOUN  # fallback

	def lemmatize(self, tokens):
		"""
		Lemmatize a list of word tokens.

		The tokens are POS-tagged first, and each tag is translated with
		`get_wordnet_pos` so the lemmatizer receives a part of speech.

		Args:
			tokens (List[str]): List of word tokens

		Returns:
			List[str]: One lemma per input token, in order, or an empty list
			when `tokens` is not a list
		"""
		if not isinstance(tokens, list):
			return []

		lemmas = []
		for word, tag in nltk.pos_tag(tokens):
			lemmas.append(self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag)))
		return lemmas


[nltk_data] Downloading package wordnet to nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


In [55]:
# Usage example: run the full preprocessing pipeline on a few hand-written strings

# Create one instance of each pipeline stage
text_cleaner = TextCleaner()
tokenizer = TextTokenizer()
stopword_remover = StopwordRemover()
lemmatizer = TextLemmatizer()

# Sample texts exercising different cleaning problems
# (HTML tags, mentions, hashtags, URLs, emoji, repeated punctuation, digits, inflected words)
sample_texts = [
	"<p>Hello @user123! Check out this amazing #AI tool: https://example.com/awesome-tool 🚀</p>",
	"I'm loving the new ChatGPT updates!!! It's so much better than before... 😍",
	"Why are people still using OLD technologies in 2024??? Makes NO sense to me!!!",
	"<div>Visit our website www.example.com for more info about #MachineLearning and #DataScience</div>",
	"Running, jumped, better, good, children, mice, feet - testing different word forms"
]

print("=== DÉMONSTRATION DU PREPROCESSING ===\n")

# Print the intermediate result of every stage for each sample (numbered from 1)
for i, text in enumerate(sample_texts, 1):
	print(f"📝 Exemple {i}:")
	print(f"Original: {text}")
	
	# Step 1: cleaning
	cleaned = text_cleaner.clean_text(text)
	print(f"Nettoyé: '{cleaned}'")
	
	# Step 2: tokenization
	tokens = tokenizer.tokenize(cleaned)
	print(f"Tokens: {tokens}")
	
	# Step 3: stopword removal
	tokens_no_stop = stopword_remover.remove_stopwords(tokens)
	print(f"Sans mots vides: {tokens_no_stop}")
	
	# Step 4: lemmatization
	lemmatized = lemmatizer.lemmatize(tokens_no_stop)
	print(f"Lemmatisé: {lemmatized}")
	
	# Visual separator between samples
	print("-" * 80 + "\n")


=== DÉMONSTRATION DU PREPROCESSING ===

📝 Exemple 1:
Original: <p>Hello @user123! Check out this amazing #AI tool: https://example.com/awesome-tool 🚀</p>
Nettoyé: 'hello check out this amazing tool'
Tokens: ['hello', 'check', 'out', 'this', 'amazing', 'tool']
Sans mots vides: ['hello', 'check', 'amazing', 'tool']
Lemmatisé: ['hello', 'check', 'amaze', 'tool']
--------------------------------------------------------------------------------

📝 Exemple 2:
Original: I'm loving the new ChatGPT updates!!! It's so much better than before... 😍
Nettoyé: 'im loving the new chatgpt updates its so much better than before'
Tokens: ['im', 'loving', 'the', 'new', 'chatgpt', 'updates', 'its', 'so', 'much', 'better', 'than', 'before']
Sans mots vides: ['im', 'loving', 'new', 'chatgpt', 'updates', 'much', 'better']
Lemmatisé: ['im', 'love', 'new', 'chatgpt', 'update', 'much', 'good']
--------------------------------------------------------------------------------

📝 Exemple 3:
Original: Why are people s