In [None]:
import pandas as pd

# Load the comments CSV into a DataFrame (the path is relative to the
# current working directory of the kernel).
df = pd.read_csv('data/chatgpt-reddit-comments.csv')

In [None]:
import re
from bs4 import BeautifulSoup

class TextCleaner:
	"""
	Rule-based cleaner for raw text.

	Each ``remove_*`` / ``convert_*`` / ``normalize_*`` method performs one
	independent step and can be called on its own; ``clean_text`` chains all
	of them. Every method returns an empty string when given a non-string
	value (e.g. ``None`` or ``NaN``), so the methods are safe to map over a
	column that contains missing values.
	"""

	# Patterns are compiled once at class-definition time and shared by all
	# instances instead of being re-parsed on every call.
	# URLs: anything starting with "http", plus bare "www." links. The word
	# boundary keeps words such as "awww..." from being treated as a link.
	_URL_RE = re.compile(r"http\S+|\bwww\.\S+")
	_MENTION_HASHTAG_RE = re.compile(r"@\w+|#\w+")
	# "\w" also matches "_", so the underscore has to be listed explicitly,
	# otherwise "snake_case" or "_italic_" would keep their underscores.
	_PUNCTUATION_RE = re.compile(r"[^\w\s]|_")
	_DIGITS_RE = re.compile(r"\d+")
	_WHITESPACE_RE = re.compile(r"\s+")

	def __init__(self):
		"""The cleaner keeps no per-instance state."""
		pass

	def remove_html(self, text):
		"""
		Strip HTML markup and return only the text content.

		Args:
			text (str): Text that may contain HTML tags.

		Returns:
			str: Text as extracted by BeautifulSoup's ``html.parser``;
			"" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return BeautifulSoup(text, "html.parser").get_text()

	def convert_to_lowercase(self, text):
		"""
		Convert text to lowercase.

		Returns:
			str: Lowercased text; "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return text.lower()

	def remove_urls(self, text):
		"""
		Remove URLs from text.

		A URL is any run of non-whitespace characters that starts with
		``http`` or with a stand-alone ``www.``. The match is case-sensitive.

		Returns:
			str: Text with URLs deleted (surrounding whitespace is kept);
			"" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._URL_RE.sub("", text)

	def remove_mentions_hashtags(self, text):
		"""
		Remove mentions (@username) and hashtags (#hashtag) from text.

		Returns:
			str: Text without ``@word`` / ``#word`` tokens; "" if ``text``
			is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._MENTION_HASHTAG_RE.sub("", text)

	def remove_punctuation(self, text):
		"""
		Remove punctuation, including underscores, from text.

		Every character that is neither a word character nor whitespace is
		deleted, and so is ``_``. Characters are deleted, not replaced by a
		space, so "don't" becomes "dont".

		Returns:
			str: Text without punctuation; "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._PUNCTUATION_RE.sub("", text)

	def remove_digits(self, text):
		"""
		Remove digits from text.

		Returns:
			str: Text with every run of digits deleted; "" if ``text`` is
			not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._DIGITS_RE.sub("", text)

	def normalize_whitespace(self, text):
		"""
		Collapse each run of whitespace into one space and trim both ends.

		Returns:
			str: Whitespace-normalized text; "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._WHITESPACE_RE.sub(" ", text).strip()

	def clean_text(self, text):
		"""
		Apply all cleaning steps to the text.

		Args:
			text (str): The text to clean

		Returns:
			str: The cleaned text; "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""

		# Order matters: URLs, mentions and hashtags are recognised by
		# characters such as ":", "/", ".", "@" and "#", so they must be
		# removed before punctuation is stripped. Whitespace is normalized
		# last to close the gaps left by the earlier deletions.
		text = self.remove_html(text)
		text = self.convert_to_lowercase(text)
		text = self.remove_urls(text)
		text = self.remove_mentions_hashtags(text)
		text = self.remove_punctuation(text)
		text = self.remove_digits(text)
		text = self.normalize_whitespace(text)

		return text

# Shared module-level instance, kept so that older call sites keep working
cleaner = TextCleaner()

def clean_text(text):
	"""
	Backward-compatible functional entry point.

	Delegates to the shared module-level ``cleaner`` instance.

	Args:
		text: Value to clean; forwarded unchanged to ``cleaner.clean_text``.

	Returns:
		Whatever ``cleaner.clean_text`` returns for ``text``.
	"""
	cleaned = cleaner.clean_text(text)
	return cleaned


In [None]:
import nltk
from nltk.tokenize import word_tokenize

# One-time setup: fetch the Punkt tokenizer data used by word_tokenize.
# Older NLTK releases read the 'punkt' package, while NLTK 3.9 and later
# read 'punkt_tab'; without it word_tokenize raises LookupError there.
# Both are requested so the notebook runs on either. These lines can be
# commented out once the data is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

class TextTokenizer:
	"""
	Thin wrapper around NLTK's ``word_tokenize`` for word-level tokenization.
	"""

	def __init__(self):
		"""No configuration is needed; the tokenizer is stateless."""

	def tokenize(self, text):
		"""
		Split a string into word tokens.

		Args:
			text (str): The cleaned input text

		Returns:
			List[str]: Tokens produced by ``word_tokenize``; an empty list
			when ``text`` is not a string.
		"""
		return word_tokenize(text) if isinstance(text, str) else []

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords corpus (only needs to be run once)
nltk.download('stopwords')

class StopwordRemover:
	"""
	Filters stopwords out of a list of tokens using NLTK's stopword corpus.
	"""

	def __init__(self, language="english"):
		"""
		Load the stopword list for one language.

		Args:
			language (str): Name of the NLTK stopword list to load
				(default is 'english')
		"""
		vocabulary = stopwords.words(language)
		# Stored as a set so membership checks do not scan a list.
		self.stop_words = set(vocabulary)

	def remove_stopwords(self, tokens):
		"""
		Drop every token whose lowercase form is a stopword.

		Args:
			tokens (List[str]): List of word tokens

		Returns:
			List[str]: The remaining tokens in their original order and
			casing; an empty list when ``tokens`` is not a list.
		"""
		if not isinstance(tokens, list):
			return []

		kept = []
		for token in tokens:
			# Compare in lowercase, but keep the token exactly as given.
			if token.lower() in self.stop_words:
				continue
			kept.append(token)
		return kept
