In [8]:
import pandas as pd

# Load the comments CSV into a DataFrame; the path is relative to the
# directory the notebook kernel is running in.
df = pd.read_csv('data/chatgpt-reddit-comments.csv')

In [9]:
import re
from bs4 import BeautifulSoup

class TextCleaner:
	"""
	Clean raw text through a fixed sequence of small, reusable steps.

	Every public method accepts any object: a non-string input (such as the
	NaN of a missing DataFrame cell) yields an empty string instead of
	raising, so each method can be mapped over a column directly.
	"""

	# Patterns are compiled once at class level instead of on every call.
	# URLs: require an explicit scheme ("http://", "https://") or a leading
	# "www." so that ordinary words starting with "http" (e.g. "httpd") are
	# not deleted. The \b keeps "www." from matching inside a longer word.
	_URL_RE = re.compile(r"https?://\S+|\bwww\.\S+", re.IGNORECASE)
	_MENTION_HASHTAG_RE = re.compile(r"@\w+|#\w+")
	# "_" belongs to \w, so it has to be listed explicitly to be stripped.
	_PUNCTUATION_RE = re.compile(r"[^\w\s]|_")
	_DIGITS_RE = re.compile(r"\d+")
	_WHITESPACE_RE = re.compile(r"\s+")

	def __init__(self):
		pass

	def remove_html(self, text):
		"""
		Remove HTML tags from text.

		Args:
			text (str): Text that may contain HTML markup

		Returns:
			str: The text content without tags, or "" if text is not a string
		"""
		if not isinstance(text, str):
			return ""
		return BeautifulSoup(text, "html.parser").get_text()

	def convert_to_lowercase(self, text):
		"""
		Convert text to lowercase.

		Returns:
			str: Lowercased text, or "" if text is not a string
		"""
		if not isinstance(text, str):
			return ""
		return text.lower()

	def remove_urls(self, text):
		"""
		Remove URLs from text.

		A URL is anything starting with "http://", "https://" or a
		word-initial "www.", up to the next whitespace character. Matching is
		case-insensitive.

		Returns:
			str: Text without URLs, or "" if text is not a string
		"""
		if not isinstance(text, str):
			return ""
		return self._URL_RE.sub("", text)

	def remove_mentions_hashtags(self, text):
		"""
		Remove mentions (@username) and hashtags (#hashtag) from text.

		Returns:
			str: Text without mentions/hashtags, or "" if text is not a string
		"""
		if not isinstance(text, str):
			return ""
		return self._MENTION_HASHTAG_RE.sub("", text)

	def remove_punctuation(self, text):
		"""
		Remove punctuation from text, underscores included.

		Everything that is neither a letter, a digit nor whitespace is
		deleted (not replaced by a space).

		Returns:
			str: Text without punctuation, or "" if text is not a string
		"""
		if not isinstance(text, str):
			return ""
		return self._PUNCTUATION_RE.sub("", text)

	def remove_digits(self, text):
		"""
		Remove digits from text.

		Returns:
			str: Text without digit characters, or "" if text is not a string
		"""
		if not isinstance(text, str):
			return ""
		return self._DIGITS_RE.sub("", text)

	def normalize_whitespace(self, text):
		"""
		Collapse every run of whitespace into one space and trim both ends.

		Returns:
			str: Normalized text, or "" if text is not a string
		"""
		if not isinstance(text, str):
			return ""
		return self._WHITESPACE_RE.sub(" ", text).strip()

	def clean_text(self, text):
		"""
		Apply all cleaning steps to the text.

		Args:
			text (str): The text to clean

		Returns:
			str: The cleaned text, or "" if text is not a string
		"""
		if not isinstance(text, str):
			return ""

		# Order matters: HTML is stripped first so tag attributes are not
		# mistaken for content, and URLs/mentions/hashtags are removed before
		# punctuation because their patterns rely on ":", "/", "@" and "#".
		text = self.remove_html(text)
		text = self.convert_to_lowercase(text)
		text = self.remove_urls(text)
		text = self.remove_mentions_hashtags(text)
		text = self.remove_punctuation(text)
		text = self.remove_digits(text)
		text = self.normalize_whitespace(text)

		return text

# Shared module-level instance that backs the function-style API below
cleaner = TextCleaner()

def clean_text(text):
	"""
	Function-style entry point kept for backward compatibility.

	Delegates to the clean_text method of the module-level cleaner
	instance and returns its result unchanged.
	"""
	cleaned = cleaner.clean_text(text)
	return cleaned


In [10]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenizer models used by word_tokenize. Older NLTK releases load the
# 'punkt' package, while NLTK 3.9 and later load 'punkt_tab' instead, so
# both are fetched. Comment out once done for the first time.
nltk.download('punkt')
nltk.download('punkt_tab')

class TextTokenizer:
	"""
	Word-level tokenizer: turns a cleaned string into a list of tokens.
	"""

	def __init__(self):
		"""Nothing to configure; the tokenizer keeps no state."""

	def tokenize(self, text):
		"""
		Split text into word tokens with NLTK's word_tokenize.

		Args:
			text (str): The cleaned input text

		Returns:
			List[str]: The tokens, or an empty list when text is not a string
		"""
		return word_tokenize(text) if isinstance(text, str) else []

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omist\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
import nltk
from nltk.corpus import stopwords

# Only needs to be run once in your code
nltk.download('stopwords')

class StopwordRemover:
	"""
	Filter stopwords out of a list of tokens.
	"""

	def __init__(self, language="english"):
		"""
		Load the NLTK stopword list for one language into a set.

		Args:
			language (str): Language of the stopwords (default is 'english')
		"""
		vocabulary = stopwords.words(language)
		self.stop_words = set(vocabulary)

	def remove_stopwords(self, tokens):
		"""
		Drop every token whose lowercase form is a stopword.

		Args:
			tokens (List[str]): List of word tokens

		Returns:
			List[str]: The remaining tokens in their original order and
			casing; an empty list when tokens is not a list
		"""
		kept = []
		if isinstance(tokens, list):
			for token in tokens:
				if token.lower() in self.stop_words:
					continue
				kept.append(token)
		return kept


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Resources for lemmatization and POS tagging. NLTK 3.9 and later load the
# tagger from 'averaged_perceptron_tagger_eng'; older releases use
# 'averaged_perceptron_tagger', so both are fetched.
# Comment out once done for the first time.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

class TextLemmatizer:
	"""
	POS-aware lemmatizer built on NLTK's WordNetLemmatizer.
	"""

	def __init__(self):
		self.lemmatizer = WordNetLemmatizer()

	def get_wordnet_pos(self, treebank_tag):
		"""
		Map a Treebank POS tag to the matching WordNet POS constant.

		Tags starting with 'J', 'V' and 'R' map to adjective, verb and
		adverb; every other tag (noun tags included) maps to noun.
		"""
		if treebank_tag.startswith('J'):
			return wordnet.ADJ
		if treebank_tag.startswith('V'):
			return wordnet.VERB
		if treebank_tag.startswith('R'):
			return wordnet.ADV
		# Noun tags and anything unrecognized share the noun fallback
		return wordnet.NOUN

	def lemmatize(self, tokens):
		"""
		Lemmatize tokens, using each token's POS tag to pick the lemma.

		Args:
			tokens (List[str]): List of word tokens

		Returns:
			List[str]: Lemmatized tokens; an empty list when tokens is not
			a list
		"""
		if not isinstance(tokens, list):
			return []

		lemmas = []
		for word, tag in nltk.pos_tag(tokens):
			wn_pos = self.get_wordnet_pos(tag)
			lemmas.append(self.lemmatizer.lemmatize(word, wn_pos))
		return lemmas


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\omist\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\omist\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\omist\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
