In [None]:
# load data from csv file
import pandas as pd

# Read the comments CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='data/chatgpt-reddit-comments.csv')

# Sanity check on the load: show the last five rows.
# Swap tail() for head() to preview the first rows instead.
print(df.tail(n=5))

In [5]:
import re
import nltk
from bs4 import BeautifulSoup

class TextCleaner:
	"""
	Clean and preprocess raw text through a fixed sequence of steps.

	Every step method can be called on its own; ``clean_text`` chains all of
	them. Each method returns an empty string when its input is not a ``str``
	(for example ``None``) instead of raising.
	"""

	# Patterns are compiled once at class-definition time and shared by all instances.
	# A URL must carry an explicit scheme (http:// or https://) or start with "www.";
	# a bare r"http\S+" would also delete ordinary words such as "https" or "httpd".
	_URL_RE = re.compile(r"https?://\S+|\bwww\.\S+", re.IGNORECASE)
	_MENTION_HASHTAG_RE = re.compile(r"@\w+|#\w+")
	# "\w" matches the underscore, so "_" has to be listed explicitly to be removed.
	_PUNCTUATION_RE = re.compile(r"[^\w\s]|_")
	_DIGITS_RE = re.compile(r"\d+")
	_WHITESPACE_RE = re.compile(r"\s+")

	def remove_html(self, text):
		"""
		Strip HTML markup and keep only the text content.

		Args:
			text (str): Text that may contain HTML tags.

		Returns:
			str: The text extracted by BeautifulSoup's "html.parser", or ""
			if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return BeautifulSoup(text, "html.parser").get_text()

	def convert_to_lowercase(self, text):
		"""
		Convert text to lowercase.

		Args:
			text (str): The text to convert.

		Returns:
			str: ``text.lower()``, or "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return text.lower()

	def remove_urls(self, text):
		"""
		Remove URLs from text.

		A URL is any run of non-whitespace characters that starts with
		"http://", "https://" (case-insensitive) or "www.". Words that merely
		begin with "http" are left untouched.

		Args:
			text (str): The text to process.

		Returns:
			str: The text with URLs deleted (surrounding whitespace is kept),
			or "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._URL_RE.sub("", text)

	def remove_mentions_hashtags(self, text):
		"""
		Remove mentions (@username) and hashtags (#hashtag) from text.

		Args:
			text (str): The text to process.

		Returns:
			str: The text without "@word" and "#word" tokens, or "" if
			``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._MENTION_HASHTAG_RE.sub("", text)

	def remove_punctuation(self, text):
		"""
		Remove punctuation from text.

		Every character that is neither a word character nor whitespace is
		deleted, and so is the underscore.

		Args:
			text (str): The text to process.

		Returns:
			str: The text without punctuation, or "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._PUNCTUATION_RE.sub("", text)

	def remove_digits(self, text):
		"""
		Remove digits from text.

		Args:
			text (str): The text to process.

		Returns:
			str: The text with every run of digits deleted, or "" if ``text``
			is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._DIGITS_RE.sub("", text)

	def normalize_whitespace(self, text):
		"""
		Collapse whitespace runs into single spaces and trim both ends.

		Args:
			text (str): The text to process.

		Returns:
			str: The normalized text, or "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""
		return self._WHITESPACE_RE.sub(" ", text).strip()

	def clean_text(self, text):
		"""
		Apply all cleaning steps to the text.

		Args:
			text (str): The text to clean

		Returns:
			str: The cleaned text, or "" if ``text`` is not a string.
		"""
		if not isinstance(text, str):
			return ""

		# Order matters: URLs, mentions and hashtags are recognised by their
		# punctuation ("://", "@", "#"), so they must be removed before the
		# punctuation step; whitespace is normalized last to close the gaps
		# left behind by the earlier deletions.
		pipeline = (
			self.remove_html,
			self.convert_to_lowercase,
			self.remove_urls,
			self.remove_mentions_hashtags,
			self.remove_punctuation,
			self.remove_digits,
			self.normalize_whitespace,
		)
		for step in pipeline:
			text = step(text)

		return text

# Shared module-level instance, kept so code that uses `cleaner` directly keeps working
cleaner = TextCleaner()

def clean_text(text):
	"""
	Backward-compatible functional entry point.

	Delegates to ``clean_text`` on the module-level ``cleaner`` instance and
	returns its result unchanged.
	"""
	cleaned = cleaner.clean_text(text)
	return cleaned


In [6]:
# Manual demonstration of the TextCleaner class on a small hand-written sample
cleaner = TextCleaner()

# Sample text mixing HTML, a mention, a URL, hashtags, digits, punctuation and irregular spacing
texte_test = """
<p>Hello @user123! Check out this link: https://example.com/test #AI #ChatGPT</p>
This has NUMBERS 123 and punctuation!!! 
   Multiple    spaces   everywhere.
"""

# Divider printed between the sections of the report
_separator = "\n" + "=" * 50 + "\n"

print("Texte original:")
print(repr(texte_test))
print(_separator)

# Run every cleaning step on its own; repr() makes leftover whitespace visible
_sans_html = cleaner.remove_html(texte_test)
_step_outputs = [
	("1. Suppression HTML:", _sans_html),
	("2. Conversion minuscules:", cleaner.convert_to_lowercase(_sans_html)),
	("3. Suppression URLs:", cleaner.remove_urls("Visitez https://example.com pour plus d'infos")),
	("4. Suppression mentions/hashtags:", cleaner.remove_mentions_hashtags("Salut @user #test")),
	("5. Suppression ponctuation:", cleaner.remove_punctuation("Hello, world!!!")),
	("6. Suppression chiffres:", cleaner.remove_digits("J'ai 25 ans en 2023")),
	("7. Normalisation espaces:", cleaner.normalize_whitespace("   espaces    multiples   ")),
]
for _label, _output in _step_outputs:
	print(_label, repr(_output))

print(_separator)

# Run the full pipeline on the sample text
print("Nettoyage complet:")
resultat_complet = cleaner.clean_text(texte_test)
print(repr(resultat_complet))

print(_separator)


Texte original:
'\n<p>Hello @user123! Check out this link: https://example.com/test #AI #ChatGPT</p>\nThis has NUMBERS 123 and punctuation!!! \n   Multiple    spaces   everywhere.\n'


1. Suppression HTML: '\nHello @user123! Check out this link: https://example.com/test #AI #ChatGPT\nThis has NUMBERS 123 and punctuation!!! \n   Multiple    spaces   everywhere.\n'
2. Conversion minuscules: '\nhello @user123! check out this link: https://example.com/test #ai #chatgpt\nthis has numbers 123 and punctuation!!! \n   multiple    spaces   everywhere.\n'
3. Suppression URLs: "Visitez  pour plus d'infos"
4. Suppression mentions/hashtags: 'Salut  '
5. Suppression ponctuation: 'Hello world'
6. Suppression chiffres: "J'ai  ans en "
7. Normalisation espaces: 'espaces multiples'


Nettoyage complet:
'hello check out this link this has numbers and punctuation multiple spaces everywhere'


