# Library imports

In [7]:
import pandas as pd 
import spacy
import re

# Data preprocessing

In [4]:
# Read the Reddit comments, then drop the identifier columns that are not needed
data = pd.read_csv('chatgpt-reddit-comments.csv')
data = data.drop(columns=['comment_id', "comment_parent_id"])

# Preview the first rows
print(data.head())

# Report the number of comments (one row per comment)
print(f'There is {data.shape[0]} comments')

  Unnamed: 0                                       comment_body  subreddit
0          0  I've been shocked for days now, I don't need c...  r/ChatGPT
1          1   \n\nI am so angry right now. I just wasted my...  r/ChatGPT
2          2  chatgpt karma whoring is here folks! just when...  r/ChatGPT
3          3                                 Worked on me, ngl.  r/ChatGPT
4          4  Certified 10/10, must-see moment. It really di...  r/ChatGPT
There is 52416 comments


In [9]:
# 1) Load the `en_core_web_lg` spaCy pipeline with the dependency parser and
#    the named-entity recognizer switched off
nlp = spacy.load(
    "en_core_web_lg",
    disable=["parser", "ner"],
)

# 2) Precompiled patterns for the noise removed from every comment

# URLs: "http" or "www." followed by everything up to the next whitespace
URL_RE = re.compile(r"http\S+|www\.\S+")

# Hashtags: "#" immediately followed by one or more word characters
HASHTAG_RE = re.compile(r"#\w+")

# Emojis: one or more consecutive characters from the code-point ranges below
# (str patterns are Unicode-aware by default, so no explicit flag is required)
EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F700-\U0001F77F"
    "\U0001F780-\U0001F7FF"
    "\U0001F800-\U0001F8FF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FA6F"
    "\U00002600-\U000026FF"
    "\U00002700-\U000027BF"
    "]+"
)

def preprocess_strip_emojis(text):
    """Clean a single comment and return its lemmas joined by spaces.

    Steps: remove URLs, hashtags and emojis; lower-case and trim; run the
    spaCy pipeline; keep only alphabetic tokens that are not stop words and
    replace each by its lemma.

    Parameters
    ----------
    text : object
        Raw comment body. Anything that is not a ``str`` (e.g. NaN) is
        treated as an empty comment.

    Returns
    -------
    str
        Space-separated lemmas; ``""`` when nothing survives the filtering.
    """
    if not isinstance(text, str):
        # An empty comment has no tokens, so skip the pipeline call entirely.
        return ""
    # Replace each removed span with a space rather than nothing: otherwise
    # the words on either side get fused ("love<emoji>this" -> "lovethis")
    # and are tokenized/lemmatized as one bogus word.
    text = URL_RE.sub(" ", text)
    text = HASHTAG_RE.sub(" ", text)
    text = EMOJI_RE.sub(" ", text)
    # lowercase & trim
    text = text.lower().strip()
    # tokenize & lemmatize; whitespace-only tokens fail `is_alpha` and are
    # dropped together with punctuation, numbers and stop words
    doc = nlp(text)
    tokens = [tok.lemma_ for tok in doc if tok.is_alpha and not tok.is_stop]
    return " ".join(tokens)

# Clean every comment (missing bodies become empty strings first) and store
# the result in a new `comment_clean` column
data['comment_clean'] = data['comment_body'].fillna("").map(preprocess_strip_emojis)

In [12]:
data.head()

Unnamed: 0.1,Unnamed: 0,comment_body,subreddit,comment_clean
0,0,"I've been shocked for days now, I don't need c...",r/ChatGPT,shock day need clickbait
1,1,\n\nI am so angry right now. I just wasted my...,r/ChatGPT,angry right waste time read post sub clickbait...
2,2,chatgpt karma whoring is here folks! just when...,r/ChatGPT,chatgpt karma whore folk think stream think bu...
3,3,"Worked on me, ngl.",r/ChatGPT,work ngl
4,4,"Certified 10/10, must-see moment. It really di...",r/ChatGPT,certify moment shock core
