In [15]:
import re
import string
import pandas as pd
import numpy as np
import contractions
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Fetch the NLTK data packages this notebook relies on. Packages that are
# already installed are reported as up to date in the cell's log output.
nltk.download("punkt")      # tokenizer data for word_tokenize
nltk.download("punkt_tab")  # NOTE(review): presumably needed by newer NLTK tokenizers - confirm
nltk.download("stopwords")  # backs stopwords.words("english")
nltk.download("wordnet")    # backs WordNetLemmatizer
nltk.download("omw-1.4")    # NOTE(review): presumably extra WordNet data used by the lemmatizer - confirm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kengu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kengu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kengu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kengu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kengu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [17]:
def expand_contractions(text):
    """
    Replace English contractions with their expanded forms.

    For instance "can't" becomes "cannot" and "I'm" becomes "I am".

    Returns "" when ``text`` is not a string or contains only whitespace.
    If the contractions library raises, a warning is printed and the
    original text is handed back unchanged.
    """
    is_blank = not isinstance(text, str) or text.strip() == ""
    if is_blank:
        return ""

    try:
        expanded = contractions.fix(text)
    except Exception as e:
        # Best-effort step: keep the row instead of aborting the whole run.
        print(f"[WARN] Contraction expansion failed for text: {text[:50]}... | Error: {e}")
        return text
    return expanded


In [9]:
# Translation table for ASCII punctuation: apostrophes are deleted (so "don't"
# stays one token, "dont"), every other mark becomes a space so that the words
# on either side of it are not fused together.
_PUNCT_TABLE = {ord(ch): " " for ch in string.punctuation}
_PUNCT_TABLE[ord("'")] = None


def basic_cleaning(text):
    """
    Normalise raw text down to lowercase words separated by single spaces.

    Steps, in order:
    - Lowercase the text
    - Remove URLs ("http://...", "https://...", "www....")
    - Remove @mentions and #hashtags (the whole tag, not just the symbol)
    - Remove digits
    - Remove ASCII punctuation: apostrophes are dropped, other marks are
      replaced by a space ("anxiety/depression" -> "anxiety depression")
    - Collapse runs of whitespace and strip the ends

    Returns "" when ``text`` is not a string, matching the other helpers.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Require "://" or a leading word boundary + "www." so ordinary words such
    # as "awwww" are not mistaken for links.
    text = re.sub(r"https?://\S+|\bwww\.\S+", " ", text)  # remove links
    text = re.sub(r"@\w+|#\w+", " ", text)  # remove mentions and hashtags
    text = re.sub(r"\d+", " ", text)  # remove numbers
    text = text.translate(_PUNCT_TABLE)  # remove punctuation without fusing words
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [10]:
# Private-use code points used to bracket each emoji name produced by
# demojize(), so the names can be located again and post-processed without
# touching underscores that were already present in the text.
_EMOJI_OPEN, _EMOJI_CLOSE = "\ue000", "\ue001"
_EMOJI_NAME_RE = re.compile(_EMOJI_OPEN + "(.*?)" + _EMOJI_CLOSE)


def handle_emojis(text):
    """
    Converts emojis into words.
    Example: "😭" -> " loudly crying face " (space-padded on both sides)

    demojize() emits names joined by underscores ("loudly_crying_face");
    the underscores inside emoji names are turned into spaces here so the
    result is made of separate words. Underscores elsewhere in the text are
    left alone.

    Returns "" when ``text`` is not a string or is empty.
    """
    if not isinstance(text, str) or text == "":
        return ""
    marked = emoji.demojize(text, delimiters=(_EMOJI_OPEN, _EMOJI_CLOSE))
    return _EMOJI_NAME_RE.sub(
        lambda m: " " + m.group(1).replace("_", " ") + " ",
        marked,
    )


In [11]:
# Shared resources for the tokenization helpers: a WordNet lemmatizer and
# the English stopword list held as a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def tokenize_and_lemmatize(text):
    """
    Turn a string into a space-joined sequence of lemmatized tokens.

    The text is split with word_tokenize, tokens present in ``stop_words``
    are discarded, and each remaining token is run through ``lemmatizer``.

    Note: clean_text() calls tokenize_lemmatize(), not this function.
    """
    kept = (tok for tok in word_tokenize(text) if tok not in stop_words)
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)


In [21]:
def tokenize_lemmatize(text):
    """
    Tokenize, filter and lemmatize ``text``; return the tokens space-joined.

    Tokens are dropped when they are in ``stop_words`` or are two characters
    or shorter. Non-string input yields "". If anything in the processing
    raises, a warning is printed and the input text is returned as it came in.
    """
    if not isinstance(text, str):
        return ""

    try:
        return " ".join(
            lemmatizer.lemmatize(tok)
            for tok in nltk.word_tokenize(text)
            if tok not in stop_words and len(tok) > 2
        )
    except Exception as e:
        print(f"[WARN] Tokenization failed: {text[:50]}... | {e}")
        return text

In [22]:
# Rebinds the same two names an earlier cell already defines, so this cell
# has what the pipeline's tokenization step reads.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """
    Run one piece of raw text through the complete cleaning pipeline.

    Stages, applied in order:
    1. expand_contractions  - expand contractions
    2. handle_emojis        - turn emojis into words
    3. basic_cleaning       - lowercase, strip URLs / punctuation / digits
    4. tokenize_lemmatize   - tokenize, drop stopwords, lemmatize

    Returns "" for non-string or blank input, and also when any stage
    raises (a warning is printed in that case).
    """
    if not (isinstance(text, str) and text.strip() != ""):
        return ""

    try:
        for stage in (expand_contractions, handle_emojis, basic_cleaning, tokenize_lemmatize):
            text = stage(text)
        return text.strip()
    except Exception as e:
        print(f"[WARN] clean_text failed: {text[:50]}... | {e}")
        return ""

In [23]:
# Load the balanced dataset (point this path at your own copy if it differs)
df = pd.read_csv("../data/processed/balanced_dataset.csv")

# Peek at the first rows before any text cleaning is applied
print("Before cleaning:", df.head(3), sep="\n")


Before cleaning:
         id                                              title  \
0    ak3cm1                My mental health is ruining my life   
1  d9a53bb3  20 of 365 I'm surprised [Godzilla ](https://yo...   
2   1jae7ux                      Poverty has destroyed my life   

                                            selftext    label  
0  Every morning I wake up in a panic and I cant ...  anxiety  
1     wasn't 2020's final piece of revenge. (Eminem)   normal  
2  I live in a homeless shelter. We can't close t...  anxiety  


In [24]:
# Join title and body into one string per row (missing parts count as ""),
# then push every row through the cleaning pipeline.
df["clean_text"] = (
    df["title"].fillna("") + " " + df["selftext"].fillna("")
).apply(clean_text)

print("After cleaning:", df[["clean_text", "label"]].head(3), sep="\n")


[WARN] Contraction expansion failed for text: [L][M][16] tired and don't know what to do My best... | Error: string index out of range
After cleaning:
                                          clean_text    label
0  mental health ruining life every morning wake ...  anxiety
1      surprised godzilla final piece revenge eminem   normal
2  poverty destroyed life live homeless shelter c...  anxiety


In [25]:
# Hold out 20% of the rows for testing; the split is stratified on the label
# column and seeded so it can be reproduced.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

print("Train size:", train_df.shape)
print("Test size:", test_df.shape)


Train size: (35820, 5)
Test size: (8955, 5)


In [26]:
# Write each split to its own CSV file, leaving the index out
for split_df, csv_path in (
    (train_df, "../data/processed/train.csv"),
    (test_df, "../data/processed/test.csv"),
):
    split_df.to_csv(csv_path, index=False)

print("✅ Train and test CSVs saved in ../data/processed/")


✅ Train and test CSVs saved in ../data/processed/
