In [1]:
import re
import nltk
import pandas as pd

In [2]:
df = pd.read_csv("spam.csv", encoding='latin1')
# df.head()

In [3]:
import re
import string
import nltk

nltk.download("punkt")
nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


[nltk_data] Downloading package punkt to /home/natchanon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/natchanon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def fix_encoding(text: str) -> str:
    """
    แก้ตัวอักษรเพี้ยนจาก encoding latin-1 / windows-1252
    ปรับเพิ่ม pattern เองได้ตามที่เจอใน dataset
    """
    text = str(text)

    replacements = {
        "åÕ": "'",
        "Ì_": "u",
        "å": "",
        "Ã¢â‚¬â„¢": "'",
        "Ã¢â‚¬": "'",
        "Ã¢": "'",
        "â€˜": "'",
        "â€™": "'",
        "â€œ": '"',
        "â€": '"',
        "Â£": "£",
        "Â¥": "¥",
        "Â€": "€",
        "Â©": "©",
        "Â®": "®",
        "Â±": "±",
        "Ã©": "é",
        "Ã¨": "è",
        "Ã¢": "â",
        "Ã´": "ô",
        "Ã§": "ç",
        "Ã£": "ã",
        "Ã¼": "ü",
        "Ã¶": "ö",
        "Ã±": "ñ",
    }

    for bad, good in replacements.items():
        text = text.replace(bad, good)

    return text


In [5]:
abbrev_dict = {
    # basic chat
    "n": "and",
    "u": "you",
    "ur": "your",
    "r": "are",
    "b": "be",
    "c": "see",
    "k": "okay",
    "ok": "okay",
    "okie": "okay",
    "oki": "okay",
    "tmr": "tomorrow",
    "tmrw": "tomorrow",
    "tdy": "today",
    "2day": "today",
    "2moro": "tomorrow",
    "b4": "before",
    "b4n": "bye for now",
    "gr8": "great",
    "l8": "late",
    "l8r": "later",
    "nite": "night",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "kinda": "kind of",
    "sorta": "sort of",

    # time abbreviations
    "hr": "hour",
    "hrs": "hours",
    "min": "minute",
    "mins": "minutes",
    "sec": "second",
    "secs": "seconds",
    
    # expressions
    "omg": "oh my god",
    "omfg": "oh my fucking god",
    "wtf": "what the fuck",
    "wth": "what the hell",
    "idk": "i do not know",
    "idc": "i do not care",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "fyi": "for your information",
    "tbh": "to be honest",
    "irl": "in real life",
    "ikr": "i know right",
    "nvm": "never mind",
    "smh": "shaking my head",
    "brb": "be right back",
    "bbl": "be back later",
    "bfn": "bye for now",
    "bff": "best friends forever",

    # laughter
    "lol": "laugh out loud",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "roflmao": "rolling on the floor laughing my ass off",
    "hehe": "laugh",
    "haha": "laugh",

    # thanks / please
    "thx": "thanks",
    "tx": "thanks",
    "tnx": "thanks",
    "ty": "thank you",
    "tysm": "thank you so much",
    "pls": "please",
    "plz": "please",
    "pwease": "please",

    # time / soon / frequently used
    "asap": "as soon as possible",
    "bf": "boyfriend",
    "gf": "girlfriend",
    "bday": "birthday",
    "hbd": "happy birthday",
    "msg": "message",
    "txt": "text",
    "dm": "direct message",
    "pm": "private message",

    # prepositions / short
    "abt": "about",
    "abt2": "about to",
    "btw": "by the way",
    "bc": "because",
    "bcoz": "because",
    "cuz": "because",
    "coz": "because",
    "cos": "because",
    "tho": "though",
    "thru": "through",
    "u2": "you too",
    "luv": "love",
    "xoxo": "hugs and kisses",

    # frequency
    "afaik": "as far as i know",
    "afk": "away from keyboard",
    "atm": "at the moment",
    "bday": "birthday",
    "cu": "see you",
    "cya": "see you",
    "gn": "good night",
    "gm": "good morning",
    "g2g": "got to go",
    "gtg": "got to go",
    "gota": "got to go",

    # work / study
    "est": "estimated",
    "eod": "end of day",
    "eom": "end of message",
    "np": "no problem",
    "tba": "to be announced",
    "tbc": "to be confirmed",
    "faq": "frequently asked questions",

    # feelings
    "hru": "how are you",
    "wyd": "what are you doing",
    "wya": "where are you",
    "ily": "i love you",
    "ilu": "i love you",
    "ily2": "i love you too",
    "missu": "miss you",
    "missya": "miss you",
    "xmas": "christmas",

    # misc
    "yo": "hey",
    "sup": "what is up",
    "wassup": "what is up",
    "wazzup": "what is up",
    "bro": "brother",
    "sis": "sister",
    "omw": "on my way",
    "rn": "right now",
    "jk": "just kidding",
    "jkjk": "just kidding",
    "gg": "good game",
    "ez": "easy",
}


In [6]:
# ขยายคำ
def expand_abbrev(text: str) -> str:
    """
    แทนคำย่อใน abbrev_dict ด้วยรูปแบบเต็ม
    เช่น omg -> oh my god, btw -> by the way
    """
    words = text.split()
    expanded_words = []

    for w in words:
        key = w.lower()
        if key in abbrev_dict:
            expanded_words.extend(abbrev_dict[key].split())
        else:
            expanded_words.append(w)

    return " ".join(expanded_words)


In [7]:
def simple_preprocess_full(text):
    # 0) ensure string
    text = str(text)

    # 1) fix encoding
    text = fix_encoding(text)

    # 2) normalize URLs
    text = re.sub(r"https?://\S+|www\.\S+", " <URL> ", text)

    # 3) expand abbreviations (omg->oh my god, u->you)
    text = expand_abbrev(text)

    # 4) handle contractions 's → remove (that's → that)
    #    ทำให้ stopwords ทำงานถูกต้อง
    # text = re.sub(r"\'s\b", "", text)

    # 5) remove digits
    text = re.sub(r"\d+", " ", text)

    # 6) remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # 7) lowercase
    text = text.lower()

    # 8) remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    # 9) tokenize
    tokens = word_tokenize(text)

    # 10) remove stopwords
    tokens = [w for w in tokens if w not in stop_words]

    # 11) stemming
    tokens = [stemmer.stem(w) for w in tokens]

    return tokens
    # ถ้าอยากให้คืนเป็น string แทน list:
    # return " ".join(tokens)


In [12]:
# Example
text = "A gram usually runs like &lt;#&gt; , a half eighth is smarter though and gets you almost a whole second gram for &lt;#&gt;"
# text = "URGENT! We are trying to contact you. Last weekends draw shows that you have won a å£900 prize GUARANTEED. Call 09061701939. Claim code S89. Valid 12hrs only"
# text = "Yup... Ok i go home look at the timings then i msg Ì_ again... Xuhui going to learn on 2nd may too but her lesson is at 8am"
processed_text = simple_preprocess_full(text)
print(f"Original Text: {text}")
print(f"Processed Text: {processed_text}")

Original Text: A gram usually runs like &lt;#&gt; , a half eighth is smarter though and gets you almost a whole second gram for &lt;#&gt;
Processed Text: ['gram', 'usual', 'run', 'like', 'ltgt', 'half', 'eighth', 'smarter', 'though', 'get', 'almost', 'whole', 'second', 'gram', 'ltgt']


In [13]:
df["clean_tokens"] = df["v2"].apply(simple_preprocess_full)
# df["clean_text"] = df["clean_tokens"].apply(lambda x: " ".join(x))
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,clean_tokens
0,ham,"Go until jurong point, crazy.. Available only ...",,,,"[go, jurong, point, crazi, avail, bugi, great,..."
1,ham,Ok lar... Joking wif u oni...,,,,"[okay, lar, joke, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,,,,"[dun, say, earli, hor, see, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,"[nah, dont, think, goe, usf, live, around, tho..."


## ขั้นตอนที่ทำ Text Preprocessing กับ dataset นี้ มีลำดับดังนี้
1) แปลงเป็น string เผื่อมีค่า NaN
2) fix encoding เช่น åÕ -> '
3) แทน URL ด้วย <'URL'>
4) ขยายคำย่อ เช่น  btw => by the way
5) จัดการ 's
6) ลบตัวเลข
7) ลบเครื่องหมาย punctuation
8) ภาษาอังกฤษ แปลงเป็นพิมพ์เล็ก
9) ตัด space ที่ซ้ำ
10) tokenization
11) remove stopwords
12) stemming