In [2]:
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# =========================
# Download Required NLTK Data
# =========================
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# =========================
# Initialize Tools
# =========================
spell = Speller(lang='en')
stop_words = set(stopwords.words('english'))

# IMPORTANT:
# Keep important domain words
custom_keep_words = {
    "uniten", "ucc", "dss", "bm", "bv",
    "wifi", "parking", "lecturer",
    "management", "hostel", "library"
}

stop_words = stop_words - custom_keep_words

lemmatizer = WordNetLemmatizer()

# =========================
# Fix Encoding Issues (â€™ etc.)
# =========================
def fix_encoding(text):
    if isinstance(text, str):
        return text.encode('latin1', errors='ignore').decode('utf-8', errors='ignore')
    return text

# =========================
# Remove URLs
# =========================
def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)

# =========================
# Remove HTML
# =========================
def remove_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

# =========================
# Remove Emojis
# =========================
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

# =========================
# Remove Punctuation
# =========================
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# =========================
# Remove Numbers
# =========================
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# =========================
# Remove Stopwords
# =========================
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

# =========================
# POS Mapping
# =========================
def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# =========================
# Lemmatization
# =========================
def lemmatize_text(text):
    if not isinstance(text, str):
        return ""

    words = word_tokenize(text)
    pos_tags = pos_tag(words)

    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tags
    ]

    return " ".join(lemmatized_words)

# =========================
# Tokenization
# =========================
def tokenize_text(text):
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# =========================
# FULL PREPROCESSING PIPELINE
# =========================
def preprocess_text(text):

    if not isinstance(text, str):
        return []

    text = fix_encoding(text)          # Fix encoding issues
    text = text.lower()                # Lowercase
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_emojis(text)
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    tokens = tokenize_text(text)

    return tokens

# =========================
# LOAD DATASET
# =========================
df = pd.read_csv("UNITENReview.csv")

# Drop rows like "#NAME?"
df = df[df["Review"].str.contains("#NAME?", na=False) == False]

# Apply preprocessing
df["processed"] = df["Review"].apply(preprocess_text)

# Save processed dataset
df.to_csv("Processed_UNITEN_Reviews.csv", index=False)

# Display sample
print(df[["Review", "processed"]].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/23ae2440-d9b7-4bf1-b490-
[nltk_data]     439ad9e57e22/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/23ae2440-d9b7-4bf1-b490-
[nltk_data]     439ad9e57e22/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/23ae2440-d9b7-4bf1-b490-
[nltk_data]     439ad9e57e22/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/23ae2440-d9b7-4bf1-b490-
[nltk_data]     439ad9e57e22/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/23ae2440-d9b7-4bf1-b490-
[nltk_data]     439ad9e57e22/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                              Review  \
0  Im happy with uniten actually, even the people...   
1  I’m having a pretty good time here, happy to m...   
2        a very neutral place in terms of everything   
3  I would say Uniten it's  a good university  bu...   
4   UNITEN is well-regarded, particularly for its...   

                                           processed  
0     [im, happy, uniten, actually, even, people, w]  
1   [im, pretty, good, time, happy, meet, w, people]  
2                 [neutral, place, term, everything]  
3  [would, say, uniten, good, university, issue, ...  
4  [uniten, wellregarded, particularly, strong, e...  
