In [1]:
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

# Initialize tools
spell = Speller(lang='en')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the dataset
df = pd.read_csv("UNITENReview.csv")

# Display original data to identify issues
print("Original Data:")
print(df.head())
print("\nColumn names:", df.columns.tolist())
print("\nData types:", df.dtypes)
print("\nMissing values:", df.isnull().sum())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/9bd7767f-e2b1-4f3e-a47c-
[nltk_data]     80a0c1669bd6/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/9bd7767f-e2b1-4f3e-a47c-
[nltk_data]     80a0c1669bd6/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/9bd7767f-e2b1-4f3e-a47c-
[nltk_data]     80a0c1669bd6/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/9bd7767f-e2b1-4f3e-a47c-
[nltk_data]     80a0c1669bd6/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/9bd7767f-e2b1-4f3e-a47c-
[nltk_data]     80a0c1669bd6/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Original Data:
                     Timestamp  \
0  2025/02/10 7:40:54 pm GMT+8   
1  2025/02/10 7:41:00 pm GMT+8   
2  2025/02/10 7:41:19 pm GMT+8   
3  2025/02/10 7:46:40 pm GMT+8   
4  2025/02/10 7:46:43 pm GMT+8   

                                              Review  
0  Im happy with uniten actually, even the people...  
1  I’m having a pretty good time here, happy to m...  
2        a very neutral place in terms of everything  
3  I would say Uniten it's  a good university  bu...  
4   UNITEN is well-regarded, particularly for its...  

Column names: ['Timestamp', 'Review']

Data types: Timestamp    object
Review       object
dtype: object

Missing values: Timestamp    0
Review       0
dtype: int64


In [2]:
# Slang dictionary
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing"
}

# Contractions dictionary
contractions_dict = {
    "wasn't": "was not", "isn't": "is not", "aren't": "are not",
    "weren't": "were not", "doesn't": "does not", "don't": "do not",
    "didn't": "did not", "can't": "cannot", "couldn't": "could not",
    "shouldn't": "should not", "wouldn't": "would not", "won't": "will not",
    "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
    "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
    "it's": "it is", "we're": "we are", "they're": "they are",
    "i've": "i have", "you've": "you have", "we've": "we have",
    "they've": "they have", "i'd": "i would", "you'd": "you would",
    "he'd": "he would", "she'd": "she would", "we'd": "we would",
    "they'd": "they would", "i'll": "i will", "you'll": "you will",
    "he'll": "he will", "she'll": "she will", "we'll": "we will",
    "they'll": "they will", "let's": "let us", "that's": "that is",
    "who's": "who is", "what's": "what is", "where's": "where is",
    "when's": "when is", "why's": "why is"
}

# Build contractions pattern
escaped_contractions = [re.escape(c) for c in contractions_dict.keys()]
compiled_pattern = re.compile(r'\b(' + '|'.join(escaped_contractions) + r')\b', flags=re.IGNORECASE)

# Preprocessing functions
def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)

def remove_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

def replace_slang(text):
    escaped = [re.escape(w) for w in slang_dict.keys()]
    pattern = r'\b(' + '|'.join(escaped) + r')\b'
    return re.sub(pattern, lambda m: slang_dict[m.group(0).lower()], text, flags=re.IGNORECASE)

def replace_contractions(text):
    return compiled_pattern.sub(lambda m: contractions_dict[m.group(0).lower()], text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def correct_spelling(text):
    return spell(text)

def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word.lower() not in stop_words])

def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'): return wordnet.ADJ
    elif nltk_tag.startswith('V'): return wordnet.VERB
    elif nltk_tag.startswith('N'): return wordnet.NOUN
    elif nltk_tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

def lemmatize_text(text):
    if not isinstance(text, str): return ""
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    return " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags])

def tokenize_text(text):
    if not isinstance(text, str): return []
    return word_tokenize(text)

# Full preprocessing pipeline
def preprocess_text(text):
    if not isinstance(text, str): return ""
    text = text.lower()
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_emojis(text)
    text = replace_slang(text)
    text = replace_contractions(text)
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = correct_spelling(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    text = tokenize_text(text)
    return text

# Apply preprocessing - replace "Review" with your actual column name if different
df["processed"] = df["Review"].apply(preprocess_text)

# Save result
df.to_csv("Processed_UNITENReview.csv", index=False)

print("Done! Processed data saved to Processed_UNITENReview.csv")
print(df[["Review", "processed"]].head())

Done! Processed data saved to Processed_UNITENReview.csv
                                              Review  \
0  Im happy with uniten actually, even the people...   
1  I’m having a pretty good time here, happy to m...   
2        a very neutral place in terms of everything   
3  I would say Uniten it's  a good university  bu...   
4   UNITEN is well-regarded, particularly for its...   

                                           processed  
0      [im, happy, unite, actually, even, people, w]  
1  [i, ’, m, pretty, good, time, happy, meet, w, ...  
2                 [neutral, place, term, everything]  
3  [would, say, united, good, university, issue, ...  
4  [united, wellregarded, particularly, strong, e...  
