In [5]:
import pandas as pd 
import re 
import emoji 
import string 
import nltk

from bs4 import BeautifulSoup 
from nltk.corpus import stopwords, wordnet 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag 

# Download required NLTK resources 
nltk.download('stopwords') 
nltk.download('wordnet')                    # For lemmatization 
nltk.download('omw-1.4')                     # WordNet lexical database 
nltk.download('averaged_perceptron_tagger')  # For POS tagging 
nltk.download('punkt')                       # For tokenization 

# Initialize tools 
stop_words = set(stopwords.words('english')) 
lemmatizer = WordNetLemmatizer() 

# Dictionary of slang words and their replacements 
slang_dict = { 
    "tbh": "to be honest", 
    "omg": "oh my god", 
    "lol": "laugh out loud", 
    "idk": "I don't know", 
    "brb": "be right back", 
    "btw": "by the way", 
    "imo": "in my opinion", 
    "smh": "shaking my head", 
    "fyi": "for your information", 
    "np": "no problem", 
    "ikr": "I know right", 
    "asap": "as soon as possible", 
    "bff": "best friend forever", 
    "gg": "good game", 
    "hmu": "hit me up", 
    "rofl": "rolling on the floor laughing" 
} 

# Contractions dictionary 
contractions_dict = { 
    "wasn't": "was not", 
    "isn't": "is not", 
    "aren't": "are not", 
    "weren't": "were not", 
    "doesn't": "does not", 
    "don't": "do not", 
    "didn't": "did not", 
    "can't": "cannot", 
    "couldn't": "could not", 
    "shouldn't": "should not", 
    "wouldn't": "would not", 
    "won't": "will not", 
    "haven't": "have not", 
    "hasn't": "has not", 
    "hadn't": "had not", 
    "i'm": "i am", 
    "you're": "you are", 
    "he's": "he is", 
    "she's": "she is", 
    "it's": "it is", 
    "we're": "we are", 
    "they're": "they are", 
    "i've": "i have", 
    "you've": "you have", 
    "we've": "we have", 
    "they've": "they have", 
    "i'd": "i would", 
    "you'd": "you would", 
    "he'd": "he would", 
    "she'd": "she would", 
    "we'd": "we would", 
    "they'd": "they would", 
    "i'll": "i will", 
    "you'll": "you will", 
    "he'll": "he will", 
    "she'll": "she will", 
    "we'll": "we will", 
    "they'll": "they will", 
    "let's": "let us", 
    "that's": "that is", 
    "who's": "who is", 
    "what's": "what is", 
    "where's": "where is", 
    "when's": "when is", 
    "why's": "why is" 
} 

# Remove URLs 
def remove_urls(text): 
    return re.sub(r'http\S+|www\S+', '', text) 

# Remove HTML tags 
def remove_html(text): 
    return BeautifulSoup(text, "html.parser").get_text() 

# Remove emojis 
def remove_emojis(text): 
    return emoji.replace_emoji(text, replace='') 

# Replace slang words 
def replace_slang(text): 
    for slang, replacement in slang_dict.items(): 
        text = re.sub(r'\b' + re.escape(slang) + r'\b', replacement, text, flags=re.IGNORECASE) 
    return text 

# Expand contractions 
def replace_contractions(text): 
    for contraction, expansion in contractions_dict.items(): 
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', expansion, text, flags=re.IGNORECASE) 
    return text 

# Remove punctuation 
def remove_punctuation(text): 
    return text.translate(str.maketrans('', '', string.punctuation)) 

# Remove numbers 
def remove_numbers(text): 
    return re.sub(r'\d+', '', text) 

# Remove stopwords 
def remove_stopwords(text): 
    words = text.split() 
    filtered_words = [word for word in words if word.lower() not in stop_words] 
    return " ".join(filtered_words) 

# POS tagging for lemmatization 
def get_wordnet_pos(nltk_tag): 
    if nltk_tag.startswith('J'): 
        return wordnet.ADJ 
    elif nltk_tag.startswith('V'): 
        return wordnet.VERB 
    elif nltk_tag.startswith('N'): 
        return wordnet.NOUN 
    elif nltk_tag.startswith('R'): 
        return wordnet.ADV 
    else: 
        return wordnet.NOUN 

# Lemmatization 
def lemmatize_text(text): 
    words = word_tokenize(text) 
    pos_tags = pos_tag(words) 
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags] 
    return " ".join(lemmatized_words) 

# Preprocessing pipeline 
def preprocess_text(text): 
    text = text.lower() 
    text = remove_urls(text) 
    text = remove_html(text) 
    text = remove_emojis(text) 
    text = replace_slang(text) 
    text = replace_contractions(text) 
    text = remove_punctuation(text) 
    text = remove_numbers(text) 
    text = remove_stopwords(text) 
    text = lemmatize_text(text) 
    return text 

# Load dataset 
df = pd.read_csv("UNITENReview.csv") 
df = df[["Review"]]  

df["processed"] = df["Review"].apply(preprocess_text) 
df.to_csv("Processed_UNITENReview.csv", index=False) 
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Emily\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Emily\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Emily\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Emily\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Emily\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  return BeautifulSoup(text, "html.parser").get_text()


                                              Review  \
0  Im happy with uniten actually, even the people...   
1  I’m having a pretty good time here, happy to m...   
2        a very neutral place in terms of everything   
3  I would say Uniten it's  a good university  bu...   
4   UNITEN is well-regarded, particularly for its...   

                                           processed  
0             im happy uniten actually even people w  
1         i ’ m pretty good time happy meet w people  
2                      neutral place term everything  
3  would say uniten good university issue need im...  
4  uniten wellregarded particularly strong engine...  
