In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import json
import py_vncorenlp
import re

### Recipe 1: Clean

In [2]:
def clean(text):
    """Strip punctuation and normalise whitespace in ``text``.

    Steps, in order:
      1. delete every character that is neither a word character (``\\w``:
         letters, digits, underscore) nor whitespace;
      2. collapse each run of whitespace into a single space;
      3. trim leading and trailing whitespace.

    Punctuation is removed *before* whitespace is collapsed so that a
    punctuation mark standing between two spaces (``"a , b"``) does not leave
    a doubled space behind.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Underscores survive this step because they are part of ``\w``.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse only after punctuation is gone, otherwise new gaps can appear.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [3]:
clean("  Hello,   my name is 123  ")

'Hello my name is 123'

### Recipe 2: Lowercasing

In [4]:
def to_lower(text):
    """Return a lowercase copy of ``text``.

    Args:
        text: the string to convert.

    Returns:
        ``text`` with ``str.lower()`` applied.

    Raises:
        ValueError: if ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        return text.lower()
    raise ValueError("Input must be a string.")

In [5]:
to_lower("Hello WoLrD")

'hello wolrd'

### Recipe 3: Punctuation Removal

In [6]:
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Only the characters in ``string.punctuation`` are removed; any other
    character is left untouched.

    Args:
        text: the string to process.

    Returns:
        ``text`` without those characters.

    Raises:
        ValueError: if ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        # A translation table mapping a character to None deletes it.
        deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
        return text.translate(deletion_table)
    raise ValueError("Input must be a string.")

In [7]:
remove_punctuation("Hello, World!")

'Hello World'

### Recipe 4: Stop Words Removal

In [8]:
# Load the Vietnamese stopword list (one entry per line) into a set.
# Each entry is stripped and blank lines are skipped: tokens produced by
# str.split() never carry surrounding whitespace, so an entry with a stray
# trailing space in the file would otherwise never match anything.
with open('../data/vietnamese.txt', 'r', encoding='utf-8') as file:
    vietnamese_stopwords = {
        entry.strip() for entry in file.read().splitlines() if entry.strip()
    }

# Stopword filter that tolerates NaN / non-string values.
def remove_stopwords(text):
    """Drop every whitespace-separated token found in ``vietnamese_stopwords``.

    Args:
        text: the value to filter. Anything that is not a ``str`` (for
            example a NaN float) is treated as missing.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    if isinstance(text, str):
        # NOTE(review): matching is per whitespace token and exact, so any
        # stopword entry that itself contains a space can never match here.
        # Confirm the stopword list holds single-token entries only.
        kept_tokens = (
            token for token in text.split() if token not in vietnamese_stopwords
        )
        return ' '.join(kept_tokens)

    # Non-string input (NaN, None, numbers) becomes an empty string.
    return ''

In [9]:
remove_stopwords("tôi là sinh viên")

'sinh viên'

### Recipe 5: Text Normalization (Chuẩn hóa văn bản)

In [10]:
# Relative path of the JSON file holding the abbreviation table.
file_path = '../data/vi-abbreviations.json'

# Read the file as UTF-8 and parse it once. replace_abbreviations looks
# entries up with .get(), so the top-level JSON value is presumably an
# object mapping abbreviation -> expansion.
with open(file_path, mode='r', encoding='utf-8') as file:
    abbreviations = json.loads(file.read())
    
def replace_abbreviations(text):
    """Expand known abbreviations in ``text``.

    The text is split on whitespace. Each token that is a key of the
    module-level ``abbreviations`` mapping is replaced by its value; every
    other token is kept unchanged. Tokens are then joined with single spaces.
    Lookup is an exact match on the whole token, so it is case-sensitive and
    a token with attached punctuation (``"ko."``) is not expanded.

    Args:
        text: the string to process.

    Returns:
        The text with recognised abbreviations expanded.

    Raises:
        ValueError: if ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Input must be a string.")

    expanded_tokens = []
    for token in text.split():
        # Fall back to the token itself when no expansion is known.
        expanded_tokens.append(abbreviations.get(token, token))
    return " ".join(expanded_tokens)

In [11]:
replace_abbreviations("tgdd có đt khá ổn nên ib ko")

'thế giới di động có điện thoại khá ổn nên inbox không'