# Importing Required Libraries

In [1]:
import pandas as pd
import json
import re
from symspellpy.symspellpy import SymSpell, Verbosity
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from num2words import num2words
from word2number import w2n

# Data Loading

In [20]:
data = pd.read_csv(r'D:\Đại học\DS T5\Model\test_restaurants.csv')
data.head()

Unnamed: 0,restaurant_id,restaurant_name,Name,Rating,Comment
0,1,Anar Indian Restaurant,Gep Yalong,5,Everything was amazing. Food and variety with ...
1,1,Anar Indian Restaurant,Mario Garcia-Gillespie,5,Fantastic local restaurant with a great dinner...
2,1,Anar Indian Restaurant,Sukanya Arunkumar,5,We had come on a visit to LA. We had ordered t...
3,1,Anar Indian Restaurant,Katherine,5,"This is my favorite Indian spot in LA, and the..."
4,1,Anar Indian Restaurant,Priyanka Bengaluru,4,Service was amazing. Tried their one person di...


In [21]:
import nltk
from nltk.tokenize import sent_tokenize
data['id'] = data.index + 1

data['sentence'] = data['Comment'].apply(nltk.sent_tokenize)
data = data.explode('sentence')[['id', 'restaurant_id', 'restaurant_name', 'sentence',]].reset_index(drop=True)

data['id_sentence'] = data.index + 1
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637 entries, 0 to 636
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               637 non-null    int64 
 1   restaurant_id    637 non-null    int64 
 2   restaurant_name  637 non-null    object
 3   sentence         637 non-null    object
 4   id_sentence      637 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 25.0+ KB


In [22]:
def clean_text(txt):
    if not isinstance(txt,str):
        txt = str(txt)
    txt = txt.lower()
    txt = re.sub(r'\\[nt]*',' ',txt) #removes \n and \t tags
    txt = re.sub(r'[^A-Za-z0-9\s]',' ',txt) #removes non textual data
    if isinstance(txt, str):
        return re.sub(r'\s+', ' ', txt).strip()
    return txt

In [23]:
data['sentence'] = data['sentence'].apply(clean_text)

In [244]:
data = data[['id', 'sentence', 'price', 'quality', 'environment', 'service', 'other']]
data.head()

Unnamed: 0,id,sentence,price,quality,environment,service,other
0,1,i ordered egg soft noodles from here they did...,,Negative,,Negative,
1,2,ice cream was good but it s totally got melt...,,Positive,,Negative,
2,2,he mentioned it as delivered as soon as he pic...,,,,Negative,
3,2,he took nearly 45 minutes to come,,,,Negative,
4,3,food was not good its from yesterday s food wi...,,Negative,,Negative,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   restaurant_id    205 non-null    int64 
 1   restaurant_name  205 non-null    object
 2   Name             205 non-null    object
 3   Rating           205 non-null    int64 
 4   Comment          205 non-null    object
dtypes: int64(2), object(3)
memory usage: 8.1+ KB


# Preprocessing

### Remove empty line

In [3]:
def clean_whitespace(text):
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text).strip()
    return text

In [247]:
data['sentence'] = data['sentence'].apply(clean_whitespace)
data = data[~((data['sentence'].isna()) | (data['sentence'] == ''))].reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5288 entries, 0 to 5287
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           5288 non-null   int64 
 1   sentence     5288 non-null   object
 2   price        207 non-null    object
 3   quality      2244 non-null   object
 4   environment  590 non-null    object
 5   service      1186 non-null   object
 6   other        535 non-null    object
dtypes: int64(1), object(6)
memory usage: 289.3+ KB


### Text Normalization

In [6]:
# Dictionary for slang words
with open("slang.json", "r", encoding="utf-8") as f:
    slang_dict = json.load(f)

In [7]:
# Dictionary for contraction words
with open("contractions_dict.json", "r", encoding="utf-8") as f:
    contraction_dict = json.load(f)

In [8]:
def text_normalization(text):
    words = text.split()
    normalized_words = [slang_dict.get(word.lower(), word) for word in words]
    normalized_text = ' '.join(normalized_words)
    pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in contraction_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contraction_dict[x.group()], normalized_text)

### Typo correction

In [9]:
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = "frequency_dictionary_en_82_765.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

True

In [10]:
def numbers_to_words(text):
    return re.sub(r'\d+', lambda m: num2words(int(m.group())), text)

In [11]:
def typo_correction(text):
    text = str(text)
    
    text = numbers_to_words(text)  # chuyển số thành chữ
    text = text_normalization(text)

    if not re.search(r'[a-zA-Z]', text):
        return text

    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    corrected = suggestions[0].term if suggestions else text
    print(corrected)

    return corrected

In [25]:
data['sentence'] = data['sentence'].apply(typo_correction)

everything was amazing
food and variety with custom ability ambiance with music they service and even they comfort room was excellent
we got they beef bryan and lamb marsala with plain naan
serving was generous enough for three
no gamey or domesticated taste
was also easy to chew eat
really delicious and filling rich taste
also had mango lass
it was really heavy and delicious
service was good and fast
comfort room was clean and complete with supplies no bidet
i would recommend this place to everyone
fantastic local restaurant with a great dinner for two deal that will definitely leave food for lunch they next day
food was cook to perfection and they curry and spices were spot on
we had come on a visit to la
we had ordered takeouts of veg bryan and samosa on they first day of our visit
they were delicious
they told us that they even deliver
so next time we ordered butter naan and deal taka over they phone and they delivered within thirty forty minutes to they hotel we stayed in
it was s

In [28]:
data.head(15)

Unnamed: 0,id,restaurant_id,restaurant_name,sentence,id_sentence
0,1,1,Anar Indian Restaurant,everything was amazing,1
1,1,1,Anar Indian Restaurant,food and variety with custom ability ambiance ...,2
2,1,1,Anar Indian Restaurant,we got they beef bryan and lamb marsala with p...,3
3,1,1,Anar Indian Restaurant,serving was generous enough for three,4
4,1,1,Anar Indian Restaurant,no gamey or domesticated taste,5
5,1,1,Anar Indian Restaurant,was also easy to chew eat,6
6,1,1,Anar Indian Restaurant,really delicious and filling rich taste,7
7,1,1,Anar Indian Restaurant,also had mango lass,8
8,1,1,Anar Indian Restaurant,it was really heavy and delicious,9
9,1,1,Anar Indian Restaurant,service was good and fast,10


In [26]:
data.to_csv('merged_restaurant.csv', index=False)