In [1]:
import pandas as pd
import re
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer
from ast import literal_eval

In [2]:
# Compiled once at definition time rather than on every call.
# Only code points that are emoji (or emoji plumbing) are listed for the Basic
# Multilingual Plane, so ordinary non-Latin text (CJK, Hangul, ...) is left intact.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary planes: emoticons, pictographs, transport, flags, ...
    "\u2500-\u2BEF"  # box drawing .. misc symbols and arrows (covers misc symbols and dingbats)
    "\u24C2"  # circled latin capital letter M
    "\u231a"  # watch
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward
    "\u200d"  # zero width joiner (glues emoji sequences together)
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "\u303d"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and emoji-related code points removed.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted. The
    pattern covers the whole supplementary-plane range plus a fixed set of
    Basic Multilingual Plane symbols; all other characters, including
    non-Latin scripts such as CJK or Hangul, are kept unchanged.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (``''`` when ``text`` is empty).
    """
    return _EMOJI_PATTERN.sub('', text)

def remove_non_ascii(text):
    """Return ``text`` with every character outside the ASCII range (0x00-0x7F) deleted."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

def remove_symbols(text):
    """Return ``text`` keeping only ASCII letters, ASCII digits and whitespace.

    Any other character (punctuation, underscores, symbols, ...) is deleted
    without inserting a replacement.
    """
    unwanted = re.compile(r'[^a-zA-Z0-9\s]')
    return unwanted.sub('', text)

# Load the scored place reviews.
# NOTE(review): the 'unicode_escape' codec decodes bytes as Latin-1 and also
# interprets backslash escape sequences found in the data -- confirm this matches
# how the file was written (UTF-8 text would be mis-decoded by it).
data = pd.read_csv('Place_Detail_Scored.csv', encoding='unicode_escape')

# Normalise 'Review Text' to str. Missing reviews become '' first: a bare
# astype(str) would turn NaN into the literal text 'nan', which would then be
# cleaned, keyword-extracted and saved as if it were a real review.
data['Review Text'] = data['Review Text'].fillna('').astype(str)

# Strip emoji, then any remaining non-ASCII characters, then every character
# that is not an ASCII letter, digit or whitespace.
data['Review Text'] = (
    data['Review Text']
    .apply(remove_emoji)
    .apply(remove_non_ascii)
    .apply(remove_symbols)
)

# Initialise the KeyBERT model with its default settings.
kw_model = KeyBERT()


def _extract_keywords(review_text):
    """Return the keyphrases KeyBERT extracts from ``review_text``, joined by ', '.

    Up to 10 one- or two-word phrases are requested, with English stop words
    excluded and max-sum selection over 20 candidates. The result is '' when
    KeyBERT returns no keywords.
    """
    keywords = kw_model.extract_keywords(review_text, keyphrase_ngram_range=(1, 2), stop_words='english',
                                         use_maxsum=True, nr_candidates=20, top_n=10)
    # extract_keywords yields (phrase, score) pairs; only the phrase is kept.
    return ', '.join(kw for kw, _ in keywords)


# Build the whole 'Keywords' column in one assignment instead of writing it
# cell by cell inside an iterrows() loop.
data['Keywords'] = data['Review Text'].apply(_extract_keywords)

# Save the cleaned reviews together with their keywords to a new CSV file.
data.to_csv('Dataset_cleanfix.csv', index=False)
