In [26]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dropout, Dense 
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from keras_preprocessing import sequence
from keras_preprocessing.text import Tokenizer

from transformers import BertTokenizer, TFBertModel

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import os

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter

import nltk
import re
# nltk.download('punkt')

In [27]:
csv_file = "train_cleaned.csv"
data_path = os.path.join('data')
df = pd.read_csv(os.path.join(data_path, csv_file), delimiter=',').dropna()

# kata baku bahasa indonesia
kata_baku = "kamus-kata-dasar.csv"
df_baku = pd.read_csv(kata_baku, header=None).dropna()
kata_baku_set = set(df_baku[0])
kata_tidak_baku_list = []

# normalisasi kata "slang"
kamus_normalisasi = pd.read_csv("data/slang.csv")
kata_normalisasi_dict = {}
for index, row in kamus_normalisasi.iterrows():
    if row[0] not in kata_normalisasi_dict:
        kata_normalisasi_dict[row[0]] = row[1] 

def preprocess_text(text):
    # text = text.str.replace('3', 'e')
    text = re.sub(r'3', 'e', text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r'\d+', '', text)
    filtered_words = [word for word in text.split()]
    return ' '.join(filtered_words)

def preprocess_dataframe(df, column_name):
    df[column_name] = df[column_name].apply(lambda x: preprocess_text(x))
    return df

def normalisasi_kata(document):
    normalized_words = [kata_normalisasi_dict.get(term, term) for term in document]
    return ' '.join(normalized_words)
    # return [kata_normalisasi_dict[term] if term in kata_normalisasi_dict else term for term in document]

def cek_kata_baku(document):
    kata_tidak_baku = [term for term in document if term not in kata_baku_set]
    return kata_tidak_baku

def word_tokenize_wrapper(text):
    return word_tokenize(text)

#nltk.download('words')
from nltk.corpus import words
def is_english_word(word):
    return word in words.words()

import translators as ts
def translate_text(q_text):
    if is_english_word(q_text):
        hasil = ts.translate_text(q_text, from_language='en', to_language='id').lower()
        print(f"Translating {q_text} to {hasil}")
        return hasil
    else:
        return q_text.lower()

train_cleaned = preprocess_dataframe(df, 'text').dropna()
train_cleaned['text'] = train_cleaned['text'].apply(word_tokenize_wrapper)
train_cleaned['text'] = train_cleaned['text'].apply(normalisasi_kata)
train_cleaned['text'] = train_cleaned['text'].apply(word_tokenize_wrapper)

# Check for non-baku words
kata_tidak_baku = train_cleaned['text'].apply(cek_kata_baku)
for kata in kata_tidak_baku:
    kata_tidak_baku_list.extend(kata)

train_cleaned['text'] = train_cleaned['text'].apply(lambda x: ' '.join(x))
train_cleaned.head()

Unnamed: 0,text,label
0,kunjung prabowo resmi serah proyek bantu bersi...,Sumber Daya Alam
1,anies tepuk tangan riah rektor wajib kuliah ko...,Politik
2,benar dukung goblok dukung ridwan kamil skema ...,Demografi
3,anies sikap kritis kerja prabowo anggap tidak ...,Politik
4,anies baswedan harap polri pegang sumpah milu,Politik


In [None]:
csv_file = "train_cleaned_2.csv"
train_cleaned.to_csv(os.path.join(data_path, csv_file), index=False)
print("Write CSV Done.")

df_kata_tidak_baku = pd.DataFrame(list(set(kata_tidak_baku_list)), columns=["Kata Tidak Baku"])
df_kata_tidak_baku['Terjemahan'] = df_kata_tidak_baku.applymap(translate_text)
df_kata_tidak_baku.to_csv("kata_tidak_baku.csv", index=False)

In [None]:
df_kata_tidak_baku

Unnamed: 0,Kata Tidak Baku,Terjemahan
0,buzzer,bel
1,leading,terkemuka
2,polybag,polybag
3,purwakarta,purwakarta
4,resources,resources
...,...,...
3354,tenggelamkan,tenggelamkan
3355,dissing,dissing
3356,newsweek,newsweek
3357,sprotiv,sprotiv
