In [13]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dropout, Dense 
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from keras_preprocessing import sequence
from keras_preprocessing.text import Tokenizer

from transformers import BertTokenizer, TFBertModel

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import os

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter

import nltk
import re
# nltk.download('punkt')

#### Normalisasi Slang

In [22]:
# Read the pre-cleaned training set from the local data folder; rows that
# contain any missing value are discarded right after loading.
data_path = os.path.join('data')
csv_file = "train_cleaned.csv"
df = pd.read_csv(os.path.join(data_path, csv_file), delimiter=',')
df = df.dropna()

def preprocess_text(text):
    """Lightly normalise one raw text before tokenisation.

    Steps, in order:
      1. Leetspeak: a run of ``3`` that touches a letter is read as ``e``.
      2. Runs of three or more identical characters are cut down to two.
      3. Every remaining digit run is removed.
      4. Whitespace is collapsed to single spaces and trimmed.

    Note: a later cell defines another ``preprocess_text`` that takes
    ``(text, stop_words, stemmer)``; once that cell has run it shadows this
    function.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The normalised text as a single-space-separated ``str``.
    """
    # Only a '3' adjacent to a letter is treated as leetspeak. Replacing every
    # '3' would turn plain numbers such as "2023" into a stray "e" token once
    # the other digits are stripped below.
    text = re.sub(
        r'(?<=[A-Za-z])3+|3+(?=[A-Za-z])',
        lambda match: 'e' * len(match.group()),
        text,
    )
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r'\d+', '', text)
    # split() + join() collapses the whitespace left behind by the removals.
    return ' '.join(text.split())

def preprocess_dataframe(df, column_name):
    """Run ``preprocess_text`` over one column of ``df``.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    def _clean(value):
        return preprocess_text(value)

    df[column_name] = df[column_name].apply(_clean)
    return df

# Light clean-up of the 'text' column, followed by a pass that drops any row
# holding a missing value.
train_cleaned = preprocess_dataframe(df, 'text')
train_cleaned = train_cleaned.dropna()

def word_tokenize_wrapper(text):
    """Tokenise ``text`` with ``word_tokenize`` and return the tokens."""
    tokens = word_tokenize(text)
    return tokens


# Replace every text in the column with its list of tokens.
train_cleaned['text'] = train_cleaned['text'].apply(word_tokenize_wrapper)

# Build the slang -> standard-form lookup from the first two columns of the
# slang CSV. When a slang word is listed more than once, the first row wins.
kamus_normalisasi = pd.read_csv("data/slang.csv")
kata_normalisasi_dict = {}
# Columns are addressed by position through .iloc: integer keys on a
# label-indexed row (row[0], row[1]) rely on a positional fallback that pandas
# has deprecated. Zipping the two columns also avoids building one Series per
# row, which is what iterrows() does.
for slang, baku in zip(kamus_normalisasi.iloc[:, 0], kamus_normalisasi.iloc[:, 1]):
    if slang not in kata_normalisasi_dict:
        kata_normalisasi_dict[slang] = baku

def normalisasi_kata(document, kamus=None):
    """Replace slang tokens with their standard form and join the result.

    Args:
        document: Iterable of word tokens (for example a list of ``str``).
        kamus: Optional mapping from slang word to a scalar replacement
            (text, ``None`` or NaN). When omitted, the module-level
            ``kata_normalisasi_dict`` is used.

    Returns:
        A single ``str`` with the resulting words separated by one space.
        A replacement made of several words contributes each word separately;
        a token without a usable replacement is kept unchanged.
    """
    if kamus is None:
        kamus = kata_normalisasi_dict

    normalized_words = []
    for term in document:
        replacement = kamus.get(term)
        # pd.isna covers both "term is not in the mapping" (None) and an
        # empty cell in the source table (NaN).
        if pd.isna(replacement):
            pieces = []
        else:
            pieces = str(replacement).split()
        # No pieces means the replacement is missing, empty or whitespace-only:
        # keep the original token instead of silently dropping the word.
        normalized_words.extend(pieces if pieces else [term])
    return ' '.join(normalized_words)

# Run the slang normalisation over every entry of the 'text' column; the
# function returns a joined string, so the column holds plain strings again.
train_cleaned['text'] = train_cleaned['text'].map(normalisasi_kata)

train_cleaned.head()

Unnamed: 0,text,label
0,kunjung prabowo resmi serah proyek bantu bersi...,Sumber Daya Alam
1,Anies tepuk tangan riah rektor wajib kuliah ko...,Politik
2,benar dukung goblok dukung ridwan kamil skema ...,Demografi
3,Anies sikap kritis kerja prabowo anggap tidak ...,Politik
4,Anies baswedan harap polri pegang sumpah milu,Politik


#### Stemming Hasil Normalisasi

In [25]:
def load_stopwords(filepath):
    """Read a stopword list (one word per line) into a set.

    Args:
        filepath: Path of a text file with one stopword per line.

    Returns:
        set[str]: The stopwords with surrounding whitespace stripped. Blank
        lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # An explicit encoding keeps the result independent of the platform
    # default; 'utf-8-sig' additionally drops a leading byte-order mark that
    # would otherwise stick to the first word.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        lines = file.read().splitlines()
    # Strip stray whitespace so "kata " still matches "kata", and skip blanks
    # so the empty string never becomes a stopword.
    stripped = (line.strip() for line in lines)
    return {word for word in stripped if word}


# Build one stemmer from the Sastrawi factory up front; the same instance is
# handed to the preprocessing call further down and reused for every text.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def reduce_repeated_characters(text):
    """Cap every run of three or more identical characters at two.

    For example ``"haloooo"`` becomes ``"haloo"``. Runs of one or two
    characters are left untouched.
    """
    run_pattern = re.compile(r'(.)\1{2,}')
    collapsed = run_pattern.sub(r'\1\1', text)
    return collapsed

def preprocess_text(text, stop_words, stemmer):
    """Clean, filter and stem one text.

    Steps, in order:
      1. Lowercase the text and cap character runs via
         ``reduce_repeated_characters``.
      2. Remove hashtags, tokens introduced by ``/``, ``@`` or ``+``, the
         standalone word ``rt``, the listed punctuation, every word of one to
         four characters, bracketed ``[...]`` spans and the literal
         ``http``/``https``.
      3. Replace any remaining character other than ASCII letters, digits and
         spaces with a space.
      4. Strip trailing digits from words ("kata2" -> "kata").
      5. Drop stopwords, stem each remaining word and join with single spaces.

    Args:
        text: Input text as ``str``.
        stop_words: Container of lowercase stopwords supporting ``in``.
        stemmer: Object exposing ``stem(word)`` that returns a ``str``.

    Returns:
        The stemmed words joined by single spaces (``''`` if nothing is left).
    """
    text = text.lower()
    text = reduce_repeated_characters(text)
    # Hashtags are removed as a whole before '#' itself is stripped below.
    text = re.sub(r"#\w+", ' ', text)
    # The former "rt(?=\W)" alternative is intentionally gone: it also cut the
    # tail off longer words ("smart " -> "sma "). A standalone "rt" is still
    # removed by "\brt\b" (and by the short-word rule).
    text = re.sub(r"\s*[/@+]\w+|\brt\b|[=,.()/:#!?'&-]\s*|\b\w{1,4}\b|\[[^\]]*\]|(?<=\w),(?=\w)|https?", ' ', text)
    text = re.sub(r'[^A-Za-z0-9 ]', ' ', text)
    # The prefix is lazy and must end in a letter so that the whole trailing
    # digit run is dropped; a greedy "\w+" swallowed all digits but the last
    # one ("covid19" -> "covid1").
    text = re.sub(r'\b(\w*?[A-Za-z])\d+\b', r'\1', text)
    # The text is already lowercase, so words are compared to stop_words as-is.
    kept_words = [word for word in text.split() if word not in stop_words]
    return ' '.join(stemmer.stem(word) for word in kept_words)


def preprocess_dataframe(df, column_name, stop_words, stemmer):
    """Run ``preprocess_text`` over one column of ``df``.

    Each value of ``df[column_name]`` is passed to ``preprocess_text`` together
    with ``stop_words`` and ``stemmer``. The column is overwritten on the frame
    that was passed in, and that same frame object is returned.
    """
    def _clean(value):
        return preprocess_text(value, stop_words, stemmer)

    df[column_name] = df[column_name].apply(_clean)
    return df
    

# Load the stopword list stored in the data folder.
stopwords = load_stopwords(os.path.join(data_path, 'stopwords_indonesia.txt'))
# Work on a copy: preprocess_dataframe overwrites the column on the frame it
# receives.
df = train_cleaned.copy()
# NOTE(review): train_cleaned is rebound from its own previous value, so
# running this cell twice preprocesses already-processed text.
# NOTE(review): dropna() removes missing values only; a text reduced to an
# empty string is kept — confirm that is intended.
train_cleaned = preprocess_dataframe(df, 'text', stopwords, stemmer).dropna()


# Write the processed frame to the data folder, without the index column.
csv_file = "train_cleaned_3.csv"
train_cleaned.to_csv(os.path.join(data_path, csv_file), index=False)
print("Write CSV Done.")
# To resume from the saved file instead of recomputing, uncomment:
# train_cleaned = pd.read_csv(os.path.join(data_path, csv_file), delimiter=',')
train_cleaned.head()

Write CSV Done.


Unnamed: 0,text,label
0,kunjung prabowo resmi serah proyek bantu bersi...,Sumber Daya Alam
1,anies tepuk tangan rektor wajib kuliah korupsi...,Politik
2,dukung goblok dukung ridwan kamil skema mayori...,Demografi
3,anies sikap kritis kerja prabowo anggap sopan ...,Politik
4,anies baswedan harap polri pegang sumpah,Politik
