# Import Libraries

In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

# Preprocessing

## Import Data

In [2]:
# Load the raw review dataset from a CSV file given by a path relative to the
# working directory. Later cells read its 'english_review' and 'label' columns.
criticism_df = pd.read_csv('criticism_dataset.csv')

## Prepare the Data

In [3]:
# Lazily-filled cache for the NLTK objects shared by every review_to_words call.
_REVIEW_RESOURCES = {}


def _get_review_resources():
    """Return the shared ``(lemmatizer, stop-word set)`` pair, building it on first use.

    ``review_to_words`` is applied once per row, so these objects are created a
    single time and reused instead of being rebuilt on every call.
    """
    if not _REVIEW_RESOURCES:
        # Build both objects before touching the cache so that a failure
        # (e.g. a missing NLTK corpus) never leaves it half-populated.
        lemmatizer = WordNetLemmatizer()
        stops = frozenset(stopwords.words("english"))
        _REVIEW_RESOURCES.update(lemmatizer=lemmatizer, stops=stops)
    return _REVIEW_RESOURCES['lemmatizer'], _REVIEW_RESOURCES['stops']


def review_to_words(raw_review):
    """Clean one raw review into a space-separated string of lemmatized words.

    Steps, applied per token: strip HTML markup, tokenize, lower-case, remove
    every character that is not an ASCII letter, lemmatize, and drop English
    stop words.

    Args:
        raw_review: The review text, possibly containing HTML. Non-string
            values (for example NaN from a missing CSV cell) are treated as
            an empty review.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    # Missing values arrive from pandas as float NaN; there is nothing to clean.
    if not isinstance(raw_review, str):
        return ""

    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()

    lemmatizer, stops = _get_review_resources()

    meaningful_words = []
    # 2. Tokenize words
    for token in word_tokenize(review_text):
        # 3-4. Lower-case, then strip non-alphabetic characters and numbers
        word = re.sub("[^a-zA-Z]", "", token.lower())
        if not word:
            # The token was pure punctuation/digits; keeping the empty string
            # would leave doubled or trailing spaces in the joined result.
            continue

        # 5. Lemmatization
        word = lemmatizer.lemmatize(word)

        # 6-7. Remove stop words
        if word not in stops:
            meaningful_words.append(word)

    # 8. Join the words back into one string separated by space
    return " ".join(meaningful_words)

In [4]:
# Build a cleaned copy of the dataset: every review is passed through
# review_to_words, while the original criticism_df is left untouched.
preprocessed_criticism_df = criticism_df.assign(
    english_review=criticism_df['english_review'].apply(review_to_words)
)

  review_text = BeautifulSoup(raw_review, 'lxml').get_text()


## Split the Data

In [5]:
# Model input is the cleaned review text; the target is the label column.
X, y = (
    preprocessed_criticism_df['english_review'],
    preprocessed_criticism_df['label'],
)

In [6]:
# Split off 20% of the data as a temporary hold-out, then divide that hold-out
# evenly into validation and test sets (80/10/10 overall). Both splits are
# stratified on the label and use a fixed random_state so they are repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2024)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=2024)

### Export the split data

In [7]:
# X = criticism_df[['place_id','english_review']]
# y = criticism_df['label']

# # Split the data into train and a temporary hold-out for validation+test (20% of the data)
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2024)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=2024)

# # Combine train and validation data
# X_train_val = pd.concat([X_train, X_val])
# y_train_val = pd.concat([y_train, y_val])

# # Combine train, validation, and test data
# X_all = pd.concat([X_train, X_val, X_test])
# y_all = pd.concat([y_train, y_val, y_test])

# # Create DataFrames
# train_val_df = pd.concat([X_train_val, y_train_val], axis=1)
# all_df = pd.concat([X_all, y_all], axis=1)

# # Save DataFrames to CSV
# train_val_df.to_csv('train_validation.csv', index=False)
# all_df.to_csv('train_validation_test.csv', index=False)

# Modeling

In [8]:
# Handles (URLs) for the BERT text-preprocessing model and the BERT encoder.
# Nothing is loaded here; build_classifier_model passes them to hub.KerasLayer.
# Per its handle name, the encoder is a small uncased English BERT (L-2, H-128, A-2).
tfhub_handle_preprocess = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
tfhub_handle_encoder = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-2-h-128-a-2/2"

In [9]:
def build_classifier_model():
    """Assemble the binary review classifier on top of a TF Hub BERT encoder.

    A scalar string input goes through the hub preprocessing layer and the
    trainable BERT encoder (handles come from the module-level
    ``tfhub_handle_preprocess`` / ``tfhub_handle_encoder``). The encoder's
    ``pooled_output`` is then passed through dropout (rate 0.1) and a single
    sigmoid unit.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping raw text to the sigmoid output.
    """
    layers = tf.keras.layers

    raw_text = layers.Input(shape=(), dtype=tf.string, name='text')
    bert_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)
    bert_outputs = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')(bert_inputs)

    pooled = bert_outputs['pooled_output']
    regularized = layers.Dropout(0.1)(pooled)
    # A single sigmoid unit, since this is binary classification.
    probability = layers.Dense(1, activation='sigmoid', name='classifier')(regularized)

    return tf.keras.Model(raw_text, probability)

In [10]:
# Build and compile the classifier.
# The BERT encoder is created with trainable=True, i.e. it is fine-tuned. The
# string 'adam' would use Keras' default learning rate (1e-3), which is far
# above the 1e-5..1e-4 range normally used for BERT fine-tuning and tends to
# destabilise the pretrained weights. Use an explicit, small learning rate
# instead (3e-5 is the value used in the TensorFlow BERT classification
# tutorial; treat it as a tunable starting point).
classifier_model = build_classifier_model()
classifier_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
                         loss='binary_crossentropy',
                         metrics=['accuracy'])

In [11]:
# Checkpoint callback: during training, save the full model (not just the
# weights) to checkpoint_filepath, overwriting it only when the monitored
# validation loss reaches a new minimum.
checkpoint_filepath = 'FeedbackClassifier.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [12]:
# Train the classifier for 5 epochs on the training split in batches of 32,
# using the validation split as validation data. The checkpoint callback
# persists the best model seen so far, and `history` keeps the object
# returned by fit().
history = classifier_model.fit(X_train, y_train,
                               validation_data=(X_val, y_val),
                               epochs=5,
                               batch_size=32,
                               callbacks=[model_checkpoint_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
