In [1]:
# data manipulation
import pandas as pd

# feature engineering
from nltk.tokenize import word_tokenize
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# model definition & training
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# save model
import pickle

# ignore warning
import warnings
warnings.filterwarnings('ignore')

# show all columns
pd.set_option('display.max_columns', None)

# download the NLTK resources: stopwords, punkt, wordnet
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)
# kept as the cell's final expression so its return value is still displayed
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Load Model
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load a text_vectorization.pkl that comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('text_vectorization.pkl', 'rb') as vectorizer_file:
    vectorization_data = pickle.load(vectorizer_file)

# Rebuild the TextVectorization layer from its saved config and weights
vectorizer = TextVectorization.from_config(vectorization_data['config'])
vectorizer.set_weights(vectorization_data['weights'])

# Load the saved Keras model from model.h5
model = tf.keras.models.load_model('model.h5')

In [71]:
# Dummy data: five short review texts in a single 'text' column
df_dummy = pd.DataFrame(
    [
        'This movie have bad plot!',
        'Wishlist Able!',
        'Must Watch!',
        'Must Skip!',
        'Really Good Movie!!',
    ],
    columns=['text'],
)

df_dummy

Unnamed: 0,text
0,This movie have bad plot!
1,Wishlist Able!
2,Must Watch!
3,Must Skip!
4,Really Good Movie!!


In [72]:
# make function to clean text
def clean_text(text):
    """Strip symbols from `text`, trim it, and lowercase it.

    Every character that is not an ASCII letter, a digit, or whitespace is
    deleted (not replaced by a space). The result then has its leading and
    trailing whitespace removed and is converted to lowercase.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return alnum_only.strip().lower()

In [73]:
# Define stopword: de-duplicated NLTK English stopwords, followed by a few
# extra tokens (apostrophe-less contractions and 'br')
en_sword = list(set(stopwords.words('english')))
en_sword.extend(['dont', 'br', 'cant', 'isnt'])

# Define lemmatizer
lemmatizer = WordNetLemmatizer()

In [74]:
# create a function for text preprocessing

def text_preprocessing(text):
  """Normalize a review string into a space-joined string of lemmatized tokens.

  Steps, in order: lowercase, drop @mentions and #hashtags, drop literal
  backslash-n sequences, trim, drop URLs, replace non-letter characters with
  spaces, delete apostrophes, tokenize, remove stopwords (`en_sword`),
  lemmatize (`lemmatizer`), and re-join with single spaces.

  Parameters
  ----------
  text : str
      Text to normalize.

  Returns
  -------
  str
      The remaining lemmatized tokens joined by single spaces.
  """
  # case folding
  text = text.lower()

  # mention removal (text is already lowercase, so a-z covers every ASCII letter)
  text = re.sub(r"@[a-z0-9_]+", " ", text)

  # hashtags removal
  text = re.sub(r"#[a-z0-9_]+", " ", text)

  # escaped-newline removal: drops the two-character sequence backslash + n
  text = re.sub(r"\\n", " ", text)

  # whitespace removal
  text = text.strip()

  # url removal: \S+ consumes the rest of the URL up to the next whitespace.
  # The previous patterns used \s+ (whitespace), so URLs were never removed.
  # NOTE(review): confirm the training pipeline uses these same patterns so
  # inference-time preprocessing matches what the model was trained on.
  text = re.sub(r"http\S+", " ", text)
  text = re.sub(r"www\.\S+", " ", text)

  # non-letter removal (such as emoticon, symbol (like μ, $, 兀), etc.);
  # apostrophes are kept by the first pattern and then deleted outright, so
  # contractions collapse into one token instead of being split in two
  text = re.sub(r"[^a-z\s']", " ", text)
  text = text.replace("'", "")

  # tokenization
  tokens = word_tokenize(text)

  # stopwords removal
  tokens = [word for word in tokens if word not in en_sword]

  # lemmatizing
  tokens = [lemmatizer.lemmatize(word) for word in tokens]

  # combining tokens
  return ' '.join(tokens)

In [75]:
# run clean_text on every dummy review (overwrites the 'text' column)
df_dummy['text'] = df_dummy['text'].map(clean_text)

In [76]:
# Apply text_preprocessing to each cleaned text and inspect the result
data_preprocessing = df_dummy['text'].apply(text_preprocessing)
data_preprocessing

0       movie bad plot
1        wishlist able
2           must watch
3            must skip
4    really good movie
Name: text, dtype: object

In [77]:
# Vectorize
# Pass the preprocessed texts through the loaded `vectorizer`; the result is
# what gets handed to model.predict.
data_vect = vectorizer(data_preprocessing)

In [78]:
# Predict one probability row per vectorized text in data_vect
predicted_result_proba = model.predict(data_vect)

# Define the threshold
threshold = 0.5

# Loop over each prediction and print the results.
# The loop variable must not be called `data_preprocessing`: reusing that name
# rebinds the Series to its last string, so re-running this cell (or the
# vectorize cell) would then operate on a single string.
for review_text, proba in zip(data_preprocessing, predicted_result_proba):
    # each row holds a single probability (see the printed output), so compare
    # its only element as a scalar instead of relying on array truthiness
    if proba[0] > threshold:
        predict_result = 'Positive Review (Recommended Film)'
    else:
        predict_result = 'Negative Review (Not Really Recommended Film)'

    print(f"Text: {review_text}")
    print(f"Predicted proba: {proba}")
    print(f"Predicted label: {predict_result}\n")

Text: movie bad plot
Predicted proba: [0.01273783]
Predicted label: Negative Review (Not Really Recommended Film)

Text: wishlist able
Predicted proba: [0.47953045]
Predicted label: Negative Review (Not Really Recommended Film)

Text: must watch
Predicted proba: [0.7617916]
Predicted label: Positive Review (Recommended Film)

Text: must skip
Predicted proba: [0.8231908]
Predicted label: Positive Review (Recommended Film)

Text: really good movie
Predicted proba: [0.2317635]
Predicted label: Negative Review (Not Really Recommended Film)



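Model masih belum cukup baik dalam memprediksi: misalnya, "must skip" diprediksi positif (0,82) sedangkan "really good movie" diprediksi negatif (0,23). Kemungkinan penyebabnya adalah training yang tidak mencakup keseluruhan data.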