In [3]:
import pandas as pd 
import numpy as np  
import seaborn as sns
import matplotlib.pyplot as plt
import re
import tensorflow as tf
import json
import pickle

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# NLTK resources used by this notebook. The recorded output shows that a call
# only reports "already up-to-date" when the package is present locally.
nltk.download('averaged_perceptron_tagger')

nltk.download('stopwords')
nltk.download('punkt')
# The notebook builds a WordNetLemmatizer and calls .lemmatize() during
# preprocessing; that lookup needs the 'wordnet' corpus and raises LookupError
# on a fresh environment where it has never been downloaded.
nltk.download('wordnet')

from wordcloud import WordCloud
from textblob import TextBlob
from collections import Counter

import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import load_model


import gensim
from gensim.models import KeyedVectors
print("Gensim imported successfully")

from imblearn.over_sampling import RandomOverSampler

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import warnings
warnings.filterwarnings('ignore')

Gensim imported successfully


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\handw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\handw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\handw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Step 1: Load the saved Keras model (path is relative to the working directory)
model = load_model("model_lstm.h5")

# Step 2: Load the Tokenizer
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only open a tokenizer.pkl that comes from a trusted source.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Step 3: Define Sequence Length
# Use the same sequence length you used during training; this value is passed
# as `maxlen` when the input is padded before prediction.
# NOTE(review): 100 is hard-coded here - confirm it against the training run.
sequence_length = 100

In [5]:
# Sample comment that is cleaned and classified by the cells below
input_text = "Keren banget videonya! Penjelasannya jelas dan mudah dimengerti. Ditunggu konten selanjutnya, semangat terus!"



In [6]:
def load_slang_txt(file_path):
    """Read a UTF-8 text file whose whole content is one JSON document.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed JSON value. If the content is not valid JSON, an error line
        is printed and an empty dict is returned instead.

    Raises:
        OSError: Propagated unchanged when the file cannot be opened
            (for example, when it does not exist).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_json = handle.read()

    try:
        return json.loads(raw_json)
    except json.JSONDecodeError:
        # Best-effort: report the problem and fall back to "no slang entries".
        print(f"Error decoding JSON in the file: {file_path}")
        return {}

def load_slang_csv(file_path):
    """Build a lookup dict from the first two columns of a CSV file.

    The file is decoded as ISO-8859-1 and read with pandas' default header
    handling, so its first line is taken as column names rather than data.
    NOTE(review): if the CSV has no header row, its first entry is lost -
    confirm against the actual file.

    Args:
        file_path: Path of the CSV file.

    Returns:
        dict: first-column value -> second-column value, row by row; when a
        key repeats, the last row wins.
    """
    frame = pd.read_csv(file_path, encoding='ISO-8859-1')
    slang_terms = frame.iloc[:, 0]
    replacements = frame.iloc[:, 1]
    return {term: formal for term, formal in zip(slang_terms, replacements)}

# --- Slang dictionaries -----------------------------------------------------
# Two dictionaries come from files in the working directory, one is defined
# inline; all three are merged into `slang_dict` further down.
slang_txt_path = 'combined_slang_words.txt'
slang_dict_txt = load_slang_txt(slang_txt_path)

slang_csv_path = 'new_kamusalay.csv'
slang_dict_csv = load_slang_csv(slang_csv_path)

# Hand-written additions (slang token -> replacement)
slang_dict_tambahan = {
    "gw": "saya", "mau": "ingin", "ni": "ini", "aja": "saja",
    "gak": "tidak", "bgt": "sangat", "klo": "kalau", "bgs": "bagus",
    "masi": "masih", "msh": "masih", "lom": "belum", "blm": "belum",
    "ap": "apa", "brg": "barang", "ad": "ada", "blom": "belum",
    "kebli": "kebeli", "tp": "tapi", "org": "orang", "tdk": "tidak",
    "yg": "yang", "kalo": "kalau", "sy": "saya", "bng": "abang",
    "bg": "abang", "fto": "foto", "spek": "spesifikasi", "cm": "cuma",
    "jg": "juga", "pd": "pada", "skrg": "sekarang", "ga": "tidak",
    "gk": "tidak", "batre": "baterai", "gue": "saya", "dpt": "dapat",
    "kek": "seperti", "mna": "mana", "mnding": "mending", "mend": "mending",
    "dr": "dari", "sma": "sama", "drpada": "daripada",
}

# Merge order sets precedence: entries from the txt file override the inline
# additions, and entries from the csv file override both.
slang_dict = dict(slang_dict_tambahan)
slang_dict.update(slang_dict_txt)
slang_dict.update(slang_dict_csv)

# --- Stopwords (adjusted) ---------------------------------------------------
# Words listed here are kept in the text even though NLTK may list them as
# Indonesian stopwords.
# NOTE(review): 'mau' is rewritten to 'ingin' by slang_dict_tambahan before
# stopword filtering runs in text_preprocessing, so retaining it here may have
# no effect unless a file-based dictionary overrides that entry - confirm.
retain_words = ['baru', 'lama', 'sama', 'tapi', 'tidak', 'dari', 'belum', 'bagi', 'mau', 'masalah']
stpwds_id = [
    stopword
    for stopword in set(stopwords.words('indonesian'))
    if stopword not in retain_words
]

# --- Lemmatizer -------------------------------------------------------------
# Shared instance used by text_preprocessing.
lemmatizer = WordNetLemmatizer()

# Function to Replace Slang
def replace_slang_in_text(text, slang_dict):
    """Swap every whitespace-separated token that is a key of ``slang_dict``.

    Args:
        text: Input string; it is split on any run of whitespace.
        slang_dict: Mapping of token -> replacement. Lookup is exact
            (case-sensitive, whole token).

    Returns:
        str: The tokens, replaced where a mapping exists, joined by single
        spaces. Leading, trailing and repeated whitespace is therefore
        collapsed.
    """
    normalized = []
    for token in text.split():
        # Unknown tokens fall through unchanged.
        normalized.append(slang_dict.get(token, token))
    return ' '.join(normalized)

# Informal Phrases
# Multi-word phrase -> replacement text; text_preprocessing applies these to
# the lowercased, punctuation-stripped text before slang replacement.
informal_phrases = {
    "sat set sat set": "cepat",  # whole phrase collapses to one token
    "ya mas": "",                # removed entirely (empty replacement)
}

# Preprocessing Function
def text_preprocessing(text, slang_dict):
    """Clean one raw comment and return it as a single space-joined string.

    Steps, in order: lowercase; strip @mentions, #hashtags, newlines and URLs;
    replace every character that is not a word character, whitespace or an
    apostrophe with a space; rewrite the entries of the module-level
    ``informal_phrases``; map slang via ``replace_slang_in_text``; tokenize
    with NLTK ``word_tokenize``; drop tokens found in ``stpwds_id``; pass each
    token through the module-level ``lemmatizer``; apply a few hard-coded
    token overrides; finally drop repeated tokens, keeping first occurrences.

    NOTE(review): the result is fed to a tokenizer that was fitted elsewhere;
    presumably training used the same cleaning steps - confirm before changing
    any rule here.

    Args:
        text: Raw input string.
        slang_dict: Mapping of slang token -> replacement, forwarded to
            ``replace_slang_in_text``.

    Returns:
        str: The cleaned, de-duplicated tokens joined by single spaces
        (may be empty).
    """
    # Case folding (convert text to lowercase)
    text = text.lower()

    # Remove mentions, hashtags, and newlines
    text = re.sub(r"@[\w]+|#[\w]+|\n", " ", text)

    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix matches, not any word that merely starts with "www".
    text = re.sub(r"http\S+|www\.\S+", " ", text)

    # Replace punctuation/symbols with spaces. Word characters (letters,
    # digits, underscore), whitespace and apostrophes are kept.
    text = re.sub(r"[^\w\s']", " ", text)

    # Replace informal phrases, but only where they stand as whole words.
    # A plain substring replace would also hit the inside of ordinary text,
    # e.g. "ya mas" inside "saya masih" or "biaya masuk".
    for phrase, replacement in informal_phrases.items():
        pattern = r"(?<!\w)" + re.escape(phrase) + r"(?!\w)"
        # A callable replacement keeps the replacement text literal.
        text = re.sub(pattern, lambda _match: replacement, text)

    # Replace slang terms (also collapses runs of whitespace)
    text = replace_slang_in_text(text, slang_dict)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stpwds_id]

    # Lemmatization with the shared module-level lemmatizer
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Manual token overrides (no stemmer is applied in this function)
    stemming_exceptions = {"terasa": "terasa", "sat": "cepat", "set": "cepat"}
    tokens = [stemming_exceptions.get(word, word) for word in tokens]

    # Reassemble the text; dict.fromkeys drops repeated tokens while keeping
    # the order of first occurrence.
    text = ' '.join(dict.fromkeys(tokens))

    return text

# Clean the sample comment with the merged slang dictionary
processed_text = text_preprocessing(text=input_text, slang_dict=slang_dict)

# Show the comment before and after cleaning
print(
    f"Original Text: {input_text}",
    f"Processed Text: {processed_text}",
    sep="\n",
)

Original Text: Keren banget videonya! Penjelasannya jelas dan mudah dimengerti. Ditunggu konten selanjutnya, semangat terus!
Processed Text: keren banget videonya penjelasannya mudah dimengerti ditunggu konten semangat


In [12]:
# Convert the cleaned text into the integer sequence expected by the model
tokenized_input = tokenizer.texts_to_sequences([processed_text])

# If no token was mapped to an index, the padded input carries no information
# and the prediction below reflects padding only. Report it instead of
# silently printing a label.
if not tokenized_input or len(tokenized_input[0]) == 0:
    print("Warning: no token of the processed text was mapped by the tokenizer; "
          "the prediction is based on padding only.")

# Pad/truncate to the fixed length used by the model.
# NOTE(review): padding/truncating sides are left at the library defaults -
# confirm they match what was used during training.
padded_input = tf.keras.preprocessing.sequence.pad_sequences(tokenized_input, maxlen=sequence_length)

# Ensure the input is in the correct shape
print(f"Processed Input Shape: {padded_input.shape}")

# Predict using the model (make sure the model is loaded)
predictions = model.predict(padded_input)

# Check if predictions have the expected shape
if predictions.shape[0] == 0:
    raise ValueError("Model returned an empty prediction. Verify the input and model setup.")

# Class index -> label.
# NOTE(review): the recorded run labels an apparently positive comment as
# "Negative" - confirm that this index order matches the label encoding used
# during training.
sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}

# Interpret the prediction: index of the highest score for the single input row
predicted_class = np.argmax(predictions, axis=1)[0]
predicted_sentiment = sentiment_mapping.get(predicted_class, "Unknown")  # Default to "Unknown" if not found

# Print the result
print(f"Input Text: {input_text}")
print(f"Processed Sequence: {processed_text}")
print(f"Predicted Sentiment: {predicted_sentiment}")

Processed Input Shape: (1, 100)
Input Text: Keren banget videonya! Penjelasannya jelas dan mudah dimengerti. Ditunggu konten selanjutnya, semangat terus!
Processed Sequence: keren banget videonya penjelasannya mudah dimengerti ditunggu konten semangat
Predicted Sentiment: Negative
