In [1]:
!pip install gdown



In [2]:
pip install stanza

Collecting stanza
  Downloading stanza-1.7.0-py3-none-any.whl (933 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m933.2/933.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji (from stanza)
  Downloading emoji-2.9.0-py2.py3-none-any.whl (397 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.5/397.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji, stanza
Successfully installed emoji-2.9.0 stanza-1.7.0


In [3]:
!gdown --id '1LgI2Kh4XfwCf_9FtHRS51nmdMx0sfvhd'

Downloading...
From: https://drive.google.com/uc?id=1LgI2Kh4XfwCf_9FtHRS51nmdMx0sfvhd
To: /content/review.xlsx
100% 2.78M/2.78M [00:00<00:00, 170MB/s]


In [4]:
import pandas as pd
import stanza
import string
import re
import json
import tensorflow as tf
import numpy as np
from pathlib import Path
from collections import defaultdict
from keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences

In [5]:
import os
# Hide all CUDA devices from this process (presumably to force CPU-only execution — confirm).
# NOTE(review): this is set after TensorFlow was imported in the previous cell;
# verify that the setting still takes effect in the target environment.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [6]:
def pad_or_truncate_sequences(sequences, maxlen):
    """Bring every sequence to length ``maxlen`` by padding or cutting at its end.

    Args:
        sequences: Sequences of token ids, as accepted by ``pad_sequences``.
        maxlen: Target length of each returned row.

    Returns:
        The result of ``pad_sequences`` with both padding and truncating set to 'post'.
    """
    side = 'post'
    return pad_sequences(sequences, maxlen=maxlen, padding=side, truncating=side)

class AspectOpinionExtractor:
    """Extract (aspect, opinion) pairs from text with a Stanza pipeline.

    An aspect is a NOUN/PRON token (optionally joined with an immediately
    following NOUN); an opinion is an ADJ token attached to it in the
    dependency parse, prefixed with 'tidak ' when the adjective has a
    dependent whose text is 'tidak'.
    """
    def __init__(self):
        # Stanza pipeline for language code 'id', using its default processors.
        self.nlp = stanza.Pipeline('id')

    def extract_aspects_and_opinions(self, text):
        """Return two parallel lists ``(aspects, opinions)`` found in ``text``.

        Args:
            text: Input passed directly to the Stanza pipeline.

        Returns:
            A tuple ``(aspects, opinions)`` of equal-length lists of strings;
            ``aspects[k]`` is the noun (or two-noun compound) and
            ``opinions[k]`` the adjective paired with it. Both are empty when
            no pair is found.
        """
        doc = self.nlp(text)
        aspect_opinions = []
        for sentence in doc.sentences:
            words = sentence.words
            # Manual index loop: the index is advanced an extra step when a compound noun is consumed.
            i = 0
            while i < len(words):
                word = words[i]
                if word.upos in ['NOUN', 'PRON'] and word.text.lower() != 'sayang':  # Skip the token 'sayang' as an aspect
                    # Join the token with the next one when that one is a NOUN (two-word compound).
                    compound_noun = word.text
                    if i < len(words) - 1 and words[i + 1].upos == 'NOUN':
                        compound_noun += ' ' + words[i + 1].text
                        i += 1
                    # Look for tokens whose dependency head is this word or the token at words[i + 1].
                    # NOTE(review): if a compound was consumed above, i has already advanced, so
                    # words[i + 1] is the token after the second noun, not the second noun itself.
                    # Without a compound it is simply the next token. Confirm this is intended.
                    for potential_child in sentence.words:
                        if potential_child.head == int(word.id) or (i < len(words) - 1 and potential_child.head == int(words[i + 1].id)):
                            if potential_child.upos == 'ADJ':  # Only adjectives count as opinions
                                # Scan the adjective's dependents for the negation word 'tidak'.
                                for potential_modifier in sentence.words:
                                    if potential_modifier.head == int(potential_child.id) and potential_modifier.text.lower() == 'tidak':
                                        aspect_opinions.append((compound_noun, 'tidak ' + potential_child.text))
                                        break
                                else:
                                    # for/else: reached only when no 'tidak' dependent was found.
                                    aspect_opinions.append((compound_noun, potential_child.text))
                i += 1
        # Split the collected pairs into two parallel lists.
        aspects = [ao[0] for ao in aspect_opinions]
        opinions = [ao[1] for ao in aspect_opinions]
        return aspects, opinions

def create_tokenizer(texts, max_features):
    """Create a Keras ``Tokenizer`` and fit it on ``texts``.

    Args:
        texts: Iterable of strings to fit on (may be empty).
        max_features: Value passed as the tokenizer's ``num_words``.

    Returns:
        The fitted ``Tokenizer`` instance, splitting on single spaces.
    """
    fitted = Tokenizer(split=' ', num_words=max_features)
    fitted.fit_on_texts(texts)
    return fitted

# Parsed slang lexicons keyed by file path, so the JSON is read from disk only once
# per path instead of once per call (preprocess_text is applied row by row).
_SLANG_DICT_CACHE = {}


def _load_slang_dict(slang_path):
    """Return the slang -> formal word mapping stored as JSON at ``slang_path`` (cached)."""
    if slang_path not in _SLANG_DICT_CACHE:
        with open(slang_path, 'r', encoding='utf-8') as f:
            _SLANG_DICT_CACHE[slang_path] = json.load(f)
    return _SLANG_DICT_CACHE[slang_path]


def preprocess_text(text, slang_path='/content/_json_colloquial-indonesian-lexicon.txt'):
    """Lower-case ``text`` and replace slang words using a JSON lexicon.

    Args:
        text: Value to normalise; it is converted with ``str()`` first, so
            non-string inputs (numbers, lists, NaN) are stringified as-is.
        slang_path: Path to a JSON file mapping slang words to replacements.
            Defaults to the lexicon location used by this notebook.

    Returns:
        The normalised text: lower-cased, split on whitespace, each word
        replaced by its lexicon entry when one exists, and re-joined with
        single spaces.

    Raises:
        OSError: If the lexicon file cannot be opened on first use.
        json.JSONDecodeError: If the lexicon file is not valid JSON.
    """
    slang_dict = _load_slang_dict(slang_path)
    words = str(text).lower().split()
    return ' '.join(slang_dict.get(word, word) for word in words)

class SentimentModel(tf.keras.Model):
    """Binary sentiment classifier over "aspect opinion" phrases.

    Wraps a small Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense network
    (held in ``self.model``) together with the tokenizer fitted on the
    training phrases, so that training and inference share one vocabulary
    and one padded sequence length.

    Args:
        df: DataFrame with string columns 'aspek', 'opini' and 'sentimen'.
        max_features: Vocabulary size for the tokenizer and embedding layer.
        embed_dim: Embedding dimension.
        conv_out: Number of Conv1D filters.
        batch_size: Batch size used for fit/evaluate.
        epochs: Number of training epochs.
    """

    def __init__(self, df, max_features, embed_dim, conv_out, batch_size, epochs):
        super(SentimentModel, self).__init__()
        self.df = df
        self.max_features = max_features
        self.embed_dim = embed_dim
        self.conv_out = conv_out
        self.batch_size = batch_size
        self.epochs = epochs
        # Start with an unfitted tokenizer; preprocess() fits it on the training phrases.
        self.tokenizer = create_tokenizer([], max_features)
        # Padded sequence length the network is built for; set by build_model().
        self.max_len = None
        # The Stanza-based extractor is expensive to construct, so it is created once on first predict().
        self._extractor = None
        self.model = self.build_model()

    def get_config(self):
        """Return the constructor arguments as a dict."""
        return {"df": self.df,
                "max_features": self.max_features,
                "embed_dim": self.embed_dim,
                "conv_out": self.conv_out,
                "batch_size": self.batch_size,
                "epochs": self.epochs}

    def build_model(self):
        """Build and compile the CNN, sizing its input from the preprocessed data."""
        # Use the real padded width of the data and remember it, so inference pads identically.
        self.max_len = int(self.preprocess()[0].shape[1])
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(self.max_features, self.embed_dim, input_length=self.max_len),
            tf.keras.layers.Conv1D(self.conv_out, 3, activation='relu'),
            tf.keras.layers.GlobalMaxPooling1D(),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')  # single sigmoid unit for the binary label
        ])
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def preprocess(self):
        """Tokenise the training phrases and return a train/test split.

        Returns:
            ``X_train, X_test, Y_train, Y_test`` as produced by
            ``train_test_split`` with ``test_size=0.2`` (unseeded, so the
            split differs between calls).
        """
        X = self.df['aspek'] + ' ' + self.df['opini']
        X = [preprocess_text(text) for text in X]
        Y = self.df['sentimen']
        le = LabelEncoder()
        Y = le.fit_transform(Y)
        Y = Y.reshape(-1, 1)
        self.tokenizer.fit_on_texts(X)
        X = self.tokenizer.texts_to_sequences(X)
        # max_len is None on the first call (pad to the longest phrase); afterwards it
        # pins the width to the one the network was built with.
        X = pad_sequences(X, maxlen=self.max_len)
        return train_test_split(X, Y, test_size=0.2)

    def train(self):
        """Fit the network and return ``[loss, accuracy]`` on the held-out split."""
        X_train, X_test, Y_train, Y_test = self.preprocess()
        self.model.fit(X_train, Y_train, epochs=self.epochs, batch_size=self.batch_size, verbose=2, validation_data=(X_test, Y_test))
        accuracy = self.model.evaluate(X_test, Y_test, verbose=2, batch_size=self.batch_size)
        return accuracy

    def predict(self, review):
        """Extract aspect/opinion pairs from ``review`` and classify each one.

        Args:
            review: Review text; it is passed through ``preprocess_text``.

        Returns:
            A JSON string encoding a list of ``[{aspect: opinion}, sentiment]``
            entries, where sentiment is 'positif' or 'negatif'. The list is
            empty (``"[]"``) when no aspect/opinion pair is found.
        """
        review = preprocess_text(review)
        if self._extractor is None:
            self._extractor = AspectOpinionExtractor()
        aspects, opinions = self._extractor.extract_aspects_and_opinions(review)
        if not aspects:
            # Nothing to classify; avoid calling the network on an empty batch.
            return json.dumps([])
        X = [aspect + ' ' + opinion for aspect, opinion in zip(aspects, opinions)]
        X_seq = self.tokenizer.texts_to_sequences(X)
        # Pad exactly as in preprocess(): same length and the same pad_sequences defaults.
        X_pad = pad_sequences(X_seq, maxlen=self.max_len)
        predictions = self.model.predict(X_pad)
        # NOTE(review): assumes the label encoder mapped 'positif' to 1 — holds when the
        # labels are exactly 'negatif'/'positif'; confirm for other label sets.
        sentiments = ['positif' if prediction[0] > 0.5 else 'negatif' for prediction in predictions]
        result = [({aspect: opinion}, sentiment) for aspect, opinion, sentiment in zip(aspects, opinions, sentiments)]
        result_json = json.dumps(result)
        return result_json

class Predictor:
    """Run aspect-level sentiment prediction with an already-trained model.

    Args:
        model: Object exposing ``predict(array)`` (e.g. a loaded Keras model).
        tokenizer: Fitted tokenizer used at training time. When omitted, a
            fresh *unfitted* tokenizer is created, which encodes every phrase
            as an empty sequence — pass the training tokenizer for meaningful
            predictions.
        max_len: Padded sequence length expected by the model (default 7).
    """

    def __init__(self, model, tokenizer=None, max_len=7):
        self.model = model
        if tokenizer is None:
            tokenizer = create_tokenizer([], max_features=10000)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.aspect_opinion_extractor = AspectOpinionExtractor()

    def _extract_and_encode(self, review):
        """Return ``(aspects, opinions, padded_sequences)`` for ``review``."""
        # Normalise the text, then pull out the aspect/opinion pairs.
        review = preprocess_text(review)
        aspects, opinions = self.aspect_opinion_extractor.extract_aspects_and_opinions(review)
        # One "aspect opinion" phrase per pair, encoded and padded to the model's input length.
        phrases = [aspect + ' ' + opinion for aspect, opinion in zip(aspects, opinions)]
        sequences = self.tokenizer.texts_to_sequences(phrases)
        padded = pad_sequences(sequences, maxlen=self.max_len)
        return aspects, opinions, padded

    def predict(self, review):
        """Classify every aspect/opinion pair found in ``review``.

        Returns:
            A JSON string encoding a list of ``[{aspect: opinion}, sentiment]``
            entries ('positif' / 'negatif'); ``"[]"`` when no pair is found.
        """
        aspects, opinions, X_pad = self._extract_and_encode(review)
        if not aspects:
            # Nothing to classify; avoid calling the model on an empty batch.
            return json.dumps([])

        predictions = self.model.predict(X_pad)

        # Threshold the sigmoid output at 0.5 and pair each label with its aspect/opinion.
        sentiments = ['positif' if prediction > 0.5 else 'negatif' for prediction in predictions]
        result = [({aspect: opinion}, sentiment) for aspect, opinion, sentiment in zip(aspects, opinions, sentiments)]
        return json.dumps(result)

    def preprocess_input_for_tflite(self, input_string):
        """Encode ``input_string`` as a float32 array of shape ``(n, max_len)``.

        ``n`` is the number of aspect/opinion pairs extracted from the input.

        Raises:
            ValueError: If the encoded array does not have ``max_len`` columns.
        """
        _, _, X_pad = self._extract_and_encode(input_string)

        # The TFLite model used in this notebook is fed float32 input.
        X_float32 = np.float32(X_pad)

        # Defensive: collapse an unexpected leading axis so the result is 2-D.
        if len(X_float32.shape) == 3:
            X_float32 = X_float32.reshape((X_float32.shape[1], X_float32.shape[2]))

        if X_float32.shape[1] != self.max_len:
            raise ValueError(f"Invalid shape: {X_float32.shape}. Expected (n, {self.max_len}).")

        return X_float32




In [7]:
import pandas as pd
# Load the review spreadsheet (downloaded as review.xlsx by the gdown cell) and display it.
df = pd.read_excel('review.xlsx')
df

Unnamed: 0,title,text,aspek,opini,sentimen
0,Katenjo Chill Space & kopi Bar,Tempatnya lumayan luas,tempat,luas,positif
1,Katenjo Chill Space & kopi Bar,Lokasi strategis,lokasi,strategis,positif
2,Katenjo Chill Space & kopi Bar,Makanan lumayan enak,makanan,enak,positif
3,Katenjo Chill Space & kopi Bar,Harga cukup terjangkau,harga,terjangkau,positif
4,Katenjo Chill Space & kopi Bar,Pelayan bertanggungjawab,pelayan,bertanggungjawab,positif
...,...,...,...,...,...
53486,Rumah Kopi Baretto,Rahal,,,
53487,Rumah Kopi Baretto,bagus,,,
53488,Rumah Kopi Baretto,Bagus sekali,,,
53489,Rumah Kopi Baretto,berlari,,,


In [8]:
# Keep only the rows that have a value in the 'opini' column, then display the result.
df_new = df.dropna(subset=['opini'])
df_new

Unnamed: 0,title,text,aspek,opini,sentimen
0,Katenjo Chill Space & kopi Bar,Tempatnya lumayan luas,tempat,luas,positif
1,Katenjo Chill Space & kopi Bar,Lokasi strategis,lokasi,strategis,positif
2,Katenjo Chill Space & kopi Bar,Makanan lumayan enak,makanan,enak,positif
3,Katenjo Chill Space & kopi Bar,Harga cukup terjangkau,harga,terjangkau,positif
4,Katenjo Chill Space & kopi Bar,Pelayan bertanggungjawab,pelayan,bertanggungjawab,positif
...,...,...,...,...,...
1355,Toga Peak Cafe,Caffe dengan pemandangan pemandangan kota sume...,pemandangan,cityskype,positif
1356,Toga Peak Cafe,buat yang berkunjung di sumedang silahkan data...,suasana,ciamik,positif
1357,Toga Peak Cafe,"Tempatnya enak, sambil nongkrong, makan, ngemi...",tempat,enak,positif
1358,Toga Peak Cafe,Pelayan nya ga ramah,pelayanan,ga ramah,negatif


In [9]:
# Lower-case every string cell, repair the misspelled label "posiif",
# and discard rows that still contain a missing value.
df_new = (
    df_new
    .applymap(lambda value: value.lower() if type(value) is str else value)
    .assign(sentimen=lambda frame: frame['sentimen'].replace("posiif", "positif"))
    .dropna(axis=0)
)

In [10]:
data_augmented = pd.read_excel('/content/augmented.xlsx')
# Append the augmented rows to the original data, shuffle all rows,
# renumber the index, and drop the columns that are not used downstream.
df_new = (
    pd.concat([df_new, data_augmented])
    .sample(frac=1)
    .reset_index(drop=True)
    .drop(columns=['title', 'Unnamed: 0', 'label'])
)


In [11]:
# Normalise the raw review text (lower-casing + slang replacement via preprocess_text).
df_new['text'] = df_new['text'].astype(str).apply(preprocess_text)

In [12]:
# Display the prepared training frame.
df_new

Unnamed: 0,text,aspek,opini,sentimen
0,sarana ibadah untuk muslim ada mushola yang cu...,musholla,tersedia,positif
1,tempat favorit anak muda nih,tempat,favorit anak muda,positif
2,"penyajian cepat, makanan dan minuman juga enak",menu,enak,positif
3,untuk yang mencari pelarian dari hiruk pikuk k...,suasana,tenang,positif
4,kopi susu nikmat,kopi,nikmat,positif
...,...,...,...,...
1187,kopi susu juara,kopi,juara,positif
1188,"masih kurang space untuk nongki disini, jadi s...",dark belgia choco,enak,positif
1189,kopi terenak menurut saya...,kopi,enak,positif
1190,begitu sampai kita disuguhkan udara yang sejuk...,pemandangan,menyegarkan,positif


In [13]:
# Build the classifier on the prepared frame and train it; train() returns
# the [loss, accuracy] pair measured on the held-out split.
model = SentimentModel(df_new, max_features=10000, embed_dim=480, conv_out=288, batch_size=16, epochs=20)
model.train()

Epoch 1/20
60/60 - 6s - loss: 0.2971 - accuracy: 0.8709 - val_loss: 0.1161 - val_accuracy: 0.9665 - 6s/epoch - 104ms/step
Epoch 2/20
60/60 - 5s - loss: 0.0544 - accuracy: 0.9811 - val_loss: 0.0751 - val_accuracy: 0.9791 - 5s/epoch - 81ms/step
Epoch 3/20
60/60 - 5s - loss: 0.0283 - accuracy: 0.9906 - val_loss: 0.0760 - val_accuracy: 0.9874 - 5s/epoch - 88ms/step
Epoch 4/20
60/60 - 4s - loss: 0.0198 - accuracy: 0.9927 - val_loss: 0.0873 - val_accuracy: 0.9749 - 4s/epoch - 72ms/step
Epoch 5/20
60/60 - 5s - loss: 0.0149 - accuracy: 0.9927 - val_loss: 0.0999 - val_accuracy: 0.9707 - 5s/epoch - 82ms/step
Epoch 6/20
60/60 - 5s - loss: 0.0129 - accuracy: 0.9916 - val_loss: 0.1091 - val_accuracy: 0.9749 - 5s/epoch - 81ms/step
Epoch 7/20
60/60 - 4s - loss: 0.0126 - accuracy: 0.9937 - val_loss: 0.1146 - val_accuracy: 0.9749 - 4s/epoch - 75ms/step
Epoch 8/20
60/60 - 5s - loss: 0.0133 - accuracy: 0.9906 - val_loss: 0.1143 - val_accuracy: 0.9749 - 5s/epoch - 87ms/step
Epoch 9/20
60/60 - 5s - loss: 0

[0.1553228199481964, 0.9748954176902771]

In [18]:
# Example reviews to run through the trained model.
# NOTE(review): the whole list is passed as one argument; preprocess_text converts it
# with str(), so the pipeline sees the list's string representation rather than
# three separate reviews — confirm this is intended.
review =["pelayanan lambat, tapi tempatnya bagus", "tempatnya luas, dan lokasi strategis", "tempatnya enak tapi panas"]
predictions = model.predict(review)
print("Predictions:", predictions)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Loading these models for language: id (Indonesian):
| Processor    | Package      |
-------------------------------
| tokenize     | gsd          |
| mwt          | gsd          |
| pos          | gsd_charlm   |
| lemma        | gsd_nocharlm |
| constituency | icon_charlm  |
| depparse     | gsd_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Predictions: [[{"pelayanan": "lambat"}, "negatif"], [{"tempat": "bagus"}, "positif"], [{"tempatnya": "luas"}, "positif"], [{"lokasi": "strategis"}, "positif"], [{"tempatnya": "enak"}, "positif"], [{"tempatnya": "panas"}, "negatif"]]


In [20]:
# Save the inner Keras network (with optimizer state) to model3.h5.
# NOTE(review): only `model.model` is written here; the fitted tokenizer held on
# `model.tokenizer` is not persisted by this call.
model.model.save("model3.h5", include_optimizer=True)


In [23]:
# `model` is rebound to the loaded Keras network below, so take the fitted tokenizer
# from the SentimentModel instance first. The guard keeps this cell re-runnable once
# `model` no longer carries a `tokenizer` attribute.
if hasattr(model, 'tokenizer'):
    tokenizer = model.tokenizer

# Load the pre-trained model from the same relative path it was saved to
model_path = 'model3.h5'
model = tf.keras.models.load_model(model_path)

# Create an instance of the Predictor class with the model and tokenizer
predictor = Predictor(model, tokenizer)

# Example usage
review = ["pelayanan lambat, tapi tempatnya bagus", "tempatnya luas, dan lokasi strategis", "tempatnya enak tapi panas"]
prediction_result = predictor.predict(review)
print(prediction_result)


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Loading these models for language: id (Indonesian):
| Processor    | Package      |
-------------------------------
| tokenize     | gsd          |
| mwt          | gsd          |
| pos          | gsd_charlm   |
| lemma        | gsd_nocharlm |
| constituency | icon_charlm  |
| depparse     | gsd_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


[[{"pelayanan": "lambat"}, "negatif"], [{"tempat": "bagus"}, "positif"], [{"tempatnya": "luas"}, "positif"], [{"lokasi": "strategis"}, "positif"], [{"tempatnya": "enak"}, "positif"], [{"tempatnya": "panas"}, "negatif"]]


In [24]:
import tensorflow as tf

# Load the pre-trained model from the same relative path it was saved to
model_path = 'model3.h5'
model = tf.keras.models.load_model(model_path)

# Convert the model to TFLite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the TFLite model to a file
with open('model3.tflite', 'wb') as f:
    f.write(tflite_model)


In [25]:
# Load the converted model from the same relative path it was written to
tflite_model_path = 'model3.tflite'
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Tensor metadata needed to feed the input and read the output
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Create an instance of the Predictor class
# NOTE(review): `tokenizer` must already be defined by an earlier cell; only the
# Predictor's preprocessing (not `model`) is used below.
predictor = Predictor(model, tokenizer)

# Example usage for the TFLite model: one encoded row per extracted aspect/opinion pair
input_strings = ["pelayanan lambat, tapi tempatnya bagus", "tempatnya luas, dan lokasi strategis", "tempatnya enak tapi panas"]
tflite_input_data = predictor.preprocess_input_for_tflite(input_strings)

for i in range(tflite_input_data.shape[0]):
    # Set the input tensor on the TFLite interpreter (one row at a time, shape (1, n_cols))
    interpreter.set_tensor(input_details[0]['index'], tflite_input_data[i:i+1])

    # Run inference
    interpreter.invoke()

    # Fetch the output tensor
    output_data = interpreter.get_tensor(output_details[0]['index'])
    print(f"Prediction Result for input {i+1}:", output_data)


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Loading these models for language: id (Indonesian):
| Processor    | Package      |
-------------------------------
| tokenize     | gsd          |
| mwt          | gsd          |
| pos          | gsd_charlm   |
| lemma        | gsd_nocharlm |
| constituency | icon_charlm  |
| depparse     | gsd_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Prediction Result for input 1: [[0.00021348]]
Prediction Result for input 2: [[1.]]
Prediction Result for input 3: [[0.9999999]]
Prediction Result for input 4: [[0.9999999]]
Prediction Result for input 5: [[1.]]
Prediction Result for input 6: [[4.884716e-05]]


In [None]:
# Print the layer-by-layer summary of the Keras model currently bound to `model`.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7, 480)            4800000   
                                                                 
 conv1d (Conv1D)             (None, 5, 288)            415008    
                                                                 
 global_max_pooling1d (Glob  (None, 288)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 256)               73984     
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 5289249 (20.18 MB)
Trainable params: 5289249 (20.18 MB)
Non-trainable params: 0 (0.00 Byte)
________________