In [44]:
!pip install gdown



In [45]:
pip install stanza



In [46]:
!gdown --id '1LgI2Kh4XfwCf_9FtHRS51nmdMx0sfvhd'

Downloading...
From: https://drive.google.com/uc?id=1LgI2Kh4XfwCf_9FtHRS51nmdMx0sfvhd
To: /content/review.xlsx
100% 2.78M/2.78M [00:00<00:00, 35.6MB/s]


In [47]:
import pandas as pd
import stanza
import string
import re
import json
import tensorflow as tf
from pathlib import Path
from collections import defaultdict
from keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences

In [48]:
import os

# Expose no CUDA devices to libraries that read this variable.
# NOTE(review): tensorflow is already imported in the previous cell; confirm
# the setting is still honoured when applied after the import.
os.environ.update(CUDA_VISIBLE_DEVICES='-1')

In [49]:
def pad_or_truncate_sequences(sequences, maxlen):
    """Force every sequence to length ``maxlen``.

    Delegates to keras ``pad_sequences``; both the padding and the truncation
    are applied at the end of each sequence ('post').
    """
    tail_options = {'padding': 'post', 'truncating': 'post'}
    return pad_sequences(sequences, maxlen=maxlen, **tail_options)

class AspectOpinionExtractor:
    """Extract (aspect, opinion) pairs from text using a Stanza 'id' pipeline.

    An aspect is a NOUN or PRON word (other than 'sayang'), merged with the
    next word when that word is a NOUN. An opinion is an ADJ whose dependency
    head is the aspect's first word or the word right after the aspect span.
    """

    def __init__(self):
        # Default Stanza pipeline for language code 'id'.
        self.nlp = stanza.Pipeline('id')

    @staticmethod
    def _opinion_phrase(sentence, adjective):
        """Return the adjective text, prefixed with 'tidak ' if a 'tidak' word depends on it."""
        adjective_id = int(adjective.id)
        negated = any(
            modifier.head == adjective_id and modifier.text.lower() == 'tidak'
            for modifier in sentence.words
        )
        return 'tidak ' + adjective.text if negated else adjective.text

    def extract_aspects_and_opinions(self, text):
        """Parse ``text`` and return two parallel lists: aspects and opinions.

        Returns:
            (aspects, opinions) where ``aspects[k]`` belongs to ``opinions[k]``.
        """
        parsed = self.nlp(text)
        aspects, opinions = [], []
        for sentence in parsed.sentences:
            tokens = sentence.words
            total = len(tokens)
            cursor = 0
            while cursor < total:
                candidate = tokens[cursor]
                cursor += 1  # cursor now indexes the word right after the candidate
                if candidate.upos not in ('NOUN', 'PRON'):
                    continue
                if candidate.text.lower() == 'sayang':  # 'sayang' is never an aspect
                    continue

                aspect = candidate.text
                # Merge a directly following noun into a two-word aspect and
                # skip over it so it is not visited as an aspect of its own.
                if cursor < total and tokens[cursor].upos == 'NOUN':
                    aspect = aspect + ' ' + tokens[cursor].text
                    cursor += 1

                # Word immediately after the aspect span (None at sentence end).
                # NOTE(review): after a compound merge this is the word AFTER
                # the second noun, not the second noun itself — confirm intent.
                follower = tokens[cursor] if cursor < total else None

                for child in tokens:
                    attached = child.head == int(candidate.id) or (
                        follower is not None and child.head == int(follower.id)
                    )
                    if attached and child.upos == 'ADJ':  # only adjectives count as opinions
                        aspects.append(aspect)
                        opinions.append(self._opinion_phrase(sentence, child))
        return aspects, opinions

# Parsed slang lexicons keyed by file path, so the JSON file is read once per
# path instead of once per call (this function is applied row by row).
_SLANG_LEXICON_CACHE = {}


def _load_slang_lexicon(slang_path):
    """Return the mapping stored as JSON at ``slang_path``, loading it at most once."""
    if slang_path not in _SLANG_LEXICON_CACHE:
        with open(slang_path, 'r', encoding='utf-8') as f:
            _SLANG_LEXICON_CACHE[slang_path] = json.load(f)
    return _SLANG_LEXICON_CACHE[slang_path]


def preprocess_text(text, slang_path='/content/_json_colloquial-indonesian-lexicon.txt'):
    """Lower-case ``text`` and replace every word found in the slang lexicon.

    Args:
        text: value to normalise; it is converted with ``str()`` first, so
            non-string inputs are accepted.
        slang_path: path of a JSON file holding a ``{word: replacement}``
            mapping. Defaults to the path used throughout this notebook.

    Returns:
        The whitespace-tokenised words, mapped through the lexicon (unknown
        words are kept) and re-joined with single spaces.

    Raises:
        OSError / json.JSONDecodeError: if the lexicon cannot be read or
            parsed on first use.
    """
    slang_dict = _load_slang_lexicon(slang_path)
    words = str(text).lower().split()
    return ' '.join(slang_dict.get(word, word) for word in words)
class SentimentModel(tf.keras.Model):
    """CNN sentiment classifier trained on "<aspek> <opini>" phrases.

    Wraps a compiled ``tf.keras.Sequential`` network (``self.model``) together
    with the tokenizer and the padded sequence length used for training, so
    that ``predict`` encodes new phrases exactly like the training data.

    Args:
        df: DataFrame with ``aspek``, ``opini`` and ``sentimen`` columns.
        max_features: vocabulary size for the tokenizer and embedding layer.
        embed_dim: embedding dimension.
        conv_out: number of Conv1D filters.
        batch_size: batch size used by ``fit`` and ``evaluate``.
        epochs: number of training epochs.
        random_state: optional seed forwarded to ``train_test_split``; the
            default ``None`` keeps the split random.
    """

    def __init__(self, df, max_features, embed_dim, conv_out, batch_size, epochs, random_state=None):
        super(SentimentModel, self).__init__()
        self.df = df
        self.tokenizer = Tokenizer(num_words=max_features, split=' ')
        self.max_features = max_features
        self.embed_dim = embed_dim
        self.conv_out = conv_out
        self.batch_size = batch_size
        self.epochs = epochs
        self.random_state = random_state
        self.max_len = None            # padded sequence length, set by preprocess()
        self._cached_split = None      # (X_train, X_test, Y_train, Y_test), built once
        self._aspect_extractor = None  # created on the first predict() call
        self.model = self.build_model()

    def get_config(self):
        """Return the constructor arguments as a dict."""
        return {"df": self.df,
                "max_features": self.max_features,
                "embed_dim": self.embed_dim,
                "conv_out": self.conv_out,
                "batch_size": self.batch_size,
                "epochs": self.epochs,
                "random_state": self.random_state}

    def _get_split(self):
        """Return the train/test split, running ``preprocess`` only the first time.

        Sharing one split between ``build_model`` and ``train`` avoids fitting
        the tokenizer twice and stops repeated ``train()`` calls from
        validating on rows a previous call trained on.
        """
        if self._cached_split is None:
            # Stored as a tuple of arrays rather than the list returned by
            # train_test_split.
            self._cached_split = tuple(self.preprocess())
        return self._cached_split

    def build_model(self):
        """Build and compile the Embedding -> Conv1D -> max-pool -> Dense network."""
        seq_len = self._get_split()[0].shape[1]  # use the actual padded input length
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(self.max_features, self.embed_dim, input_length=seq_len),
            tf.keras.layers.Conv1D(self.conv_out, 3, activation='relu'),
            tf.keras.layers.GlobalMaxPooling1D(),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')  # single sigmoid unit: binary output
        ])
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def preprocess(self):
        """Tokenise the training phrases and split them into train/test sets.

        Fits the tokenizer, records the padded length in ``self.max_len`` and
        returns ``train_test_split(X, Y, test_size=0.2)``.

        NOTE(review): assumes ``sentimen`` holds exactly the two labels
        'negatif' and 'positif' so that the encoded labels are 0/1 — confirm.
        """
        X = self.df['aspek'] + ' ' + self.df['opini']
        X = [preprocess_text(text) for text in X]
        Y = LabelEncoder().fit_transform(self.df['sentimen']).reshape(-1, 1)
        self.tokenizer.fit_on_texts(X)
        X = self.tokenizer.texts_to_sequences(X)
        # Same helper (post padding/truncation) as predict(), so training and
        # inference inputs are laid out identically.
        X = pad_or_truncate_sequences(X, maxlen=None)
        self.max_len = X.shape[1]
        return train_test_split(X, Y, test_size=0.2, random_state=self.random_state)

    def train(self):
        """Fit the network and return ``evaluate``'s result on the test split."""
        X_train, X_test, Y_train, Y_test = self._get_split()
        self.model.fit(X_train, Y_train, epochs=self.epochs, batch_size=self.batch_size,
                       verbose=2, validation_data=(X_test, Y_test))
        accuracy = self.model.evaluate(X_test, Y_test, verbose=2, batch_size=self.batch_size)
        return accuracy

    def predict(self, review):
        """Extract aspect/opinion pairs from ``review`` and classify each pair.

        Args:
            review: one review, or a list/tuple of reviews. Each review is
                normalised and parsed separately (a list is no longer turned
                into its Python ``repr`` before parsing).

        Returns:
            A JSON string encoding a list of ``[{aspect: opinion}, sentiment]``
            entries, ``sentiment`` being 'positif' or 'negatif'. The list is
            empty when no pair is extracted.
        """
        reviews = list(review) if isinstance(review, (list, tuple)) else [review]

        # Building the Stanza pipeline is expensive; do it once per model.
        if self._aspect_extractor is None:
            self._aspect_extractor = AspectOpinionExtractor()

        aspects, opinions = [], []
        for item in reviews:
            found_aspects, found_opinions = self._aspect_extractor.extract_aspects_and_opinions(
                preprocess_text(item))
            aspects.extend(found_aspects)
            opinions.extend(found_opinions)

        # Nothing to classify: skip the network, which cannot run on an empty batch.
        if not aspects:
            return json.dumps([])

        X = [aspect + ' ' + opinion for aspect, opinion in zip(aspects, opinions)]
        X_seq = self.tokenizer.texts_to_sequences(X)
        X_pad = pad_or_truncate_sequences(X_seq, maxlen=self.max_len)  # same length as training
        predictions = self.model.predict(X_pad)
        sentiments = ['positif' if prediction > 0.5 else 'negatif' for prediction in predictions]
        result = [({aspect: opinion}, sentiment)
                  for aspect, opinion, sentiment in zip(aspects, opinions, sentiments)]
        result_json = json.dumps(result)
        return result_json


In [50]:
import pandas as pd

# Read the downloaded workbook from the current working directory and show it.
df = pd.read_excel(io='review.xlsx')
df

Unnamed: 0,title,text,aspek,opini,sentimen
0,Katenjo Chill Space & kopi Bar,Tempatnya lumayan luas,tempat,luas,positif
1,Katenjo Chill Space & kopi Bar,Lokasi strategis,lokasi,strategis,positif
2,Katenjo Chill Space & kopi Bar,Makanan lumayan enak,makanan,enak,positif
3,Katenjo Chill Space & kopi Bar,Harga cukup terjangkau,harga,terjangkau,positif
4,Katenjo Chill Space & kopi Bar,Pelayan bertanggungjawab,pelayan,bertanggungjawab,positif
...,...,...,...,...,...
53486,Rumah Kopi Baretto,Rahal,,,
53487,Rumah Kopi Baretto,bagus,,,
53488,Rumah Kopi Baretto,Bagus sekali,,,
53489,Rumah Kopi Baretto,berlari,,,


In [51]:
# Keep only the rows that carry an opinion annotation.
has_opinion = df['opini'].notna()
df_new = df.loc[has_opinion]
df_new

Unnamed: 0,title,text,aspek,opini,sentimen
0,Katenjo Chill Space & kopi Bar,Tempatnya lumayan luas,tempat,luas,positif
1,Katenjo Chill Space & kopi Bar,Lokasi strategis,lokasi,strategis,positif
2,Katenjo Chill Space & kopi Bar,Makanan lumayan enak,makanan,enak,positif
3,Katenjo Chill Space & kopi Bar,Harga cukup terjangkau,harga,terjangkau,positif
4,Katenjo Chill Space & kopi Bar,Pelayan bertanggungjawab,pelayan,bertanggungjawab,positif
...,...,...,...,...,...
1355,Toga Peak Cafe,Caffe dengan pemandangan pemandangan kota sume...,pemandangan,cityskype,positif
1356,Toga Peak Cafe,buat yang berkunjung di sumedang silahkan data...,suasana,ciamik,positif
1357,Toga Peak Cafe,"Tempatnya enak, sambil nongkrong, makan, ngemi...",tempat,enak,positif
1358,Toga Peak Cafe,Pelayan nya ga ramah,pelayanan,ga ramah,negatif


In [52]:
def _lower_if_str(value):
    """Lower-case plain ``str`` cells; leave every other value untouched."""
    return value.lower() if type(value) == str else value


# Lower-case all text cells, repair the misspelt label, then drop every row
# that still has a missing value.
df_new = (
    df_new
    .applymap(_lower_if_str)
    .assign(sentimen=lambda frame: frame['sentimen'].replace("posiif", "positif"))
    .dropna(axis=0)
)

In [53]:
data_augmented = pd.read_excel('/content/augmented.xlsx')
# Append the augmented rows to the original data, shuffle with a fixed seed so
# a rerun yields the same row order, and drop the columns the model does not use.
# NOTE(review): re-running this cell without re-running the cells above appends
# the augmented rows a second time.
df_new = (
    pd.concat([df_new, data_augmented])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(columns=['title', 'Unnamed: 0', 'label'])
)


In [54]:
# Run every review text (as a string) through the slang normaliser.
normalised_text = df_new['text'].astype(str).map(preprocess_text)
df_new['text'] = normalised_text

In [55]:
# Display the frame that is passed to SentimentModel in the next cell.
df_new

Unnamed: 0,text,aspek,opini,sentimen
0,makanannya murah dan enak pelayanannya baik,makanan,enak,positif
1,kopi susu paling enak di sumedang.,kopi susu,enak,positif
2,tempat yang nyaman untuk ngopi,tempat,nyaman,positif
3,"sukajaya, kec. sumedang sel., kabupaten sumeda...",menu,terjangkau,positif
4,"suasana adem, nyaman untuk bersantai dan berca...",suasana,adem,positif
...,...,...,...,...
1187,lokasi mudah ditemukan,kopi,mudah ditemukan,positif
1188,tempat ngopi yang cocok jika lepas dari hiruk ...,suasana,mantep poll,positif
1189,"kopi nya asli mantap pisan, kuat di pakai bega...",kopi,mantap,positif
1190,"kamar bersih, pencahayaan bagus, tempat duduk ...",tempatduduk,nyaman,positif


In [56]:
# Hyper-parameters collected in one place so they are easy to tune.
hyperparams = dict(
    max_features=10000,
    embed_dim=480,
    conv_out=288,
    batch_size=16,
    epochs=20,
)
model = SentimentModel(df_new, **hyperparams)
# The value train() returns is shown as the cell output.
model.train()

Epoch 1/20
60/60 - 7s - loss: 0.2671 - accuracy: 0.8772 - val_loss: 0.1061 - val_accuracy: 0.9749 - 7s/epoch - 119ms/step
Epoch 2/20
60/60 - 7s - loss: 0.0574 - accuracy: 0.9811 - val_loss: 0.0968 - val_accuracy: 0.9749 - 7s/epoch - 118ms/step
Epoch 3/20
60/60 - 5s - loss: 0.0298 - accuracy: 0.9927 - val_loss: 0.0890 - val_accuracy: 0.9707 - 5s/epoch - 83ms/step
Epoch 4/20
60/60 - 6s - loss: 0.0259 - accuracy: 0.9916 - val_loss: 0.0928 - val_accuracy: 0.9623 - 6s/epoch - 97ms/step
Epoch 5/20
60/60 - 6s - loss: 0.0248 - accuracy: 0.9895 - val_loss: 0.0871 - val_accuracy: 0.9707 - 6s/epoch - 102ms/step
Epoch 6/20
60/60 - 6s - loss: 0.0181 - accuracy: 0.9937 - val_loss: 0.0865 - val_accuracy: 0.9791 - 6s/epoch - 100ms/step
Epoch 7/20
60/60 - 6s - loss: 0.0159 - accuracy: 0.9906 - val_loss: 0.0850 - val_accuracy: 0.9749 - 6s/epoch - 103ms/step
Epoch 8/20
60/60 - 6s - loss: 0.0142 - accuracy: 0.9937 - val_loss: 0.0868 - val_accuracy: 0.9749 - 6s/epoch - 105ms/step
Epoch 9/20
60/60 - 5s - lo

[0.09770534932613373, 0.9790794849395752]

In [57]:
export_dir = '/content/drive/MyDrive/Capstone'
# Export the trained Keras network itself (model.model), the same object the
# TFLite conversion below uses. The SentimentModel wrapper defines no call(),
# so saving the wrapper does not give a model that can be invoked on inputs.
tf.saved_model.save(model.model, export_dir=export_dir)



In [58]:
# Convert the trained Keras network held by the wrapper to a TFLite flatbuffer.
keras_network = model.model
converter = tf.lite.TFLiteConverter.from_keras_model(keras_network)
tflite_model = converter.convert()

In [59]:
tflite_model_file = Path('/content/drive/MyDrive/Capstone/model.tflite')
# write_bytes does not create missing directories; make sure the target folder
# exists so this cell also works when the SavedModel export cell was skipped.
tflite_model_file.parent.mkdir(parents=True, exist_ok=True)
# The number of bytes written is shown as the cell output.
tflite_model_file.write_bytes(tflite_model)

21160804

In [60]:
# Sample reviews for a quick check of the trained model.
review = [
    "makanannya kotor, tempat mantap",
    "lokasi sempit",
    "tempatnya nyaman enak makan disana, pelayan tidak ramah",
]
predictions = model.predict(review)
print("Predictions:", predictions)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Loading these models for language: id (Indonesian):
| Processor    | Package      |
-------------------------------
| tokenize     | gsd          |
| mwt          | gsd          |
| pos          | gsd_charlm   |
| lemma        | gsd_nocharlm |
| constituency | icon_charlm  |
| depparse     | gsd_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Predictions: [[{"makanan": "kotor"}, "negatif"], [{"tempat": "mantap"}, "positif"], [{"lokasi": "sempit"}, "negatif"], [{"tempatnya": "nyaman"}, "positif"], [{"tempatnya": "enak"}, "positif"], [{"pelayan": "tidak ramah"}, "positif"]]
