<a href="https://colab.research.google.com/github/Alanjamlu34/Web-Scraping-IMDB-Movie-Reviews/blob/main/Analisis_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read Dataset, Preprocess Text, and Train the Model

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Load the data from the specified path
data_path = "/content/data.csv"
df = pd.read_csv(data_path)

# Clear previous models and sessions
tf.keras.backend.clear_session()

# Mapping dictionary for Stars column:
# 0-3 -> 'buruk' (bad), 4-7 -> 'netral' (neutral), 8-10 -> 'bagus' (good)
stars_mapping = {
    0: 'buruk', 1: 'buruk', 2: 'buruk', 3: 'buruk',
    4: 'netral', 5: 'netral', 6: 'netral', 7: 'netral',
    8: 'bagus', 9: 'bagus', 10: 'bagus'
}

# Map the Stars column to categorical labels
df['Label'] = df['Stars'].map(stars_mapping)

# Series.map yields NaN for any rating that is missing or not a key of
# stars_mapping; such rows would raise KeyError in the label lookup below,
# and rows without review text would break the string preprocessing.
# Drop them up front (a no-op when the data is complete).
df = df.dropna(subset=['Review', 'Label'])

# Split the data into training and testing sets (80/20, fixed seed)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize sentences and labels lists for training
training_sentences = train_df['Review'].tolist()
training_labels = train_df['Label'].tolist()

# Initialize sentences and labels lists for testing
testing_sentences = test_df['Review'].tolist()
testing_labels = test_df['Label'].tolist()

# Encode the string labels as integer class ids and convert to numpy arrays
label_mapping = {'buruk': 0, 'netral': 1, 'bagus': 2}
training_labels_final = np.array([label_mapping[label] for label in training_labels])
testing_labels_final = np.array([label_mapping[label] for label in testing_labels])

# Stopwords used by remove_stopwords. Built once as a frozenset so the
# collection is not re-created on every call and membership tests are O(1).
_STOPWORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did",
    "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have",
    "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
    "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's",
    "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only",
    "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
    "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
    "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're",
    "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we",
    "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
    "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
    "you're", "you've", "your", "yours", "yourself", "yourselves",
])


# Function to remove stopwords
def remove_stopwords(sentence):
    """Return `sentence` lower-cased with English stopwords removed.

    The sentence is lower-cased and split on whitespace; every token that
    exactly matches an entry of the stopword set is dropped and the remaining
    tokens are re-joined with single spaces.

    Tokens are compared as-is, so a stopword with punctuation attached
    (e.g. "the,") is kept.

    Args:
        sentence: The text to clean (a string).

    Returns:
        The cleaned, lower-cased string; empty if every token was a stopword
        or the input was empty.
    """
    words = sentence.lower().split()
    return " ".join(w for w in words if w not in _STOPWORDS)

# Strip stopwords from every review in both splits
training_sentences = list(map(remove_stopwords, training_sentences))
testing_sentences = list(map(remove_stopwords, testing_sentences))

# Hyperparameters for tokenisation and the embedding layer
vocab_size = 10000      # passed to the Tokenizer as num_words
max_length = 100        # every sequence is padded/truncated to this length
embedding_dim = 2000    # size of each word vector
trunc_type = 'post'     # overly long reviews are cut at the end
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words

# Fit the tokenizer on the training text only, then expose its word index
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index


def _to_padded(texts):
    """Encode `texts` with the fitted tokenizer; return (sequences, padded array)."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, truncating=trunc_type, maxlen=max_length)
    return sequences, padded


# Integer-encode and pad the training and test reviews
train_sequences, train_padded = _to_padded(training_sentences)
test_sequences, test_padded = _to_padded(testing_sentences)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs (some GPU ops may still be
# nondeterministic)
tf.keras.utils.set_random_seed(42)

# Build the model
model = tf.keras.Sequential([
    # Declare the input shape explicitly instead of via Embedding's
    # `input_length` argument, which newer Keras releases deprecate; this
    # also keeps the model built so summary() can report output shapes
    tf.keras.Input(shape=(max_length,), dtype='int32'),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Average the word vectors over the sequence axis
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')  # 3 neurons for 'buruk', 'netral', 'bagus'
])

# Setup the training parameters; labels are integer class ids, hence the
# sparse variants of the loss and accuracy metric
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['sparse_categorical_accuracy'])

# Print the model summary
model.summary()

# Train the model. The History object is kept so the loss/accuracy curves can
# be inspected later instead of being shown as a bare object repr.
history = model.fit(train_padded, training_labels_final, epochs=10,
                    validation_data=(test_padded, testing_labels_final), verbose=2)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 2000)         20000000  
                                                                 
 global_average_pooling1d (  (None, 2000)              0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 6)                 12006     
                                                                 
 dense_1 (Dense)             (None, 3)                 21        
                                                                 
Total params: 20012027 (76.34 MB)
Trainable params: 20012027 (76.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
75/75 - 17s - loss: 0.5866 - sparse_categorical_accuracy: 0.8561 - va

<keras.src.callbacks.History at 0x7b35ee362350>

# Visualize Word Embeddings

In [2]:
# The embedding is the first layer of the model
embedding_layer = model.layers[0]

# get_weights() returns a list of arrays; its first entry is used as the
# embedding matrix
layer_weights = embedding_layer.get_weights()
embedding_weights = layer_weights[0]

# Show the matrix dimensions. Expected is (vocab_size, embedding_dim)
print(embedding_weights.shape)

(10000, 2000)


In [3]:
# Get the index-word dictionary: the tokenizer's mapping from integer token id
# back to the word it encodes (the inverse of tokenizer.word_index).
# NOTE(review): presumably this covers every fitted word, not just the top
# `vocab_size` entries — confirm before relying on its length.
reverse_word_index = tokenizer.index_word

In [4]:
import io

# Export only indices that exist in both the embedding matrix and the
# tokenizer's index->word map. Word indices start at 1, so the highest valid
# one is len(reverse_word_index); without this bound a corpus with fewer than
# vocab_size - 1 distinct words raises KeyError part-way through the export.
last_index = min(vocab_size, embedding_weights.shape[0], len(reverse_word_index) + 1)

# Open writeable files; the context manager closes both even if a write fails
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:

  # Initialize the loop. Start counting at `1` because `0` is just for the padding
  for word_num in range(1, last_index):

    # Get the word associated at the current index
    word_name = reverse_word_index[word_num]

    # Get the embedding weights associated with the current index
    word_embedding = embedding_weights[word_num]

    # Write the word name (one word per line in meta.tsv)
    out_m.write(word_name + "\n")

    # Write the word embedding as one tab-separated row in vecs.tsv
    out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

In [5]:
# Offer both TSV files as downloads when running inside Colab; elsewhere the
# import fails and nothing is downloaded
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>