<a href="https://colab.research.google.com/github/Alanjamlu34/Web-Scraping-IMDB-Movie-Reviews/blob/main/Analisis_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Read Dataset

In [19]:
# Load the review dataset from a CSV file into a DataFrame.
# NOTE(review): "/content/..." is an absolute path that presumably only exists
# inside a Colab runtime — confirm or adjust before running elsewhere.
data_path = "/content/data.csv"
df = pd.read_csv(data_path)
# Show the first rows as the cell output for a quick sanity check.
df.head()

Unnamed: 0,Date,overview,Stars,Review
0,25 May 2005,Life's Lessons in one Movie...,10.0,When I first saw this movie I didn't appreciat...
1,8 April 2002,A beautiful fable for now and the future,10.0,I believe everyone has a right to their opinio...
2,10 June 2002,Let's see the world through the eyes of Forres...,10.0,"""I've made about 20 films and 5 of them are pr..."
3,14 May 2007,"In my opinion, no film has touched me more tha...",10.0,"Quite simply, the greatest film ever made.Humo..."
4,20 June 2005,The zen of Forrest Gump.,10.0,This is a powerful yet charming movie; fun for...


# Preprocessing Data

## Mapping Data

In [3]:
# Bucket the star ratings into three sentiment classes:
#   0-3  -> 'buruk'  (bad)
#   4-7  -> 'netral' (neutral)
#   8-10 -> 'bagus'  (good)
stars_mapping = {
    **dict.fromkeys(range(0, 4), 'buruk'),
    **dict.fromkeys(range(4, 8), 'netral'),
    **dict.fromkeys(range(8, 11), 'bagus'),
}

# Store the sentiment class of every review in a new 'Label' column
df['Label'] = df['Stars'].map(stars_mapping)

## Split Dataset

In [4]:

# Keep only the rows that can actually be used for training. A missing 'Review'
# has no text to clean or tokenize, and a missing 'Label' (a 'Stars' value that
# is NaN or absent from stars_mapping) would raise a KeyError in the label
# encoding below. When nothing is missing every row is kept, so the split is
# exactly the same as splitting df directly.
labeled_df = df.dropna(subset=['Review', 'Label'])

# Split the data into training (80%) and testing (20%) sets with a fixed seed
train_df, test_df = train_test_split(labeled_df, test_size=0.2, random_state=42)

# Initialize sentences and labels lists for training
training_sentences = train_df['Review'].tolist()
training_labels = train_df['Label'].tolist()

# Initialize sentences and labels lists for testing
testing_sentences = test_df['Review'].tolist()
testing_labels = test_df['Label'].tolist()

# Encode the string labels as integer class ids and convert to numpy arrays
label_mapping = {'buruk': 0, 'netral': 1, 'bagus': 2}
training_labels_final = np.array([label_mapping[label] for label in training_labels])
testing_labels_final = np.array([label_mapping[label] for label in testing_labels])

## Remove Stopwords

In [5]:
# English stopwords stripped from the reviews. Built once as a frozenset so that
# each membership test is a hash lookup instead of a scan over a list that is
# re-created on every call.
_STOPWORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did",
    "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have",
    "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
    "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's",
    "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only",
    "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
    "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
    "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're",
    "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we",
    "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
    "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
    "you're", "you've", "your", "yours", "yourself", "yourselves",
])


def remove_stopwords(sentence):
    """Lowercase `sentence` and drop every whitespace-separated stopword.

    Args:
        sentence: The text to clean (a string).

    Returns:
        The lowercased text with all stopwords removed and the remaining words
        joined by single spaces. Words are compared exactly after splitting on
        whitespace, so a stopword with punctuation attached (e.g. "the,") is
        kept as-is.
    """
    kept_words = [word for word in sentence.lower().split() if word not in _STOPWORDS]
    return " ".join(kept_words)

# Strip stopwords from every training and testing review
training_sentences = list(map(remove_stopwords, training_sentences))
testing_sentences = list(map(remove_stopwords, testing_sentences))

## Set Parameter

In [6]:
# Hyperparameters shared by the tokenizer, the padding step and the model
vocab_size = 10_000      # passed to the Tokenizer (num_words) and the Embedding layer
max_length = 100         # length every sequence is padded/truncated to
embedding_dim = 2_000    # size of each word vector in the Embedding layer
oov_tok = "<OOV>"        # token the Tokenizer uses for out-of-vocabulary words
trunc_type = 'post'      # truncate over-long sequences at the end

# Tokenization and Padding

In [7]:
# Reset Keras' global state so that models and layers left over from earlier
# runs of this notebook do not linger in the session
tf.keras.backend.clear_session()

In [8]:
# Build the tokenizer and learn the vocabulary from the training sentences only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# Word -> index dictionary produced by fitting on the training sentences
word_index = tokenizer.word_index


def _tokenize_and_pad(sentences):
    """Return (sequences, padded) for `sentences` using the fitted tokenizer.

    `sequences` are the integer-encoded sentences; `padded` is the same data
    padded/truncated to `max_length`, truncating according to `trunc_type`.
    """
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
    return sequences, padded


# Encode both splits with the same tokenizer and padding settings
train_sequences, train_padded = _tokenize_and_pad(training_sentences)
test_sequences, test_padded = _tokenize_and_pad(testing_sentences)

1. Inisialisasi Tokenizer
 - Tokenizer: Menginisialisasi kelas Tokenizer dari Keras untuk mempersiapkan teks agar dapat diproses oleh model.
 - num_words: Menentukan jumlah maksimum kata yang akan dipertimbangkan dalam tokenizer berdasarkan frekuensi.
 - oov_token: Menentukan token pengganti untuk kata-kata yang berada di luar kosakata (out-of-vocabulary), yaitu kata yang tidak termasuk dalam num_words kata paling sering muncul.

2. Membuat Kamus Kata untuk Kalimat Pelatihan
 - fit_on_texts: Melatih tokenizer pada kalimat pelatihan untuk membuat kamus kata.
 - word_index: Kamus kata yang dihasilkan, di mana setiap kata diberikan indeks unik.

3. Membuat dan Mengisi Sequence Pelatihan
 - texts_to_sequences: Mengonversi kalimat pelatihan menjadi urutan indeks berdasarkan kamus kata (word_index).
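 - pad_sequences: Mengisi (padding) urutan indeks agar memiliki panjang yang sama (max_length). Parameter truncating menentukan cara memotong urutan yang lebih panjang dari max_length; di sini truncating='post' memotong dari akhir urutan, sedangkan padding memakai nilai bawaan ('pre'), yaitu menambahkan nol di awal urutan.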

4. Membuat dan Mengisi Sequence Pengujian
 - texts_to_sequences: Mengonversi kalimat pengujian menjadi urutan indeks seperti pada kalimat pelatihan.
 - pad_sequences: Mengisi urutan indeks untuk kalimat pengujian agar memiliki panjang yang sama (max_length).



# Build Model

In [9]:
# Assemble the classifier one layer at a time
model = tf.keras.Sequential()

# Map each of the max_length word indices to a trainable embedding_dim-sized vector
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))

# Average the word vectors over the sequence into a single vector per review
model.add(tf.keras.layers.GlobalAveragePooling1D())

# Small hidden layer followed by the output layer
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='softmax'))  # 3 neurons for 'buruk', 'netral', 'bagus'


# Compile and Train Model

In [10]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once val_loss has gone 10 epochs without improving.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights of the last epoch that ran,
# i.e. 10 epochs past its best validation loss.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [11]:
# Configure training. The labels are integer class ids, hence the "sparse"
# variants of the categorical loss and accuracy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# Show the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 2000)         20000000  
                                                                 
 global_average_pooling1d (  (None, 2000)              0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 6)                 12006     
                                                                 
 dense_1 (Dense)             (None, 3)                 21        
                                                                 
Total params: 20012027 (76.34 MB)
Trainable params: 20012027 (76.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


- tf.keras.Sequential: Menginisialisasi model Sequential dari Keras yang memungkinkan pembuatan layer secara berurutan.
- Embedding: Layer embedding yang mengonversi indeks kata menjadi vektor dengan dimensi tertentu (embedding_dim). input_length adalah panjang maksimum urutan kata.
- GlobalAveragePooling1D: Layer pooling yang mengurangi dimensi dengan mengambil rata-rata dari semua vektor dalam urutan, menghasilkan satu vektor per urutan.
- Dense: Layer Dense dengan 6 neuron dan fungsi aktivasi relu, untuk menambah kompleksitas dan kemampuan belajar non-linear.
- Dense: Layer Dense dengan 3 neuron dan fungsi aktivasi softmax, untuk menghasilkan probabilitas untuk 3 kelas output ('buruk', 'netral', 'bagus').
- Model ini dirancang untuk klasifikasi teks ke dalam tiga kategori dengan menggunakan embedding kata dan layer pooling untuk mengurangi dimensi sebelum klasifikasi.

In [12]:
# Train for at most 100 epochs; the early_stop callback may end training sooner.
# NOTE(review): the test split doubles as the validation data here, so the
# early-stopping decision is made on the same data later used for evaluation.
model.fit(
    train_padded,
    training_labels_final,
    epochs=100,
    validation_data=(test_padded, testing_labels_final),
    callbacks=[early_stop],
    verbose=2,
)

Epoch 1/100
75/75 - 17s - loss: 0.5596 - sparse_categorical_accuracy: 0.8590 - val_loss: 0.4315 - val_sparse_categorical_accuracy: 0.8737 - 17s/epoch - 229ms/step
Epoch 2/100
75/75 - 9s - loss: 0.3992 - sparse_categorical_accuracy: 0.8708 - val_loss: 0.4028 - val_sparse_categorical_accuracy: 0.8721 - 9s/epoch - 121ms/step
Epoch 3/100
75/75 - 6s - loss: 0.3340 - sparse_categorical_accuracy: 0.8729 - val_loss: 0.3758 - val_sparse_categorical_accuracy: 0.8704 - 6s/epoch - 84ms/step
Epoch 4/100
75/75 - 6s - loss: 0.2589 - sparse_categorical_accuracy: 0.9108 - val_loss: 0.3687 - val_sparse_categorical_accuracy: 0.8670 - 6s/epoch - 76ms/step
Epoch 5/100
75/75 - 5s - loss: 0.1777 - sparse_categorical_accuracy: 0.9369 - val_loss: 0.3873 - val_sparse_categorical_accuracy: 0.8687 - 5s/epoch - 70ms/step
Epoch 6/100
75/75 - 5s - loss: 0.1262 - sparse_categorical_accuracy: 0.9630 - val_loss: 0.4446 - val_sparse_categorical_accuracy: 0.8670 - 5s/epoch - 70ms/step
Epoch 7/100
75/75 - 4s - loss: 0.093

<keras.src.callbacks.History at 0x7a79f14af310>

# Visualize Word Embeddings

In [13]:
# Get the embedding layer from the model (it was added first, so it is layers[0])
embedding_layer = model.layers[0]

# get_weights() returns a list of arrays; the first entry is the embedding
# matrix holding one row per vocabulary index
embedding_weights = embedding_layer.get_weights()[0]

# Print the shape. Expected is (vocab_size, embedding_dim)
print(embedding_weights.shape)

(10000, 2000)


In [14]:
# Get the index -> word dictionary from the fitted tokenizer; it is used to
# recover the word that belongs to each row of the embedding matrix
reverse_word_index = tokenizer.index_word

In [15]:
import io

# Never read past the words the tokenizer actually learned: when the corpus has
# fewer than vocab_size - 1 distinct words, reverse_word_index has no entry for
# the higher indices and the lookup in the loop would raise a KeyError.
last_word_num = min(vocab_size - 1, len(reverse_word_index))

# Write the embedding vectors (vecs.tsv) and the matching words (meta.tsv), one
# word per line in the same order. The `with` block closes both files even if
# the loop raises part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:

  # Start counting at `1` because `0` is just for the padding
  for word_num in range(1, last_word_num + 1):

    # Get the word associated at the current index
    word_name = reverse_word_index[word_num]

    # Get the embedding weights associated with the current index
    word_embedding = embedding_weights[word_num]

    # Write the word name
    out_m.write(word_name + "\n")

    # Write the word embedding as tab-separated values
    out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

In [None]:
# Import files utilities in Colab. Outside Colab the import raises ImportError,
# which is deliberately ignored so the notebook still runs; the download step
# in the `else` branch is then skipped.
try:
  from google.colab import files
except ImportError:
  pass

# Download the files (runs only when the import above succeeded)
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')