<a href="https://colab.research.google.com/github/2303A51786/nlp/blob/main/PROJECT_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install required libraries (run once)
!pip install pandas numpy nltk gensim torch transformers tensorflow scikit-learn

# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("indonesia_news_titles.csv")
print("Dataset shape:", df.shape)
print(df.head())

# Clean text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words and len(w) > 1]
    return " ".join(tokens)

df["clean_text"] = df["title"].apply(clean_text)
print(df["clean_text"].head())



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset shape: (1174, 3)
          source                                              title  \
0  CNN Indonesia  Wamendagri Sebut Dalam Waktu Dekat Ada Kepala ...   
1  CNN Indonesia  Kepala Daerah Awali Hari Ketiga Retret Pembeka...   
2  CNN Indonesia  Lemparan Arhan Makan Korban, Bangkok United Me...   
3  CNN Indonesia                      PSSI Resmi Pecat Indra Sjafri   
4  CNN Indonesia  VIDEO: Hizbullah Gelar Prosesi Pemakaman Hasan...   

           date  
0  Unknown Date  
1  Unknown Date  
2  Unknown Date  
3  Unknown Date  
4  Unknown Date  
0    wamendagri sebut dalam waktu dekat ada kepala ...
1    kepala daerah awali hari ketiga retret pembeka...
2    lemparan arhan makan korban bangkok united men...
3                        pssi resmi pecat indra sjafri
4    video hizbullah gelar prosesi pemakaman hasan ...
Name: clean_text, dtype: object


In [5]:
from gensim.models import Word2Vec

# Tokenize sentences
sentences = [text.split() for text in df["clean_text"]]

# Train Word2Vec model
model_w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, epochs=10)
print("Vocabulary size:", len(model_w2v.wv.index_to_key))

# Example: Similar words
word = model_w2v.wv.index_to_key[0]
print(f"Most similar words to '{word}':")
print(model_w2v.wv.most_similar(word))

Vocabulary size: 3560
Most similar words to 'di':
[('dan', 0.9655696153640747), ('yang', 0.9433751702308655), ('untuk', 0.937593400478363), ('jadi', 0.9363900423049927), ('dalam', 0.9345992207527161), ('dengan', 0.9298462867736816), ('ke', 0.9269905686378479), ('dari', 0.9183618426322937), ('ini', 0.9094700813293457), ('baru', 0.9089093804359436)]


In [7]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load pretrained BERT model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

# Encode first 5 sentences
texts = df["clean_text"].head(5).tolist()
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get embeddings
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)

print("Embeddings shape:", embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Embeddings shape: torch.Size([5, 768])


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Prepare data
texts = df["clean_text"].values
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=40, padding="post")

# Dummy target (since dataset unlabeled)
y = np.array([len(t.split()) % 2 for t in texts])

X_train, X_val, y_train, y_val = train_test_split(padded, y, test_size=0.2, random_state=42)

# LSTM model
model_lstm = Sequential([
    Embedding(10000, 128, input_length=40),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

# Train
model_lstm.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val))




Epoch 1/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 71ms/step - accuracy: 0.5221 - loss: 0.6935 - val_accuracy: 0.5574 - val_loss: 0.6905
Epoch 2/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 70ms/step - accuracy: 0.5398 - loss: 0.6931 - val_accuracy: 0.5574 - val_loss: 0.6917
Epoch 3/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.5110 - loss: 0.6933 - val_accuracy: 0.5574 - val_loss: 0.6898


<keras.src.callbacks.history.History at 0x7cdbaeb55b80>

In [9]:
from tensorflow.keras.layers import Bidirectional

# BiLSTM model
model_bilstm = Sequential([
    Embedding(10000, 128, input_length=40),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_bilstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_bilstm.summary()

# Train
model_bilstm.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 177ms/step - accuracy: 0.5025 - loss: 0.6920 - val_accuracy: 0.5574 - val_loss: 0.6822
Epoch 2/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 173ms/step - accuracy: 0.5838 - loss: 0.6526 - val_accuracy: 0.5787 - val_loss: 0.6687
Epoch 3/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 135ms/step - accuracy: 0.8454 - loss: 0.5076 - val_accuracy: 0.5872 - val_loss: 0.7914


<keras.src.callbacks.history.History at 0x7cdba5f52450>