In [20]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  1


In [9]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


### Persiapan Dataset

In [1]:
import os
import json


def load_dataset(folder_path):
    dataset = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)
                    if isinstance(data, dict):  # Check if the data is a dictionary
                        dataset.append(data)
            except json.JSONDecodeError:
                print(f"Skipping file {file_path}, not a valid JSON.")
    return dataset
folder_path = './Dataset/generative_dataset'
dataset = load_dataset(folder_path)

Skipping file ./Dataset/generative_dataset\0312.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0540.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0541.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0542.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0543.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0544.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0545.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0546.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0547.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0548.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0549.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0550.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0551.json, not a valid JSON.
Skipping file ./Dataset/generative_dataset\0552.json, not a vali

### Tokenisasi

In [6]:
import re
from collections import defaultdict

class SimpleTokenizer:
    def __init__(self):
        self.word2index = defaultdict(int)
        self.index2word = []
        self.word_count = 0
    
    def fit(self, texts):
        for text in texts:
            for word in re.findall(r'\w+', text):
                if word not in self.word2index:
                    self.word2index[word] = self.word_count
                    self.index2word.append(word)
                    self.word_count += 1
    
    def transform(self, text):
        return [self.word2index[word] for word in re.findall(r'\w+', text)]
    
    def fit_transform(self, texts):
        self.fit(texts)
        return [self.transform(text) for text in texts]
    
    def save(self, path):
        with open(path, 'w') as f:
            json.dump({'word2index': self.word2index, 'index2word': self.index2word}, f)
    
    def load(self, path):
        with open(path, 'r') as f:
            data = json.load(f)
            self.word2index = data['word2index']
            self.index2word = data['index2word']
            self.word_count = len(self.index2word)

tokenizer = SimpleTokenizer()

# Mengumpulkan semua teks untuk tokenisasi
texts = []
for entry in dataset:
    if 'judul' in entry and 'konten' in entry and 'contoh' in entry:
        texts.append(entry['judul'])
        texts.append(entry['konten'])
        for example in entry['contoh']:
            if 'isi' in example:
                texts.append(example['isi'])

tokenizer.fit(texts)

# Membuat tokenized_data dengan struktur yang benar
tokenized_data = []
for entry in dataset:
    if 'judul' in entry and 'konten' in entry and 'contoh' in entry:
        title_tokens = tokenizer.transform(entry['judul'])
        content_tokens = tokenizer.transform(entry['konten'])
        tokenized_data.append({
            'title_tokens': {'input_ids': title_tokens},
            'content_tokens': {'input_ids': content_tokens}
        })


# Simpan tokenizer
tokenizer.save('./saved_model/token/simple_tokenizer.json')

### Model

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, max_seq_length=512):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_length, d_model))
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, batch_first=True)
        self.fc_out = nn.Linear(d_model, vocab_size)
    
    def forward(self, src, tgt):
        src = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        tgt = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :]
        output = self.transformer(src, tgt)
        return self.fc_out(output)

vocab_size = len(tokenizer.index2word)
model = TransformerModel(vocab_size)


### Train Model

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

def collate_fn(batch):
    src = [item['src'] for item in batch]
    tgt = [item['tgt'] for item in batch]
    src = torch.nn.utils.rnn.pad_sequence(src, padding_value=0, batch_first=True)
    tgt = torch.nn.utils.rnn.pad_sequence(tgt, padding_value=0, batch_first=True)
    return src, tgt

train_data = [{'src': torch.tensor(entry['title_tokens']['input_ids']), 'tgt': torch.tensor(entry['content_tokens']['input_ids'])} for entry in tokenized_data]
train_dataset = CustomDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

for epoch in range(30):  # Adjust the number of epochs as needed
    model.train()
    total_loss = 0
    for src, tgt in train_dataloader:
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.view(-1, vocab_size), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

# Save the model
torch.save(model.state_dict(), './saved_model/transformer_model.pth')


Epoch 1, Loss: 6.874313286372593
Epoch 2, Loss: 6.200914178575788
Epoch 3, Loss: 5.853746959141323
Epoch 4, Loss: 5.4872443335396905


KeyboardInterrupt: 