In [1]:
import string
import openpyxl
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [2]:
# Fungsi untuk membaca file Excel
def read_excel(file_name):
    wb = openpyxl.load_workbook(file_name)
    sheet = wb.active
    source_data = []
    target_data = []
    for row in sheet.iter_rows(values_only=True):
        source_data.append(row[0])
        target_data.append(row[1])
    return source_data, target_data

In [3]:
# Fungsi untuk case folding (mengubah ke huruf kecil)
def case_folding(text):
    return text.lower()

In [4]:
# Fungsi untuk cleansing (menghapus tanda baca)
def cleansing(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [5]:
# Fungsi untuk tokenisasi
def tokenization(text):
    return text.split()

In [6]:
# Fungsi untuk menambahkan padding pada token
def add_padding(tokens, max_length):
    return tokens + ['<PAD>'] * (max_length - len(tokens))

In [7]:
# Fungsi untuk embedding kata (mengubah kata menjadi vektor numerik)
def word_embedding(tokens, vocab):
    return [vocab[word] if word in vocab else vocab['<UNK>'] for word in tokens]

In [8]:
# Fungsi untuk membangun kosakata (vocab) dari data
def build_vocab(data):
    vocab = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
    index = 4
    for sentence in data:
        for word in sentence:
            if word not in vocab:
                vocab[word] = index
                index += 1
    return vocab

In [9]:
# Fungsi untuk memproses data teks
def preprocess(file_name):
    source_data, target_data = read_excel(file_name)
    processed_source_data = []
    processed_target_data = []

    for source_text, target_text in zip(source_data, target_data):
        source_text = case_folding(source_text)
        source_text = cleansing(source_text)
        source_tokens = tokenization(source_text)
        source_tokens = ['<SOS>'] + source_tokens + ['<EOS>']
        processed_source_data.append(source_tokens)

        target_text = case_folding(target_text)
        target_text = cleansing(target_text)
        target_tokens = tokenization(target_text)
        target_tokens = ['<SOS>'] + target_tokens + ['<EOS>']
        processed_target_data.append(target_tokens)

    return processed_source_data, processed_target_data


In [10]:
# Kelas Dataset untuk NMT
class NMTCorpusDataset(Dataset):
    def __init__(self, source_data, target_data, vocab):
        self.source_data = source_data
        self.target_data = target_data
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.max_length = max(max(len(sentence) for sentence in source_data), max(len(sentence) for sentence in target_data))

    def __len__(self):
        return len(self.source_data)
    
    def __getitem__(self, idx):
        source_tokens = self.source_data[idx]
        target_tokens = self.target_data[idx]

        padded_source_tokens = add_padding(source_tokens, self.max_length)
        padded_target_tokens = add_padding(target_tokens, self.max_length)

        embedded_source_tokens = torch.tensor(word_embedding(padded_source_tokens, self.vocab))
        embedded_target_tokens = torch.tensor(word_embedding(padded_target_tokens, self.vocab))

        return embedded_source_tokens, embedded_target_tokens

In [11]:
# Kelas Encoder dengan LSTM di PyTorch
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, vocab_size):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, input_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_seq):
        embedded = self.embedding(input_seq)
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [12]:
# Kelas Decoder dengan LSTM di PyTorch
class Decoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, vocab_size):
        super(Decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, input_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, target_seq, hidden, cell):
        embedded = self.embedding(target_seq)
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        output = self.output_layer(output)
        output = self.softmax(output)
        return output, hidden, cell

In [13]:
# Fungsi untuk membangun model NMT
def build_NMT(input_dim, hidden_dim, vocab_size):
    encoder = Encoder(input_dim, hidden_dim, vocab_size)
    decoder = Decoder(input_dim, hidden_dim, vocab_size)
    return encoder, decoder

In [14]:
# Fungsi untuk melatih model NMT
def train_NMT(encoder, decoder, train_loader, epochs, learning_rate, vocab):
    criterion = nn.NLLLoss(ignore_index=vocab['<PAD>'])
    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)
    clip = 1.0

    for epoch in range(epochs):
        encoder.train()
        decoder.train()
        total_loss = 0
        
        for batch in train_loader:
            optimizer.zero_grad()
            
            input_seq, target_seq = batch
            input_seq = input_seq.to(torch.int64)  # Convert to long
            target_seq = target_seq.to(torch.int64)  # Convert to long

            hidden, cell = encoder(input_seq)
            output, _, _ = decoder(target_seq, hidden, cell)
            
            loss = criterion(output.permute(0, 2, 1), target_seq)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {avg_loss}')

        # Tampilkan beberapa contoh kalimat dan hasil prediksi
        with torch.no_grad():
            encoder.eval()
            decoder.eval()
            for i in range(min(5, len(batch[0]))):
                input_seq = batch[0][i, :-1].unsqueeze(0)
                target_seq = batch[1][i, 1:].unsqueeze(0)
                hidden, cell = encoder(input_seq)
                output, _, _ = decoder(target_seq, hidden, cell)
                pred_tokens = output.argmax(2).squeeze().tolist()

                source_text = [key for key, value in vocab.items() if value in input_seq.squeeze().tolist()]
                target_text = [key for key, value in vocab.items() if value in target_seq.squeeze().tolist()]
                predicted_text = [key for key, value in vocab.items() if value in pred_tokens]

                print(f'Source: {" ".join(source_text)}')
                print(f'Target: {" ".join(target_text)}')
                print(f'Predicted: {" ".join(predicted_text)}\n')

In [15]:
# Fungsi untuk memprediksi hasil dari model
def predict(encoder, decoder, dataset, vocab, max_length):
    encoder.eval()
    decoder.eval()
    predictions = []

    with torch.no_grad():
        for source_tokens, _ in dataset:
            source_tokens = source_tokens.unsqueeze(0)  # Tambahkan batch dimension
            hidden, cell = encoder(source_tokens)

            # Inisialisasi token awal untuk decoder
            token = torch.tensor([[vocab['<SOS>']]])

            # Membuat list untuk menampung prediksi
            predicted_sentence = []

            # Melakukan prediksi satu per satu token
            for _ in range(max_length):
                output, hidden, cell = decoder(token, hidden, cell)
                topv, topi = output.topk(1)
                token = topi.squeeze().detach()
                predicted_sentence.append(token.item())
                if token.item() == vocab['<EOS>']:
                    break

            predictions.append(predicted_sentence)

    return predictions


In [16]:
# Fungsi untuk menghitung BLEU score
def calculate_bleu_score(predictions, references, vocab):
    smoothie = SmoothingFunction().method4
    bleu_scores = []

    for pred, ref in zip(predictions, references):
        pred_tokens = [key for key, value in vocab.items() if value in pred]
        ref_tokens = [[key for key, value in vocab.items() if value in ref]]
        bleu = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)
        bleu_scores.append(bleu)

    avg_bleu_score = np.mean(bleu_scores)
    return avg_bleu_score


In [17]:
# Main program
if __name__ == '__main__':
    # Nama file Excel
    file_name = 'Corpus_indo.xlsx'
    
    # Melakukan preprocessing
    processed_source_data, processed_target_data = preprocess(file_name)
    
    # Membangun kosakata
    vocab = build_vocab(processed_source_data + processed_target_data)
    
    # Membuat dataset
    dataset = NMTCorpusDataset(processed_source_data, processed_target_data, vocab)
    
    # Membagi dataset menjadi train dan test
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    
    # Membuat DataLoader untuk pelatihan dan pengujian
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Batch size 1 untuk prediksi
    
    # Inisialisasi parameter
    input_dim = 128
    hidden_dim = 256
    vocab_size = len(vocab)
    epochs = 10
    learning_rate = 0.001
    
    # Membangun model NMT
    encoder, decoder = build_NMT(input_dim, hidden_dim, vocab_size)
    
    # Melakukan pelatihan
    train_NMT(encoder, decoder, train_loader, epochs, learning_rate, vocab)
    
    # Memilih beberapa contoh dari data pengujian untuk menampilkan hasil
    num_examples = 5
    indices = random.sample(range(len(test_dataset)), num_examples)
    examples = [test_dataset[idx] for idx in indices]
    
    # Melakukan prediksi untuk contoh yang dipilih
    predictions = predict(encoder, decoder, examples, vocab)
    
    # Menampilkan source text, target text, dan hasil prediksi
    print("\nExamples:")
    for example, prediction in zip(examples, predictions):
        source_text = [token for token in example[0].numpy().tolist() if token != vocab['<PAD>']]
        target_text = [token for token in example[1].numpy().tolist() if token != vocab['<PAD>']]
        predicted_text = [token for token in prediction if token != vocab['<EOS>']]
        
        source_text = ' '.join([key for key, value in vocab.items() if value in source_text])
        target_text = ' '.join([key for key, value in vocab.items() if value in target_text])
        predicted_text = ' '.join([key for key, value in vocab.items() if value in predicted_text])
        
        print(f'Source Text: {source_text}')
        print(f'Target Text: {target_text}')
        print(f'Predicted Text: {predicted_text}\n')

    # Menghitung BLEU score
    references = [[sentence[1].numpy().tolist() for sentence in test_dataset]]
    bleu_score = calculate_bleu_score(predictions, references)
    print(f'BLEU Score: {bleu_score}')

Epoch 1/10, Train Loss: 6.597913646697998
Source: <PAD> <SOS> <EOS> saya kamu mengapa merobek buku
Target: <PAD> <EOS> noafa obhinighoo bokuku
Predicted: <SOS> <EOS>

Source: <PAD> <SOS> <EOS> belum yang sudah dan dua tiga ekor pernah beranak ayamnya
Target: <PAD> <EOS> raa tolu manuno ghulu bhangkele moose
Predicted: <SOS> <EOS>

Epoch 2/10, Train Loss: 4.917537460327148
Source: <PAD> <SOS> <EOS> dia dibelikan sarung
Target: <PAD> <EOS> degholiane bheta
Predicted: <SOS> <EOS> o

Source: <PAD> <SOS> <EOS> kalau jagung patah tongkol mudah dipatahkan
Target: <PAD> <EOS> ane kahitela wurino nopobhera dobherae
Predicted: <SOS> <EOS> o

Epoch 3/10, Train Loss: 4.433584880828858
Source: <PAD> <SOS> <EOS> anak panah
Target: <PAD> <EOS> anano pana
Predicted: <SOS> <EOS> o

Source: <PAD> <SOS> <EOS> kami pekerjaan upah bagibagi
Target: <PAD> <EOS> mani tambono karadhaa tabagebagee
Predicted: <SOS> <EOS> o miina bhe

Epoch 4/10, Train Loss: 4.035468187332153
Source: <PAD> <SOS> <EOS> saya orang 

KeyboardInterrupt: 