### Percobaan Awal

# Simple Language Model Implementation with TensorFlow

## Import Library yang Diperlukan



In [None]:
import tensorflow as tf
import numpy as np
import os
import pickle
from typing import List, Dict, Tuple, Optional
import matplotlib.pyplot as plt
from datetime import datetime


## Data Preprocessing dan Utility Functions


In [None]:
class TextProcessor:
    """Character-level vocabulary that maps text to integer indices and back.

    The vocabulary is built by ``fit`` (or restored by ``load``); until then
    both mappings are empty and ``vocab_size`` is 0.
    """

    def __init__(self):
        self.char_to_idx: Dict = {}   # character -> integer index
        self.idx_to_char: Dict = {}   # integer index -> character
        self.vocab_size: int = 0      # number of distinct characters seen by fit()

    def fit(self, text: str) -> None:
        """
        Build the vocabulary from the distinct characters of ``text``.

        Characters are sorted so the same text always yields the same mapping.
        Any previously fitted vocabulary is replaced.

        Args:
            text (str): Input text
        """
        chars = sorted(set(text))
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = dict(enumerate(chars))
        self.vocab_size = len(chars)

    def encode(self, text: str) -> np.ndarray:
        """
        Convert text into a sequence of vocabulary indices.

        Args:
            text (str): Input text

        Returns:
            np.ndarray: 1-D integer array with one index per character
                (an empty integer array for empty text)

        Raises:
            KeyError: If ``text`` contains a character that is not in the
                fitted vocabulary.
        """
        try:
            indices = [self.char_to_idx[ch] for ch in text]
        except KeyError as err:
            raise KeyError(
                f"character {err.args[0]!r} is not in the fitted vocabulary"
            ) from None
        # Force NumPy's default integer dtype: without it an empty list would
        # become a float64 array, which cannot be used as indices downstream.
        return np.array(indices, dtype=np.int_)

    def decode(self, indices: np.ndarray) -> str:
        """
        Convert a sequence of vocabulary indices back into text.

        Args:
            indices (np.ndarray): Array (or any iterable) of indices

        Returns:
            str: Decoded text

        Raises:
            KeyError: If an index is not part of the vocabulary.
        """
        return ''.join(self.idx_to_char[idx] for idx in indices)

    def save(self, path: str) -> None:
        """
        Pickle the vocabulary to a file.

        Args:
            path (str): Destination file path
        """
        state = {
            'char_to_idx': self.char_to_idx,
            'idx_to_char': self.idx_to_char,
            'vocab_size': self.vocab_size,
        }
        with open(path, 'wb') as f:
            pickle.dump(state, f)

    @classmethod
    def load(cls, path: str) -> 'TextProcessor':
        """
        Restore a processor previously written by ``save``.

        Args:
            path (str): Path of the pickled processor file

        Returns:
            TextProcessor: The restored processor instance
        """
        with open(path, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code; only load files
            # that this program (or another trusted source) produced.
            data = pickle.load(f)

        processor = cls()
        processor.char_to_idx = data['char_to_idx']
        processor.idx_to_char = data['idx_to_char']
        processor.vocab_size = data['vocab_size']
        return processor

## Model Implementation dengan TensorFlow


In [None]:
class TFLM(tf.keras.Model):
    """Language model built from an embedding, a GRU and a dense output layer.

    A single dropout layer is applied twice in the forward pass: once to the
    embedded input and once to the GRU output.
    """

    def __init__(self,
                 vocab_size: int,
                 embedding_dim: int,
                 rnn_units: int,
                 dropout_rate: float = 0.1):
        """
        Create the layers of the TensorFlow language model.

        Args:
            vocab_size (int): Vocabulary size (embedding rows and output units)
            embedding_dim (int): Embedding dimension
            rnn_units (int): Number of GRU units
            dropout_rate (float): Dropout rate
        """
        super().__init__()

        # Token lookup table.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # The GRU returns the per-step outputs as well as its final state so
        # the state can be fed back in on the next call.
        self.gru = tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        # Projects GRU outputs to one unnormalised score per vocabulary entry.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs: tf.Tensor,
            states: Optional[tf.Tensor] = None,
            training: bool = False) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Run the forward pass.

        Args:
            inputs (tf.Tensor): Input sequence of token indices
            states (Optional[tf.Tensor]): Initial GRU state; ``None`` lets the
                GRU layer use its default initial state
            training (bool): Whether dropout is active

        Returns:
            Tuple[tf.Tensor, tf.Tensor]: Output logits and the final GRU state
        """
        embedded = self.dropout(self.embedding(inputs), training=training)
        sequence_output, final_state = self.gru(embedded, initial_state=states)
        regularised = self.dropout(sequence_output, training=training)
        return self.dense(regularised), final_state

    def initialize_states(self, batch_size: int) -> tf.Tensor:
        """
        Build an all-zero initial state for inference.

        Args:
            batch_size (int): Batch size

        Returns:
            tf.Tensor: Zero tensor of shape ``[batch_size, gru.units]``
        """
        state_shape = [batch_size, self.gru.units]
        return tf.zeros(state_shape)

## Training Manager




In [None]:
class ModelTrainer:
    """Train, checkpoint, sample from and persist a ``TFLM`` model."""

    # File names used inside the directory handled by save_model()/load_model().
    _WEIGHTS_FILE = 'model.weights.h5'
    _CONFIG_FILE = 'model_config.json'
    _PROCESSOR_FILE = 'processor.pkl'
    _LEGACY_MODEL_DIR = 'model'

    def __init__(self,
                 model: TFLM,
                 processor: TextProcessor,
                 checkpoint_dir: str = './training_checkpoints'):
        """
        Set up the training manager.

        Args:
            model (TFLM): Model instance to train
            processor (TextProcessor): Text processor holding the vocabulary
            checkpoint_dir (str): Directory for training checkpoints
        """
        self.model = model
        self.processor = processor
        self.checkpoint_dir = checkpoint_dir
        self.checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
        # One entry per completed epoch in each list.
        self.history = {'loss': [], 'val_loss': []}

        # Optimizer and loss; the model outputs raw logits.
        self.optimizer = tf.keras.optimizers.Adam()
        self.loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True)

        # Checkpoint tracks both the optimizer and the model.
        self.checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer,
            model=self.model)

    @tf.function
    def train_step(self,
                  inputs: tf.Tensor,
                  targets: tf.Tensor) -> tf.Tensor:
        """
        Run a single optimisation step on one batch.

        Args:
            inputs (tf.Tensor): Input batch
            targets (tf.Tensor): Target batch

        Returns:
            tf.Tensor: Loss value for the batch
        """
        with tf.GradientTape() as tape:
            predictions, _ = self.model(inputs, training=True)
            loss = self.loss(targets, predictions)

        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        return loss

    def train(self,
              text: str,
              epochs: int,
              batch_size: int = 64,
              seq_length: int = 100,
              validation_split: float = 0.1,
              log_every: int = 10):
        """
        Train the model on ``text`` and record per-epoch losses in ``history``.

        The text is cut into chunks of ``seq_length + 1`` characters; the first
        ``validation_split`` fraction of the chunks is held out for validation.

        Args:
            text (str): Training text
            epochs (int): Number of epochs
            batch_size (int): Batch size
            seq_length (int): Sequence length
            validation_split (float): Validation data ratio, in ``[0, 1)``
            log_every (int): Log and save a checkpoint every this many epochs;
                a value <= 0 disables logging and checkpointing

        Raises:
            ValueError: If an argument is out of range, or the text is too
                short to yield at least one full training batch.
        """
        if batch_size < 1 or seq_length < 1:
            raise ValueError("batch_size and seq_length must be >= 1")
        if not 0 <= validation_split < 1:
            raise ValueError("validation_split must be in the range [0, 1)")

        text_as_int = self.processor.encode(text)

        # Count the samples arithmetically instead of materialising the whole
        # dataset just to take its length.
        n_samples = len(text_as_int) // (seq_length + 1)
        n_val = int(n_samples * validation_split)
        n_train = n_samples - n_val
        if n_train < batch_size:
            # Training batches drop the remainder, so fewer samples than
            # batch_size means zero batches and an undefined mean loss.
            raise ValueError(
                f"text yields {n_train} training sequence(s) of length "
                f"{seq_length}, fewer than batch_size={batch_size}; use a "
                f"longer text or a smaller seq_length/batch_size")

        char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
        sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

        def split_input_target(chunk):
            # Targets are the inputs shifted one character to the left.
            return chunk[:-1], chunk[1:]

        dataset = sequences.map(split_input_target)

        train_data = dataset.skip(n_val).shuffle(10000).batch(
            batch_size, drop_remainder=True)
        # Keep the partial batch for validation so small hold-out sets are
        # still evaluated instead of being dropped entirely.
        val_data = dataset.take(n_val).batch(batch_size)

        for epoch in range(epochs):
            start = datetime.now()

            # Training
            batch_losses = [
                float(self.train_step(inputs, targets))
                for inputs, targets in train_data
            ]
            train_loss = sum(batch_losses) / len(batch_losses)

            # Validation
            val_losses = []
            for inputs, targets in val_data:
                predictions, _ = self.model(inputs, training=False)
                val_losses.append(float(self.loss(targets, predictions)))
            # NaN marks "no validation data" without breaking the history.
            val_loss = (sum(val_losses) / len(val_losses)
                        if val_losses else float('nan'))

            # Record history
            self.history['loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)

            # Logging
            if log_every > 0 and epoch % log_every == 0:
                print(
                    f'Epoch {epoch+1} | '
                    f'Loss: {train_loss:.4f} | '
                    f'Val Loss: {val_loss:.4f} | '
                    f'Time: {datetime.now() - start}'
                )

                # Save checkpoint
                self.checkpoint.save(file_prefix=self.checkpoint_prefix)

    def generate_text(self,
                     start_string: str,
                     num_chars: int = 1000,
                     temperature: float = 1.0) -> str:
        """
        Sample text from the model, one character at a time.

        Args:
            start_string (str): Seed string; must be non-empty and use only
                characters known to the processor
            num_chars (int): Number of characters to generate
            temperature (float): Sampling temperature, must be > 0

        Returns:
            str: ``start_string`` followed by the generated characters

        Raises:
            ValueError: If ``start_string`` is empty or ``temperature`` <= 0.
        """
        if not start_string:
            raise ValueError("start_string must contain at least one character")
        if temperature <= 0:
            raise ValueError("temperature must be greater than 0")

        # Encode the seed and add a batch dimension.
        input_eval = tf.expand_dims(self.processor.encode(start_string), 0)

        text_generated = []

        # None lets the model start from its default initial state.
        states = None

        for _ in range(num_chars):
            predictions, states = self.model(
                input_eval, states=states, training=False)

            # Only the logits of the last time step decide the next character.
            last_logits = predictions[:, -1, :] / temperature
            predicted_id = tf.random.categorical(
                last_logits, num_samples=1)[0, 0].numpy()

            # Feed the sampled character back in as the next input.
            input_eval = tf.expand_dims([predicted_id], 0)

            text_generated.append(
                self.processor.idx_to_char[predicted_id])

        return start_string + ''.join(text_generated)

    def plot_history(self):
        """
        Plot the recorded training and validation loss per epoch.
        """
        plt.figure(figsize=(12, 4))

        plt.subplot(1, 2, 1)
        plt.plot(self.history['loss'], label='Training Loss')
        plt.plot(self.history['val_loss'], label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        plt.tight_layout()
        plt.show()

    def save_model(self, model_dir: str):
        """
        Save the model (hyperparameters and weights) and the processor.

        The model is stored as a JSON file with its constructor arguments plus
        a weights file, so ``load_model`` can rebuild a real ``TFLM`` instance.
        ``Model.save`` on an extension-less path is avoided because Keras 3
        rejects it.

        Args:
            model_dir (str): Directory to save into (created if missing)
        """
        import json

        os.makedirs(model_dir, exist_ok=True)

        # Constructor arguments, read back from the model's own layers.
        config = {
            'vocab_size': int(self.model.embedding.input_dim),
            'embedding_dim': int(self.model.embedding.output_dim),
            'rnn_units': int(self.model.gru.units),
            'dropout_rate': float(self.model.dropout.rate),
        }
        config_path = os.path.join(model_dir, self._CONFIG_FILE)
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

        # Save weights
        self.model.save_weights(os.path.join(model_dir, self._WEIGHTS_FILE))

        # Save processor
        self.processor.save(os.path.join(model_dir, self._PROCESSOR_FILE))

    @classmethod
    def load_model(cls,
                  model_dir: str,
                  checkpoint_dir: str = './training_checkpoints') -> 'ModelTrainer':
        """
        Load a model and processor previously written by ``save_model``.

        Args:
            model_dir (str): Model directory
            checkpoint_dir (str): Checkpoint directory for the new trainer

        Returns:
            ModelTrainer: Trainer wrapping the loaded model and processor
        """
        import json

        # Load processor
        processor = TextProcessor.load(
            os.path.join(model_dir, cls._PROCESSOR_FILE))

        config_path = os.path.join(model_dir, cls._CONFIG_FILE)
        if os.path.exists(config_path):
            with open(config_path, encoding='utf-8') as f:
                config = json.load(f)
            model = TFLM(**config)
            # Layer variables are created on the first call; run a dummy
            # single-token batch so the weights have somewhere to load into.
            model(tf.zeros([1, 1], dtype=tf.int32))
            model.load_weights(os.path.join(model_dir, cls._WEIGHTS_FILE))
        else:
            # Directory written by the older whole-model save layout.
            model = tf.keras.models.load_model(
                os.path.join(model_dir, cls._LEGACY_MODEL_DIR))

        return cls(model, processor, checkpoint_dir)

## Contoh Penggunaan



In [None]:
# Sample text
text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence 
concerned with the interactions between computers and human language, in particular how to program computers to 
process and analyze large amounts of natural language data.
"""

# Initialize processor and create vocabulary
processor = TextProcessor()
processor.fit(text)

# Create model
model = TFLM(
    vocab_size=processor.vocab_size,
    embedding_dim=256,
    rnn_units=512,
    dropout_rate=0.2
)

# Initialize trainer
trainer = ModelTrainer(model, processor)

# Training
# The sample text is only a few hundred characters long, so the sequence
# length and batch size must be small: with seq_length=100 and batch_size=32
# the text yields two sequences and zero full batches, and training fails when
# averaging the loss. These values give several training batches and at least
# one full validation batch.
trainer.train(
    text=text,
    epochs=50,
    batch_size=4,
    seq_length=10,
    validation_split=0.2,
    log_every=5
)

# Plot training history
trainer.plot_history()

# Generate some text
generated_text = trainer.generate_text(
    start_string="Natural",
    num_chars=200,
    temperature=0.7
)
print("Generated text:", generated_text)

# Save model
trainer.save_model('./saved_model')

# Load model (for later use)
loaded_trainer = ModelTrainer.load_model('./saved_model')