In [1]:
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

# LSTM:

<img src="images/lstm_1.png"  width="900" height="450">

- **Forget Gate:** Geçmişten gelecek bilginin hangilerinin unutulup hangilerinin unutulmayacağını belirlemeyi sağlar.
- **Input Gate:** Gelecek yeni input'un cell'e eklenip eklenmeyeceğini belirlemeyi sağlar. Burada Candidate memory ifadesi, memory'e tutulacak input'ların belirlenmesinde rol oynar.
- **Output Gate:** Bir sonraki cell'e hangi bilgilerin aktarılacağının belirlenmesini sağlar.

### LSTM'deki İfadelerin Sayısal Gösterimleri:

<img src="images/lstm_2.png"  width="900" height="450">

In [2]:
import torch.nn as nn
import torch.optim as optim
import re
from collections import Counter # For counting elements
from itertools import product # For generating combinations

In [3]:
# 1. Read the text file and clean it
def preprocess_shakespeare(text):
    # Remove stage directions and other text in square brackets (e.g., [Enter stage left])
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove "ACT" and "Scene" markers (e.g., "ACT 1", "Scene 2"), ignoring case
    text = re.sub(r'ACT\s+\d+|Scene\s+\d+', '', text, flags=re.IGNORECASE)
    
    # Remove character names, which are typically in all caps at the start of a line
    text = re.sub(r'^[A-Z][A-Z\s]+(?=\n)', '', text, flags=re.MULTILINE)
    
    # Replace multiple consecutive newlines with a single newline
    text = re.sub(r'\n+', '\n', text)
    
    # Replace multiple consecutive spaces with a single space
    text = re.sub(r'[ ]{2,}', ' ', text)
    
    # Remove any stray "=" characters from the text
    text = text.replace('=', '')
    
    # Remove full web URLs that start with http or https
    text = re.sub(r'http[s]?://\S+', '', text)
    
    # Remove web addresses that start with www.
    text = re.sub(r'www\.\S+', '', text)
    
    text = text.lower()
    return text.strip()

In [4]:
with open('data/a-midsummer-nights-dream_TXT_FolgerShakespeare.txt', 'r') as file:
    raw_data = file.read()

cleaned_data = preprocess_shakespeare(raw_data)

cleaned_data[:500]

"a midsummer night's dream\nby william shakespeare\nedited by barbara a. mowat and paul werstine\n with michael poston and rebecca niles\nfolger shakespeare library\n\ncreated on jul 31, 2015, from fdt version 0.9.2\ncharacters in the play\n\nfour lovers:\n hermia\n lysander\n helena\n demetrius\ntheseus, duke of athens\nhippolyta, queen of the amazons\negeus, father to hermia\nphilostrate, master of the revels to theseus\nnick bottom, weaver\npeter quince, carpenter\nfrancis flute, bellows-mender\ntom snout, tinker\n"

In [5]:
# 2. Split text into words
words = cleaned_data.split()
words

['a',
 'midsummer',
 "night's",
 'dream',
 'by',
 'william',
 'shakespeare',
 'edited',
 'by',
 'barbara',
 'a.',
 'mowat',
 'and',
 'paul',
 'werstine',
 'with',
 'michael',
 'poston',
 'and',
 'rebecca',
 'niles',
 'folger',
 'shakespeare',
 'library',
 'created',
 'on',
 'jul',
 '31,',
 '2015,',
 'from',
 'fdt',
 'version',
 '0.9.2',
 'characters',
 'in',
 'the',
 'play',
 'four',
 'lovers:',
 'hermia',
 'lysander',
 'helena',
 'demetrius',
 'theseus,',
 'duke',
 'of',
 'athens',
 'hippolyta,',
 'queen',
 'of',
 'the',
 'amazons',
 'egeus,',
 'father',
 'to',
 'hermia',
 'philostrate,',
 'master',
 'of',
 'the',
 'revels',
 'to',
 'theseus',
 'nick',
 'bottom,',
 'weaver',
 'peter',
 'quince,',
 'carpenter',
 'francis',
 'flute,',
 'bellows-mender',
 'tom',
 'snout,',
 'tinker',
 'snug,',
 'joiner',
 'robin',
 'starveling,',
 'tailor',
 'oberon,',
 'king',
 'of',
 'the',
 'fairies',
 'titania,',
 'queen',
 'of',
 'the',
 'fairies',
 'robin',
 'goodfellow,',
 'a',
 '"puck,"',
 'or',


In [6]:
# 3. Build Vocabulary
word_counts = Counter(words)
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)

word_to_idx = {word: i for i, word in enumerate(sorted_vocab)}
idx_to_word = {i: word for i, word in enumerate(sorted_vocab)}

vocab_size = len(word_to_idx)

In [7]:
word_to_idx

{'the': 0,
 'and': 1,
 'i': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'in': 6,
 'you': 7,
 'my': 8,
 'is': 9,
 'with': 10,
 'that': 11,
 'not': 12,
 'this': 13,
 'for': 14,
 'your': 15,
 'as': 16,
 'it': 17,
 'thou': 18,
 'will': 19,
 'but': 20,
 'me': 21,
 'have': 22,
 'do': 23,
 'his': 24,
 'be': 25,
 'so': 26,
 'he': 27,
 'her': 28,
 'all': 29,
 'shall': 30,
 'we': 31,
 'by': 32,
 'love': 33,
 'are': 34,
 'thy': 35,
 'no': 36,
 'on': 37,
 'or': 38,
 'if': 39,
 'our': 40,
 'when': 41,
 'she': 42,
 'what': 43,
 'from': 44,
 'am': 45,
 'their': 46,
 'bottom': 47,
 'now': 48,
 'must': 49,
 'at': 50,
 'here': 51,
 'more': 52,
 'quince': 53,
 'sweet': 54,
 'thee': 55,
 'good': 56,
 'hath': 57,
 'would': 58,
 'o': 59,
 'one': 60,
 'did': 61,
 'some': 62,
 'how': 63,
 'come': 64,
 'let': 65,
 'make': 66,
 'should': 67,
 'demetrius': 68,
 'then': 69,
 'never': 70,
 'me,': 71,
 'man': 72,
 'him': 73,
 'than': 74,
 'an': 75,
 'theseus': 76,
 'you,': 77,
 'can': 78,
 'they': 79,
 'see': 80,
 'doth': 81,

In [8]:
vocab_size

4352

In [9]:
# Convert words to indices
indexed_words = [word_to_idx[w] for w in words]

In [10]:
indexed_words

[5,
 1427,
 440,
 258,
 32,
 1428,
 839,
 1429,
 32,
 1430,
 1431,
 1432,
 1,
 1433,
 1434,
 10,
 1435,
 1436,
 1,
 1437,
 1438,
 1439,
 839,
 1440,
 840,
 37,
 1441,
 1442,
 1443,
 44,
 1444,
 1445,
 1446,
 1447,
 6,
 0,
 87,
 300,
 1448,
 181,
 88,
 146,
 68,
 356,
 226,
 4,
 129,
 301,
 205,
 4,
 0,
 1449,
 441,
 574,
 3,
 181,
 357,
 227,
 4,
 0,
 442,
 3,
 76,
 575,
 117,
 1450,
 206,
 358,
 1451,
 841,
 147,
 1452,
 1453,
 359,
 1454,
 443,
 1455,
 182,
 259,
 1456,
 260,
 360,
 4,
 0,
 302,
 228,
 205,
 4,
 0,
 302,
 182,
 1457,
 5,
 1458,
 38,
 1459,
 6,
 1460,
 576,
 5,
 1461,
 6,
 0,
 576,
 4,
 164,
 302,
 1462,
 84,
 1463,
 361,
 303,
 444,
 304,
 842,
 1,
 1464,
 37,
 76,
 1,
 183,
 207,
 302,
 6,
 0,
 1465,
 4,
 164,
 1,
 229,
 156,
 89,
 301,
 40,
 445,
 843,
 577,
 37,
 1466,
 300,
 261,
 578,
 230,
 6,
 262,
 231,
 184,
 105,
 362,
 63,
 579,
 13,
 305,
 185,
 1467,
 42,
 1468,
 8,
 1469,
 93,
 3,
 5,
 1470,
 38,
 5,
 844,
 208,
 845,
 114,
 5,
 363,
 446,
 1471,
 300,


In [11]:
# 4. Create input sequences and targets
sequence_length = 20  # Örnek dizi uzunluğu, bunu artırıp azaltarak deneyebilirsiniz.
sequences = []

for i in range(len(indexed_words) - sequence_length):
    seq_in = indexed_words[i:i + sequence_length]
    seq_out = indexed_words[i + sequence_length]
    sequences.append((seq_in, seq_out))

In [12]:
sequences

[([5,
   1427,
   440,
   258,
   32,
   1428,
   839,
   1429,
   32,
   1430,
   1431,
   1432,
   1,
   1433,
   1434,
   10,
   1435,
   1436,
   1,
   1437],
  1438),
 ([1427,
   440,
   258,
   32,
   1428,
   839,
   1429,
   32,
   1430,
   1431,
   1432,
   1,
   1433,
   1434,
   10,
   1435,
   1436,
   1,
   1437,
   1438],
  1439),
 ([440,
   258,
   32,
   1428,
   839,
   1429,
   32,
   1430,
   1431,
   1432,
   1,
   1433,
   1434,
   10,
   1435,
   1436,
   1,
   1437,
   1438,
   1439],
  839),
 ([258,
   32,
   1428,
   839,
   1429,
   32,
   1430,
   1431,
   1432,
   1,
   1433,
   1434,
   10,
   1435,
   1436,
   1,
   1437,
   1438,
   1439,
   839],
  1440),
 ([32,
   1428,
   839,
   1429,
   32,
   1430,
   1431,
   1432,
   1,
   1433,
   1434,
   10,
   1435,
   1436,
   1,
   1437,
   1438,
   1439,
   839,
   1440],
  840),
 ([1428,
   839,
   1429,
   32,
   1430,
   1431,
   1432,
   1,
   1433,
   1434,
   10,
   1435,
   1436,
   1,
   1437,
   14

In [13]:
print(f"Created a total of {len(sequences)} sequences.")

Created a total of 16653 sequences.


In [14]:
import torch.nn.functional as F
# Define the model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.3):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # Use pre-trained embeddings or larger embedding dimension
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Use Bidirectional LSTM
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim,
                           num_layers=num_layers,
                           dropout=dropout if num_layers > 1 else 0,
                           batch_first=True,
                           bidirectional=True)  # Added bidirectional
        
        # Add Layer Normalization
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)  # x2 for bidirectional
        
        self.dropout = nn.Dropout(dropout)
        
        # Deeper FC layers
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden=None):
        batch_size = x.size(0)
        
        if hidden is None:
            h0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_dim).to(x.device)  # x2 for bidirectional
            c0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_dim).to(x.device)
            hidden = (h0, c0)
        
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # Get the last time step
        lstm_out = lstm_out[:, -1, :]
        
        # Apply layer normalization
        lstm_out = self.layer_norm(lstm_out)
        lstm_out = self.dropout(lstm_out)
        
        # Use two FC layers
        output = F.relu(self.fc1(lstm_out))
        output = self.dropout(output)
        output = self.fc2(output)
        
        return output, hidden

In [15]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import itertools
import time

# --- TRAINING FUNCTION FOR GRID SEARCH ---

# Split the sequences data into (input_sequence, target_word)
inputs = torch.tensor([item[0] for item in sequences])
targets = torch.tensor([item[1] for item in sequences])

def train_for_grid_search(params):
    """
    Trains the model with the given hyperparameters and returns the final loss.
    """
    print(f"--- Trying parameters: {params} ---")
    
    # Create model, DataLoader, Criterion and Optimizer with parameters
    model = LSTMModel(
        vocab_size=vocab_size,
        embedding_dim=params['embedding_dim'],
        hidden_dim=params['hidden_dim'],
        num_layers=params['num_layers'],
        dropout=params['dropout']
    ).to(device)
    
    dataset = TensorDataset(inputs, targets)
    data_loader = DataLoader(dataset, shuffle=True, batch_size=params['batch_size'])
    
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
    
    # Using fewer epochs for grid search to save time
    num_epochs = 5
    model.train()
    
    start_time = time.time()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for inputs_batch, targets_batch in data_loader:
            inputs_batch, targets_batch = inputs_batch.to(device), targets_batch.to(device)
            
            optimizer.zero_grad()
            outputs, hidden = model(inputs_batch)
            loss = criterion(outputs, targets_batch)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        avg_loss = epoch_loss / len(data_loader)
        print(f"  Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    end_time = time.time()
    print(f"  Completed in: {end_time - start_time:.2f} seconds")
    
    # Return the average loss of the last epoch
    return avg_loss

In [16]:
# --- GRID SEARCH SETUP ---

# 1. The larger this list, the longer the process will take!
param_grid = {
    'embedding_dim': [256, 300, 400],  # Larger embeddings
    'hidden_dim': [256, 384, 512],     # Larger hidden layers
    'learning_rate': [0.0001, 0.0005, 0.001, 0.005],  # More LR options
    'batch_size': [32],      
    'num_layers': [2],
    'dropout': [0.2]
}

# 2. Create all possible parameter combinations
keys, values = zip(*param_grid.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

results = []
best_loss = float('inf')
best_params = None

print(f"Starting Grid Search. Trying {len(param_combinations)} combinations in total.")

# 3. Train the model for each combination
for params in param_combinations:
    loss = train_for_grid_search(params)
    results.append({'params': params, 'loss': loss})
    
    if loss < best_loss:
        best_loss = loss
        best_params = params

# 4. Print the best result
print("\n--- Grid Search Complete ---")
print(f"Lowest Loss: {best_loss:.4f}")
print(f"Best Hyperparameters: {best_params}")

Starting Grid Search. Trying 36 combinations in total.
--- Trying parameters: {'embedding_dim': 256, 'hidden_dim': 256, 'learning_rate': 0.0001, 'batch_size': 32, 'num_layers': 2, 'dropout': 0.2} ---
  Epoch 1/5, Loss: 7.2548
  Epoch 2/5, Loss: 6.5273
  Epoch 3/5, Loss: 6.1839
  Epoch 4/5, Loss: 5.8840
  Epoch 5/5, Loss: 5.6163
  Completed in: 39.91 seconds
--- Trying parameters: {'embedding_dim': 256, 'hidden_dim': 256, 'learning_rate': 0.0005, 'batch_size': 32, 'num_layers': 2, 'dropout': 0.2} ---
  Epoch 1/5, Loss: 7.0262
  Epoch 2/5, Loss: 6.3857
  Epoch 3/5, Loss: 5.8915
  Epoch 4/5, Loss: 5.4145
  Epoch 5/5, Loss: 4.9584
  Completed in: 39.77 seconds
--- Trying parameters: {'embedding_dim': 256, 'hidden_dim': 256, 'learning_rate': 0.001, 'batch_size': 32, 'num_layers': 2, 'dropout': 0.2} ---
  Epoch 1/5, Loss: 7.0627
  Epoch 2/5, Loss: 6.6800
  Epoch 3/5, Loss: 6.3962
  Epoch 4/5, Loss: 6.0561
  Epoch 5/5, Loss: 5.7047
  Completed in: 39.90 seconds
--- Trying parameters: {'embedd

In [17]:
# --- FINAL MODEL TRAINING ---

# 1. Create the final model with the best parameters
final_model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=best_params['embedding_dim'],
    hidden_dim=best_params['hidden_dim'],
    num_layers=best_params['num_layers'],
    dropout=best_params['dropout']
).to(device)

# 2. Create DataLoader
dataset = TensorDataset(inputs, targets)
data_loader = DataLoader(dataset, shuffle=True, batch_size=best_params['batch_size'])

# 3. Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(final_model.parameters(), lr=best_params['learning_rate'])

# 4. Run the final training loop for more epochs
num_epochs = 40
final_model.train()

print("Starting final training with the best hyperparameters...")
for epoch in range(num_epochs):
    epoch_loss = 0
    for inputs_batch, targets_batch in data_loader:
        inputs_batch, targets_batch = inputs_batch.to(device), targets_batch.to(device)
        optimizer.zero_grad()
        outputs, hidden = final_model(inputs_batch)  # FIX: Removed duplicate variable name
        loss = criterion(outputs, targets_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(data_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

print("Final training complete.")

Starting final training with the best hyperparameters...
Epoch 1/40, Loss: 7.0097
Epoch 2/40, Loss: 6.3231
Epoch 3/40, Loss: 5.7846
Epoch 4/40, Loss: 5.2605
Epoch 5/40, Loss: 4.7544
Epoch 6/40, Loss: 4.2271
Epoch 7/40, Loss: 3.7221
Epoch 8/40, Loss: 3.2386
Epoch 9/40, Loss: 2.7589
Epoch 10/40, Loss: 2.3693
Epoch 11/40, Loss: 1.9847
Epoch 12/40, Loss: 1.6689
Epoch 13/40, Loss: 1.4222
Epoch 14/40, Loss: 1.2106
Epoch 15/40, Loss: 1.0298
Epoch 16/40, Loss: 0.9192
Epoch 17/40, Loss: 0.7872
Epoch 18/40, Loss: 0.7091
Epoch 19/40, Loss: 0.6113
Epoch 20/40, Loss: 0.5619
Epoch 21/40, Loss: 0.5121
Epoch 22/40, Loss: 0.4705
Epoch 23/40, Loss: 0.4277
Epoch 24/40, Loss: 0.4154
Epoch 25/40, Loss: 0.3892
Epoch 26/40, Loss: 0.3437
Epoch 27/40, Loss: 0.3308
Epoch 28/40, Loss: 0.3249
Epoch 29/40, Loss: 0.3116
Epoch 30/40, Loss: 0.3127
Epoch 31/40, Loss: 0.2823
Epoch 32/40, Loss: 0.2795
Epoch 33/40, Loss: 0.2588
Epoch 34/40, Loss: 0.2543
Epoch 35/40, Loss: 0.2581
Epoch 36/40, Loss: 0.2483
Epoch 37/40, Los

In [18]:
def generate_text(model, start_text, num_words=50, temperature=0.8):
    """
    Generates text using the trained model.
    
    Args:
        model: The trained LSTM model
        start_text: Initial text to start generation (must be at least sequence_length words)
        num_words: Number of words to generate
        temperature: Controls randomness (0.1=conservative, 1.0=normal, 2.0=creative)
    """
    model.eval() # Set the model to evaluation mode

    # Process the starting text
    words = start_text.lower().split()
    
    # Make sure we have enough words for a full sequence
    if len(words) < sequence_length:
        print(f"Warning: Start text must have at least {sequence_length} words. Padding with 'the'.")
        words = ['the'] * (sequence_length - len(words)) + words
    
    # Take only the last sequence_length words if we have too many
    words = words[-sequence_length:]
    
    # Convert words to indices (use 0 for unknown words)
    current_sequence = [word_to_idx.get(w, 0) for w in words]
    generated_words = words.copy()
    
    with torch.no_grad(): # We don't need to calculate gradients for generation
        for _ in range(num_words):
            # Convert current sequence to tensor
            input_tensor = torch.tensor([current_sequence], device=device)
            
            # Get the model's output
            output, _ = model(input_tensor)
            
            # Apply temperature to the output logits
            word_weights = output.squeeze().div(temperature).exp().cpu()
            
            # Sample the next word index from the probability distribution
            word_idx = torch.multinomial(word_weights, 1)[0].item()
            
            # Update the sequence: remove first word, add new word
            current_sequence = current_sequence[1:] + [word_idx]
            
            # Convert the index back to a word and add to our list
            generated_word = idx_to_word[word_idx]
            generated_words.append(generated_word)

    return ' '.join(generated_words)

In [19]:
# Test the text generation
start_prompt = "shall i compare thee to a summers day thou art more lovely and temperate rough winds do shake the"
generated_text = generate_text(final_model, start_prompt, num_words=30, temperature=0.8)
print("\nGenerated text:")
print(generated_text)


Generated text:
the shall i compare thee to a summers day thou art more lovely and temperate rough winds do shake the head of this athenian swain, that he, awaking when the other do, may all to athens back again repair and think no more of this night's accidents but as the
