## VERİ

In [21]:
import numpy as np
from numpy.random import randn
import random

train_data = {
  'good': True,
  'bad': False,
  'happy': True,
  'sad': False,
  'not good': False,
  'not bad': True,
  'not happy': False,
  'not sad': True,
  'very good': True,
  'very bad': False,
  'very happy': True,
  'very sad': False,
  'i am happy': True,
  'this is good': True,
  'i am bad': False,
  'this is bad': False,
  'i am sad': False,
  'this is sad': False,
  'i am not happy': False,
  'this is not good': False,
  'i am not bad': True,
  'this is not sad': True,
  'i am very happy': True,
  'this is very good': True,
  'i am very bad': False,
  'this is very sad': False,
  'this is very happy': True,
  'i am good not bad': True,
  'this is good not bad': True,
  'i am bad not good': False,
  'i am good and happy': True,
  'this is not good and not happy': False,
  'i am not at all good': False,
  'i am not at all bad': True,
  'i am not at all happy': False,
  'this is not at all sad': True,
  'this is not at all happy': False,
  'i am good right now': True,
  'i am bad right now': False,
  'this is bad right now': False,
  'i am sad right now': False,
  'i was good earlier': True,
  'i was happy earlier': True,
  'i was bad earlier': False,
  'i was sad earlier': False,
  'i am very bad right now': False,
  'this is very good right now': True,
  'this is very sad right now': False,
  'this was bad earlier': False,
  'this was very good earlier': True,
  'this was very bad earlier': False,
  'this was very happy earlier': True,
  'this was very sad earlier': False,
  'i was good and not bad earlier': True,
  'i was not good and not happy earlier': False,
  'i am not at all bad or sad right now': True,
  'i am not at all good or happy right now': False,
  'this was not happy and not good earlier': False,
}

test_data = {
  'this is happy': True,
  'i am good': True,
  'this is not happy': False,
  'i am not good': False,
  'this is not bad': True,
  'i am not sad': True,
  'i am very good': True,
  'this is very bad': False,
  'i am very sad': False,
  'this is bad not good': False,
  'this is good and happy': True,
  'i am not good and not happy': False,
  'i am not at all sad': True,
  'this is not at all good': False,
  'this is not at all bad': True,
  'this is good right now': True,
  'this is sad right now': False,
  'this is very bad right now': False,
  'this was good earlier': True,
  'i was not happy and not good earlier': False,
}

## MODEL

In [22]:
class LSTM:
    def __init__(self, input_size, output_size, hidden_size=64):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Ağırlıklar ve biaslar (He başlatma veya Xavier/Glorot başlatma daha iyi olabilir ama şimdilik basit tutalım)
        scale = 1000 # Orjinal RNN'deki gibi
        # Giriş kapısı (input gate)
        self.W_i = randn(hidden_size, input_size) / scale
        self.U_i = randn(hidden_size, hidden_size) / scale
        self.b_i = np.zeros((hidden_size, 1))

        # Unutma kapısı (forget gate) - Bias'ı pozitif başlatmak genellikle iyi bir pratiktir (unutmayı zorlaştırır)
        self.W_f = randn(hidden_size, input_size) / scale
        self.U_f = randn(hidden_size, hidden_size) / scale
        self.b_f = np.ones((hidden_size, 1)) # Unutma kapısı biasını 1'e ayarlamak öğrenmeye yardımcı olabilir

        # Çıkış kapısı (output gate)
        self.W_o = randn(hidden_size, input_size) / scale
        self.U_o = randn(hidden_size, hidden_size) / scale
        self.b_o = np.zeros((hidden_size, 1))

        # Hücre durumu (cell state)
        self.W_c = randn(hidden_size, input_size) / scale
        self.U_c = randn(hidden_size, hidden_size) / scale
        self.b_c = np.zeros((hidden_size, 1))

        # Çıkış katmanı ağırlıkları
        self.W_y = randn(output_size, hidden_size) / scale
        self.b_y = np.zeros((output_size, 1))

    def sigmoid(self, x):
        # Sayısal stabilite için klipleme eklenebilir: np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-np.clip(x, -50, 50))) # Klipleme eklendi

    def tanh(self, x):
        return np.tanh(np.clip(x, -25, 25)) # Klipleme eklendi

    def forward(self, inputs):
        h_prev = np.zeros((self.hidden_size, 1))
        c_prev = np.zeros((self.hidden_size, 1))

        self.last_inputs_sequence = inputs # Girdi dizisini sakla
        self.last_hs_sequence = {0: h_prev}
        self.last_cs_sequence = {0: c_prev}
        self.gate_is_sequence = {} # input gates
        self.gate_fs_sequence = {} # forget gates
        self.gate_os_sequence = {} # output gates
        self.gate_c_hats_sequence = {} # candidate cell states

        for t, x_t in enumerate(inputs):
            # Birleşik girdi (concat x_t and h_prev) yerine ayrı matris çarpımları kullanılıyor
            # Unutma kapısı
            f_t = self.sigmoid(self.W_f @ x_t + self.U_f @ h_prev + self.b_f)
            # Giriş kapısı
            i_t = self.sigmoid(self.W_i @ x_t + self.U_i @ h_prev + self.b_i)
            # Aday hücre durumu
            c_hat_t = self.tanh(self.W_c @ x_t + self.U_c @ h_prev + self.b_c)
            # Hücre durumu güncellemesi
            c_t = f_t * c_prev + i_t * c_hat_t
            # Çıkış kapısı
            o_t = self.sigmoid(self.W_o @ x_t + self.U_o @ h_prev + self.b_o)
            # Gizli durum güncellemesi
            h_t = o_t * self.tanh(c_t)

            # Sonraki adım için sakla
            self.last_hs_sequence[t + 1] = h_t
            self.last_cs_sequence[t + 1] = c_t
            self.gate_is_sequence[t] = i_t
            self.gate_fs_sequence[t] = f_t
            self.gate_os_sequence[t] = o_t
            self.gate_c_hats_sequence[t] = c_hat_t

            h_prev = h_t
            c_prev = c_t

        # Çıkış hesapla (sadece son gizli durumdan)
        y = self.W_y @ h_t + self.b_y
        return y, h_t

    def backprop(self, d_L_d_y, learn_rate=0.01): # Öğrenme oranını biraz daha düşük tutmak iyi olabilir
        n = len(self.last_inputs_sequence)

        # Gradyanları başlat (her dizi için sıfırlanır)
        d_W_i, d_U_i, d_b_i = np.zeros_like(self.W_i), np.zeros_like(self.U_i), np.zeros_like(self.b_i)
        d_W_f, d_U_f, d_b_f = np.zeros_like(self.W_f), np.zeros_like(self.U_f), np.zeros_like(self.b_f)
        d_W_o, d_U_o, d_b_o = np.zeros_like(self.W_o), np.zeros_like(self.U_o), np.zeros_like(self.b_o)
        d_W_c, d_U_c, d_b_c = np.zeros_like(self.W_c), np.zeros_like(self.U_c), np.zeros_like(self.b_c)

        # Çıkış katmanı gradyanları
        # d_L_d_y: d(Loss)/d(Output Layer Pre-Activation)
        # h_N: self.last_hs_sequence[n]
        d_W_y = d_L_d_y @ self.last_hs_sequence[n].T
        d_b_y = d_L_d_y.copy() # Toplam gradyan için

        # Bir sonraki (aslında önceki zaman adımı için) gizli durum ve hücre durumu gradyanları
        # dL/dh_N (son gizli durumun gradyanı)
        dL_dh_t = self.W_y.T @ d_L_d_y
        # dL/dc_N (son hücre durumunun gradyanı, başlangıçta sıfır çünkü c_N+1 yok)
        dL_dc_t = np.zeros_like(dL_dh_t)

        # Zaman içinde geriye doğru yayılım
        for t in reversed(range(n)):
            x_t = self.last_inputs_sequence[t]
            h_prev_t = self.last_hs_sequence[t] # h_{t-1}
            c_prev_t = self.last_cs_sequence[t] # c_{t-1}

            # İleri yayılımdan saklanan değerler (t zaman adımı için)
            i_val_t = self.gate_is_sequence[t]
            f_val_t = self.gate_fs_sequence[t]
            o_val_t = self.gate_os_sequence[t]
            c_hat_val_t = self.gate_c_hats_sequence[t]
            c_val_t = self.last_cs_sequence[t+1] # c_t

            # dL/dc_t hesaplamasına h_t üzerinden gelen katkıyı ekle
            # dL/dc_t = (dL/dh_t * dh_t/dc_t) + dL/dc_{t+1}*dc_{t+1}/dc_t (bu dL_dc_t olarak geliyor)
            dL_dc_t_combined = dL_dc_t + dL_dh_t * o_val_t * (1 - np.tanh(c_val_t)**2)

            # Çıkış kapısı gradyanları (o_t)
            # dL/do_t_input = dL/dh_t * tanh(c_t)
            # dL/do_t_pre_activation = dL/do_t_input * o_t * (1-o_t)
            dL_do_t_pre_act = (dL_dh_t * np.tanh(c_val_t)) * (o_val_t * (1 - o_val_t))
            d_W_o += dL_do_t_pre_act @ x_t.T
            d_U_o += dL_do_t_pre_act @ h_prev_t.T
            d_b_o += dL_do_t_pre_act

            # Aday hücre durumu gradyanları (c_hat_t)
            # dL/dc_hat_t_output = dL/dc_t_combined * i_t
            # dL/dc_hat_t_pre_activation = dL/dc_hat_t_output * (1 - c_hat_t^2)
            dL_dc_hat_t_pre_act = (dL_dc_t_combined * i_val_t) * (1 - c_hat_val_t**2)
            d_W_c += dL_dc_hat_t_pre_act @ x_t.T
            d_U_c += dL_dc_hat_t_pre_act @ h_prev_t.T
            d_b_c += dL_dc_hat_t_pre_act

            # Giriş kapısı gradyanları (i_t)
            # dL/di_t_input = dL/dc_t_combined * c_hat_t
            # dL/di_t_pre_activation = dL/di_t_input * i_t * (1-i_t)
            dL_di_t_pre_act = (dL_dc_t_combined * c_hat_val_t) * (i_val_t * (1 - i_val_t))
            d_W_i += dL_di_t_pre_act @ x_t.T
            d_U_i += dL_di_t_pre_act @ h_prev_t.T
            d_b_i += dL_di_t_pre_act

            # Unutma kapısı gradyanları (f_t)
            # dL/df_t_input = dL/dc_t_combined * c_{t-1}
            # dL/df_t_pre_activation = dL/df_t_input * f_t * (1-f_t)
            dL_df_t_pre_act = (dL_dc_t_combined * c_prev_t) * (f_val_t * (1 - f_val_t))
            d_W_f += dL_df_t_pre_act @ x_t.T
            d_U_f += dL_df_t_pre_act @ h_prev_t.T
            d_b_f += dL_df_t_pre_act

            # Bir önceki gizli durum (h_{t-1}) ve hücre durumu (c_{t-1}) için gradyanları hesapla
            # Bu gradyanlar, U matrisleri aracılığıyla geri yayılır
            dL_dh_prev_t = (self.U_f.T @ dL_df_t_pre_act +
                             self.U_i.T @ dL_di_t_pre_act +
                             self.U_c.T @ dL_dc_hat_t_pre_act +
                             self.U_o.T @ dL_do_t_pre_act)

            dL_dc_prev_t = dL_dc_t_combined * f_val_t # dL/dc_{t-1} = dL/dc_t * f_t

            # Bir sonraki (döngüde t-1) adım için gradyanları güncelle
            dL_dh_t = dL_dh_prev_t
            dL_dc_t = dL_dc_prev_t


        # Patlayan gradyanları önlemek için kırpma
        gradients = [d_W_i, d_U_i, d_b_i, d_W_f, d_U_f, d_b_f,
                       d_W_o, d_U_o, d_b_o, d_W_c, d_U_c, d_b_c, d_W_y, d_b_y]
        clip_value = 1.0 # Kırpma değeri
        for grad in gradients:
            np.clip(grad, -clip_value, clip_value, out=grad)

        # Ağırlıkları ve biasları güncelle (negatif gradyan yönünde)
        self.W_i -= learn_rate * d_W_i; self.U_i -= learn_rate * d_U_i; self.b_i -= learn_rate * d_b_i
        self.W_f -= learn_rate * d_W_f; self.U_f -= learn_rate * d_U_f; self.b_f -= learn_rate * d_b_f
        self.W_o -= learn_rate * d_W_o; self.U_o -= learn_rate * d_U_o; self.b_o -= learn_rate * d_b_o
        self.W_c -= learn_rate * d_W_c; self.U_c -= learn_rate * d_U_c; self.b_c -= learn_rate * d_b_c
        self.W_y -= learn_rate * d_W_y; self.b_y -= learn_rate * d_b_y

In [23]:
import numpy as np
import random

# Create the vocabulary.
vocab = list(set([w for text in train_data.keys() for w in text.split(' ')]))
vocab_size = len(vocab)
print('%d unique words found' % vocab_size)

# Assign indices to each word.
word_to_idx = { w: i for i, w in enumerate(vocab) }
idx_to_word = { i: w for i, w in enumerate(vocab) }
# print(word_to_idx['good'])
# print(idx_to_word[0])

def createInputs(text):
  '''
  Returns an array of one-hot vectors representing the words in the input text string.
  - text is a string
  - Each one-hot vector has shape (vocab_size, 1)
  '''
  inputs = []
  for w in text.split(' '):
    v = np.zeros((vocab_size, 1))
    v[word_to_idx[w]] = 1
    inputs.append(v)
  return inputs

def softmax(xs):
  # Applies the Softmax Function to the input array.
  return np.exp(xs) / sum(np.exp(xs))

# Initialize our RNN!
rnn = RNN(vocab_size, 2)

def processData(data, backprop=True):
  '''
  Returns the RNN's loss and accuracy for the given data.
  - data is a dictionary mapping text to True or False.
  - backprop determines if the backward phase should be run.
  '''
  items = list(data.items())
  random.shuffle(items)

  loss = 0
  num_correct = 0

  for x, y in items:
    inputs = createInputs(x)
    target = int(y)

    # Forward
    out, _ = rnn.forward(inputs)
    probs = softmax(out)

    # Calculate loss / accuracy
    loss -= np.log(probs[target])
    num_correct += int(np.argmax(probs) == target)

    if backprop:
      # Build dL/dy
      d_L_d_y = probs
      d_L_d_y[target] -= 1

      # Backward
      rnn.backprop(d_L_d_y)

  return loss / len(data), num_correct / len(data)

# Training loop
for epoch in range(1000):
  train_loss, train_acc = processData(train_data)

  if epoch % 100 == 99:
    print('--- Epoch %d' % (epoch + 1))
    print('Train:\tLoss %.3f | Accuracy: %.3f' % (train_loss, train_acc))

    test_loss, test_acc = processData(test_data, backprop=False)
    print('Test:\tLoss %.3f | Accuracy: %.3f' % (test_loss, test_acc))

18 unique words found
--- Epoch 100
Train:	Loss 0.689 | Accuracy: 0.552
Test:	Loss 0.699 | Accuracy: 0.500


  print('Train:\tLoss %.3f | Accuracy: %.3f' % (train_loss, train_acc))
  print('Test:\tLoss %.3f | Accuracy: %.3f' % (test_loss, test_acc))


--- Epoch 200
Train:	Loss 0.669 | Accuracy: 0.621
Test:	Loss 0.722 | Accuracy: 0.650
--- Epoch 300
Train:	Loss 0.643 | Accuracy: 0.690
Test:	Loss 0.961 | Accuracy: 0.550
--- Epoch 400
Train:	Loss 0.407 | Accuracy: 0.862
Test:	Loss 0.577 | Accuracy: 0.650
--- Epoch 500
Train:	Loss 0.328 | Accuracy: 0.828
Test:	Loss 0.705 | Accuracy: 0.650
--- Epoch 600
Train:	Loss 0.130 | Accuracy: 0.948
Test:	Loss 0.737 | Accuracy: 0.700
--- Epoch 700
Train:	Loss 0.016 | Accuracy: 1.000
Test:	Loss 0.340 | Accuracy: 0.900
--- Epoch 800
Train:	Loss 0.003 | Accuracy: 1.000
Test:	Loss 0.331 | Accuracy: 0.900
--- Epoch 900
Train:	Loss 0.002 | Accuracy: 1.000
Test:	Loss 0.355 | Accuracy: 0.900
--- Epoch 1000
Train:	Loss 0.001 | Accuracy: 1.000
Test:	Loss 0.378 | Accuracy: 0.900
