<a href="https://colab.research.google.com/github/arashkhgit/NLP_task/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -nc (--no-clobber) skips the ~822 MB download when glove.6B.zip is already
# present, so re-running the notebook does not fetch the archive again.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip


--2023-08-29 15:48:49--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-08-29 15:48:49--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-08-29 15:48:50--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [2]:
# -n never overwrites files that were already extracted; without it a re-run
# stops at unzip's interactive "replace?" prompt, which a notebook cell
# cannot answer.
!unzip -n glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
import csv
from collections import Counter

import numpy as np

# Load the IMDB reviews and tokenise them (lower-cased, whitespace-split).
# Each entry of `data` is (list_of_tokens, sentiment).
#
# NOTE(review): the 'positive'/'negative' label spelling is assumed from the
# public IMDB CSV -- confirm against the actual file. Any other label value is
# kept unchanged, so a file that already stores 0/1 keeps working with the
# int(sentiment) calls made later in the notebook.
LABEL_TO_INT = {'positive': 1, 'negative': 0}

data = []
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted review text are parsed as part of the field.
with open('/content/IMDB Dataset.csv', 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # Skip header (tolerates an empty file)
    for row in csv_reader:
        if len(row) < 2:
            continue  # blank or truncated row: nothing usable in it
        review = row[0].lower().split()
        sentiment = LABEL_TO_INT.get(row[1].strip().lower(), row[1])
        data.append((review, sentiment))

# Build vocabulary: word -> number of occurrences over all reviews.
# Counter is a dict subclass, so len(vocab) and vocab[word] behave as before.
vocab = Counter(word for review, _ in data for word in review)

# Keep only the `vocab_size` most frequent words; every other word is dropped
# from the reviews when they are converted to index lists below.
vocab_size = 1000
common_words = sorted(vocab, key=vocab.get, reverse=True)[:vocab_size]
word_to_index = {word: idx for idx, word in enumerate(common_words)}

# Convert each review to the list of indices of its in-vocabulary words.
numerical_data = [
    ([word_to_index[word] for word in review if word in word_to_index], sentiment)
    for review, sentiment in data
]

# Load pretrained embeddings (GloVe)
embedding_path = '/content/glove.6B.50d.txt'  # Change this to match the downloaded file name
embedding_dim = 50

# Each line of the file is "<word> <v1> <v2> ...".
embeddings_index = {}
with open(embedding_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.strip().split()
        if not values:
            continue  # skip empty lines
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Create an embedding matrix for the limited vocabulary; rows of words without
# a pretrained vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in word_to_index.items():
    if word in embeddings_index:
        embedding_matrix[idx] = embeddings_index[word]

# Prepare training and testing data: first 80% for training, the rest for testing.
split_ratio = 0.8
split_idx = int(len(numerical_data) * split_ratio)
train_data = numerical_data[:split_idx]
test_data = numerical_data[split_idx:]

class MiniBatchLSTM:
    """Minimal NumPy LSTM-style sequence model trained with mini-batch SGD.

    The forget, input and output gates and the candidate cell value are each
    computed from the current input vector only (there are no hidden-to-gate
    weights); the cell state ``c`` carries information across time steps.
    ``forward`` returns the final hidden state, a ``(hidden_size, 1)`` array.

    ``output_size`` is stored but not used by any computation in this class.
    """

    # Names of the trainable parameter attributes, in initialisation order.
    _PARAM_NAMES = ("W_f", "W_i", "W_o", "W_c", "b_f", "b_i", "b_o", "b_c")

    def __init__(self, vocab_size, hidden_size, output_size):
        """Create the model with standard-normal random weights and biases.

        Args:
            vocab_size: Dimension of each input column vector.
            hidden_size: Number of hidden / cell units.
            output_size: Stored on the instance; currently unused.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Input-to-gate weights, one matrix per gate: (hidden_size, vocab_size).
        self.W_f = np.random.randn(hidden_size, vocab_size)
        self.W_i = np.random.randn(hidden_size, vocab_size)
        self.W_o = np.random.randn(hidden_size, vocab_size)
        self.W_c = np.random.randn(hidden_size, vocab_size)

        # Gate biases: (hidden_size, 1).
        self.b_f = np.random.randn(hidden_size, 1)
        self.b_i = np.random.randn(hidden_size, 1)
        self.b_o = np.random.randn(hidden_size, 1)
        self.b_c = np.random.randn(hidden_size, 1)

    def sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)), evaluated without overflow."""
        # exp is only ever applied to a non-positive number, so it cannot
        # overflow; the two branches are algebraically the same function.
        z = np.exp(-np.abs(x))
        return np.where(np.asarray(x) >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

    def tanh(self, x):
        """Element-wise hyperbolic tangent."""
        return np.tanh(x)

    def forward(self, input_sequence):
        """Run the sequence through the cell and return the final hidden state.

        Args:
            input_sequence: Iterable of ``(vocab_size, 1)`` column vectors,
                one per time step.

        Returns:
            The hidden state after the last step, shape ``(hidden_size, 1)``
            (all zeros for an empty sequence).
        """
        h, _ = self._forward_with_cache(input_sequence)
        return h

    def _forward_with_cache(self, input_sequence):
        """Forward pass that also records, per step, what the backward pass needs."""
        h = np.zeros((self.hidden_size, 1))
        c = np.zeros((self.hidden_size, 1))
        steps = []

        for x in input_sequence:
            f_t = self.sigmoid(np.dot(self.W_f, x) + self.b_f)
            i_t = self.sigmoid(np.dot(self.W_i, x) + self.b_i)
            o_t = self.sigmoid(np.dot(self.W_o, x) + self.b_o)
            g_t = self.tanh(np.dot(self.W_c, x) + self.b_c)

            c_prev = c
            c = f_t * c_prev + i_t * g_t
            h = o_t * self.tanh(c)
            steps.append((x, f_t, i_t, o_t, g_t, c_prev, c))

        return h, steps

    def _one_hot_sequence(self, review):
        """Turn a list of word indices into one one-hot column vector per word.

        Indices outside ``[0, vocab_size)`` are skipped.
        """
        sequence = []
        for idx in review:
            if 0 <= idx < self.vocab_size:
                x = np.zeros((self.vocab_size, 1))
                x[idx] = 1.0
                sequence.append(x)
        return sequence

    def _accumulate_gradients(self, steps, dh, grads):
        """Add d(loss)/d(parameter) for one sample into ``grads`` (in place).

        Args:
            steps: Per-step cache produced by ``_forward_with_cache``.
            dh: Gradient of the loss w.r.t. the final hidden state.
            grads: Dict of accumulator arrays keyed by parameter name.
        """
        if not steps:
            return  # empty sequence: the output does not depend on any parameter

        # Only the last step's output gate reaches the returned hidden state,
        # because h is never fed back into the gates.
        x, _, _, o_t, _, _, c = steps[-1]
        tanh_c = np.tanh(c)
        da_o = dh * tanh_c * o_t * (1.0 - o_t)
        grads["W_o"] += np.dot(da_o, x.T)
        grads["b_o"] += da_o

        # Gradient w.r.t. the cell state, propagated backwards through time.
        dc = dh * o_t * (1.0 - tanh_c ** 2)
        for x, f_t, i_t, _, g_t, c_prev, _ in reversed(steps):
            da_f = dc * c_prev * f_t * (1.0 - f_t)
            da_i = dc * g_t * i_t * (1.0 - i_t)
            da_g = dc * i_t * (1.0 - g_t ** 2)

            grads["W_f"] += np.dot(da_f, x.T)
            grads["b_f"] += da_f
            grads["W_i"] += np.dot(da_i, x.T)
            grads["b_i"] += da_i
            grads["W_c"] += np.dot(da_g, x.T)
            grads["b_c"] += da_g

            dc = dc * f_t  # c_t depends on c_{t-1} only through f_t * c_{t-1}

    def train(self, train_data, num_epochs, learning_rate, batch_size):
        """Train with mini-batch gradient descent on a squared-error loss.

        The per-sample loss is ``sum((h - int(sentiment)) ** 2)`` where ``h``
        is the final hidden state. Gradients are averaged over each batch.
        The mean loss of every epoch is printed.

        Args:
            train_data: Sequence of ``(review, sentiment)`` pairs; ``review``
                is a list of word indices and ``sentiment`` must be
                convertible with ``int()``. The sequence is not modified.
            num_epochs: Number of passes over ``train_data``.
            learning_rate: Step size of each parameter update.
            batch_size: Number of samples per update; must be >= 1.

        Returns:
            List with the mean per-sample loss of each epoch (0.0 for an
            empty ``train_data``).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        n_samples = len(train_data)
        history = []

        for epoch in range(num_epochs):
            # Visit the samples in random order without reordering the
            # caller's list.
            order = np.random.permutation(n_samples)
            total_loss = 0.0

            for batch_start in range(0, n_samples, batch_size):
                batch_indices = order[batch_start:batch_start + batch_size]
                grads = {name: np.zeros_like(getattr(self, name))
                         for name in self._PARAM_NAMES}

                for sample_idx in batch_indices:
                    review, sentiment = train_data[sample_idx]
                    target = int(sentiment)

                    # Forward pass
                    input_sequence = self._one_hot_sequence(review)
                    hidden, steps = self._forward_with_cache(input_sequence)

                    # Loss and gradient calculations
                    error = hidden - target
                    total_loss += float(np.sum(error ** 2))
                    self._accumulate_gradients(steps, 2 * error, grads)

                # Update weights with the batch-averaged gradient
                step = learning_rate / len(batch_indices)
                for name in self._PARAM_NAMES:
                    setattr(self, name, getattr(self, name) - step * grads[name])

            mean_loss = total_loss / n_samples if n_samples else 0.0
            history.append(mean_loss)
            print(f"Epoch {epoch+1}, Loss: {mean_loss}")

        return history


# Train the MiniBatchLSTM model
SEED = 42
np.random.seed(SEED)  # make weight initialisation and shuffling reproducible

# The reviews in train_data / test_data are encoded with indices into
# word_to_index, so the model's input dimension is the size of that limited
# vocabulary, not the number of distinct raw tokens (len(vocab)).
vocab_size = len(word_to_index)
hidden_size = 32
output_size = 1
learning_rate = 0.01
num_epochs = 10
batch_size = 16

minibatch_lstm = MiniBatchLSTM(vocab_size, hidden_size, output_size)
minibatch_lstm.train(train_data, num_epochs, learning_rate, batch_size)


def one_hot_sequence(indices, size):
    """Return one (size, 1) one-hot column vector per in-range index, in order."""
    sequence = []
    for idx in indices:
        if 0 <= idx < size:
            vector = np.zeros((size, 1))
            vector[idx] = 1.0
            sequence.append(vector)
    return sequence


# Evaluate and test
correct_predictions = 0

for review, sentiment in test_data:
    input_sequence = one_hot_sequence(review, vocab_size)

    # forward() returns one activation per hidden unit; average them so the
    # 0.5 threshold is applied to a single score instead of an array.
    score = float(np.mean(minibatch_lstm.forward(input_sequence)))
    predicted_sentiment = 1 if score > 0.5 else 0

    if predicted_sentiment == int(sentiment):
        correct_predictions += 1

# Guard against an empty test split instead of dividing by zero.
accuracy = correct_predictions / len(test_data) if test_data else 0.0
print("Accuracy:", accuracy)
