<a href="https://colab.research.google.com/github/Ariiiff/NLP_Practices/blob/main/simple_lstm_example2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.models import Sequential
%matplotlib inline

In [7]:
# toy corpus: each sentence is paired with its sentiment flag (1 = positive, 0 = negative)
_labelled_examples = [
    ('This is a positive sentence.', 1),
    ('This is a negative sentence.', 0),
    ('Another positive sentence here.', 1),
    ('And another negative sentence.', 0),
]

# split the pairs into the parallel containers used by the rest of the notebook
texts = [sentence for sentence, _ in _labelled_examples]
labels = np.array([flag for _, flag in _labelled_examples])

In [8]:
# tokenize the texts
# Upper bound on the vocabulary: the tokenizer only emits word indices below this value,
# so the Embedding layer's input dimension must be at least this large.
MAX_VOCAB_SIZE = 1000

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(texts)  # builds the word -> index vocabulary from the corpus
# encode every sentence as a list of integer word indices
sequences = tokenizer.texts_to_sequences(texts)

In [18]:
# display the integer-encoded sentences (one list of word indices per text)
sequences

[[2, 3, 4, 5, 1], [2, 3, 4, 6, 1], [7, 5, 1, 8], [9, 7, 6, 1]]

In [9]:
# Bring every sequence to a common length so they stack into one 2-D array:
# the target length is that of the longest encoded sentence.
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [19]:
# display the padded array; shorter sequences are filled with leading zeros
padded_sequences

array([[2, 3, 4, 5, 1],
       [2, 3, 4, 6, 1],
       [0, 7, 5, 1, 8],
       [0, 9, 7, 6, 1]], dtype=int32)

### Model
##### Sequential: A sequential model is a linear stack of layers. In this case, we will add several layers to the model, one after the other.
##### Embedding: An embedding layer maps discrete values (in this case, words represented by integers) to a continuous vector space. In this case, we will use an embedding layer with an input dimension (vocabulary size) of 1000, an output dimension of 64, and a fixed input length of max_length.
##### Dropout: A dropout layer is a regularization technique that randomly sets some of its inputs to zero during training, which can help prevent overfitting. In this case, we will use a dropout rate of 0.5.
##### LSTM: An LSTM layer is a type of recurrent neural network (RNN) layer that can maintain long-term dependencies in the data. In this case, we will use an LSTM layer with 32 units.
##### Dense: A dense layer is a fully connected layer that applies a linear transformation to its input, optionally followed by an activation function. In this case, we will use a dense layer with a single unit and a sigmoid activation function, which will output a probability between 0 and 1 representing the predicted class label.

In [10]:
# define the model
# Size the embedding table from the tokenizer instead of repeating a literal, so the
# two cannot drift apart: the tokenizer only emits indices below num_words. If no cap
# was set (num_words is None), fall back to the full vocabulary plus the padding index 0.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

# NOTE(review): the padding index 0 is fed to the LSTM like a real token because the
# Embedding layer is created without mask_zero=True — consider enabling it.
model = Sequential([
    Embedding(vocab_size, 64, input_length=max_length),  # word index -> 64-d vector
    Dropout(0.5),                                        # drop half the inputs during training
    LSTM(32),                                            # recurrent layer with 32 units
    Dense(1, activation='sigmoid'),                      # single probability in [0, 1]
])

In [11]:
# Configure training: binary cross-entropy matches the single sigmoid output,
# Adam is the optimizer, and accuracy is reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [12]:
# Train for 10 passes over the data and keep the object returned by fit().
# With only four samples, the batch size of 32 means each epoch is a single batch.
history = model.fit(
    x=padded_sequences,
    y=labels,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# Score the fitted model on the very same samples it was trained on
# (this measures fit to the training data, not generalization).
loss, accuracy = model.evaluate(x=padded_sequences, y=labels)
print('Training accuracy:', accuracy)

Training accuracy: 1.0


In [16]:
# Run the trained model on sentences it has not seen, reusing the fitted
# tokenizer and the training-time sequence length for encoding and padding.
new_texts = [
    'This is a very positive sentence.',
    'I hate this negative sentence.',
]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length)
predictions = model.predict(x=new_padded_sequences)




In [17]:
# Print the predicted labels
# The model ends in a single sigmoid unit, so each text gets one score; flatten the
# prediction array so every score is a plain scalar, then threshold it at 0.5
# (above 0.5 -> positive, otherwise negative).
for text, score in zip(new_texts, predictions.ravel()):
    sentiment = 'positive' if score > 0.5 else 'negative'
    print('{}: {}'.format(text, sentiment))

This is a very positive sentence.: positive
I hate this negative sentence.: negetive
