In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [None]:
# Load the labelled sentences; column names are supplied explicitly via `names`
data = pd.read_csv(
    'data.csv',
    encoding='latin-1',
    names=['Label', 'Text'],
)

In [None]:
# Display basic info about the data
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

      Label                                               Text
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   4846 non-null   object
 1   Text    4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB
None


In [None]:
# Preprocessing functions
def get_sequences(texts):
    """Tokenize ``texts`` and pad every sequence to the longest one.

    Args:
        texts: Iterable of raw strings; it is used both to fit the tokenizer
            and to produce the encoded sequences.

    Returns:
        A 4-tuple ``(padded_sequences, tokenizer, max_seq_length,
        vocab_length)`` where ``padded_sequences`` is padded at the end
        (``padding='post'``) up to ``max_seq_length``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    encoded = tokenizer.texts_to_sequences(texts)

    # One slot more than the number of distinct words in the fitted index.
    vocab_length = 1 + len(tokenizer.word_index)
    max_seq_length = np.max([len(seq) for seq in encoded])

    print("Vocabulary length:", vocab_length)
    print("Maximum sequence length:", max_seq_length)

    return (
        pad_sequences(encoded, maxlen=max_seq_length, padding='post'),
        tokenizer,
        max_seq_length,
        vocab_length,
    )

In [None]:
def preprocess_inputs(df):
    """Encode the sentiment labels and build padded train/test sequences.

    Args:
        df: DataFrame with a ``'Label'`` column holding ``'negative'``,
            ``'neutral'`` or ``'positive'`` (values already encoded as
            0/1/2 are accepted too) and a ``'Text'`` column. The frame is
            not modified.

    Returns:
        ``(train_sequences, test_sequences, y_train, y_test, tokenizer,
        max_seq_length, vocab_length)`` using a 70/30 shuffled split with
        ``random_state=1``.

    Raises:
        ValueError: If ``'Label'`` contains a value outside the mapping.
    """
    label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

    # Build a new Series instead of assigning back into `df`: the caller's
    # frame stays untouched, and we no longer depend on Series.replace
    # silently downcasting object -> int (which emits a FutureWarning).
    labels = df['Label'].map(lambda value: label_mapping.get(value, value))

    # Fail early on anything that is not a known class rather than letting
    # stray strings/NaN reach the loss function.
    unknown = labels[~labels.isin(list(label_mapping.values()))]
    if not unknown.empty:
        raise ValueError(
            f"Unexpected label values: {sorted(set(map(str, unknown)))}"
        )
    labels = labels.astype('int64')

    # NOTE(review): the tokenizer is fitted on the full corpus before the
    # split, so test-set vocabulary is visible during fitting. Left as-is
    # to keep the vocabulary/sequence sizes unchanged.
    sequences, tokenizer, max_seq_length, vocab_length = get_sequences(df['Text'])
    train_sequences, test_sequences, y_train, y_test = train_test_split(
        sequences, labels, train_size=0.7, shuffle=True, random_state=1
    )
    return train_sequences, test_sequences, y_train, y_test, tokenizer, max_seq_length, vocab_length

In [None]:
# Run the full preprocessing pipeline on the loaded frame and unpack its outputs
(
    train_sequences,
    test_sequences,
    y_train,
    y_test,
    tokenizer,
    max_seq_length,
    vocab_length,
) = preprocess_inputs(data)


  df['Label'] = df['Label'].replace(label_mapping)


Vocabulary length: 10123
Maximum sequence length: 71


In [None]:
# Define LSTM-based model
def create_lstm_model(input_length, vocab_size):
    """Build and compile a single-layer LSTM classifier with 3 output classes.

    Args:
        input_length: Length of each (padded) input sequence.
        vocab_size: Number of rows in the embedding table.

    Returns:
        A compiled ``tf.keras.Model`` whose output is a 3-way softmax, using
        Adam and sparse categorical cross-entropy.
    """
    inputs = tf.keras.Input(shape=(input_length,))
    # The sequences are zero-padded at the end (padding='post'), and this
    # model keeps only the LSTM's last state. Without masking, that state is
    # computed after a run of padding steps; the recorded run of the unmasked
    # model never moved off val_accuracy 0.5700. mask_zero=True makes the
    # LSTM skip the padded positions.
    # `input_length` is no longer passed to Embedding: the Input layer already
    # fixes the sequence length and the argument is deprecated in Keras 3.
    x = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=128,
        mask_zero=True,
    )(inputs)
    x = tf.keras.layers.LSTM(256, return_sequences=False, activation='tanh')(x)
    outputs = tf.keras.layers.Dense(3, activation='softmax')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [None]:
# Build the baseline model and fit it on the training split
model = create_lstm_model(max_seq_length, vocab_length)

# epochs=100 is only an upper bound: EarlyStopping halts training once
# val_loss has not improved for 3 epochs and restores the best weights.
history = model.fit(
    x=train_sequences,
    y=y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=3, restore_best_weights=True
        ),
    ],
)



Epoch 1/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5799 - loss: 0.9378 - val_accuracy: 0.5700 - val_loss: 0.9592
Epoch 2/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.6097 - loss: 0.9066 - val_accuracy: 0.5700 - val_loss: 0.9579
Epoch 3/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6040 - loss: 0.9130 - val_accuracy: 0.5700 - val_loss: 0.9671
Epoch 4/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6095 - loss: 0.9116 - val_accuracy: 0.5700 - val_loss: 0.9632
Epoch 5/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6027 - loss: 0.9246 - val_accuracy: 0.5700 - val_loss: 0.9579


In [None]:
# Score the baseline model on the held-out test split
results = model.evaluate(test_sequences, y_test, verbose=0)

# results[0] is the loss, results[1] the compiled 'accuracy' metric
print("Test Loss: {:.5f}".format(results[0]))
print("Test Accuracy: {:.2f}%".format(results[1] * 100))

Test Loss: 0.93571
Test Accuracy: 58.46%


In [None]:
## Rework the LSTM model to improve accuracy: keep the LSTM output for every timestep and flatten it before the classifier

In [None]:
def create_lstm_model(input_length, vocab_length):
    """Build and compile an LSTM classifier that uses every timestep's output.

    The LSTM returns its output at each timestep; those outputs are flattened
    and fed to a 3-way softmax layer.

    NOTE: this redefines ``create_lstm_model`` and shadows the earlier
    definition of the same name for all cells executed after it.

    Args:
        input_length: Length of each (padded) input sequence.
        vocab_length: Number of rows in the embedding table.

    Returns:
        A compiled ``tf.keras.Model`` using Adam and sparse categorical
        cross-entropy, reporting accuracy.
    """
    inputs = tf.keras.Input(shape=(input_length,))
    # `input_length` is not passed to Embedding: the Input layer already fixes
    # the sequence length and the argument is deprecated in Keras 3.
    x = tf.keras.layers.Embedding(
        input_dim=vocab_length,
        output_dim=128
    )(inputs)
    x = tf.keras.layers.LSTM(
        256,
        return_sequences=True,
        activation='tanh'
    )(x)
    # Collapse (timesteps, units) into one vector per sample for the classifier.
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(3, activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [None]:
# Build the reworked LSTM model and fit it on the training split
lstm_model = create_lstm_model(max_seq_length, vocab_length)

# epochs=100 is only an upper bound: EarlyStopping halts training once
# val_loss has not improved for 3 epochs and restores the best weights.
history = lstm_model.fit(
    x=train_sequences,
    y=y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=3, restore_best_weights=True
        ),
    ],
)

Epoch 1/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.5774 - loss: 0.9027 - val_accuracy: 0.6244 - val_loss: 0.8415
Epoch 2/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7794 - loss: 0.5250 - val_accuracy: 0.6745 - val_loss: 0.8275
Epoch 3/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9429 - loss: 0.1741 - val_accuracy: 0.6863 - val_loss: 1.0620
Epoch 4/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9824 - loss: 0.0613 - val_accuracy: 0.6863 - val_loss: 1.0372
Epoch 5/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9916 - loss: 0.0366 - val_accuracy: 0.6848 - val_loss: 1.3117


In [None]:
# Score the reworked LSTM model on the held-out test split
results = lstm_model.evaluate(test_sequences, y_test, verbose=0)

# results[0] is the loss, results[1] the compiled 'accuracy' metric
print(f"LSTM Model Test Loss: {results[0]:.5f}")
print(f"LSTM Model Test Accuracy: {results[1] * 100:.2f}%")

LSTM Model Test Loss: 0.72925
LSTM Model Test Accuracy: 72.90%


In [None]:
# Function to create GRU-based model
def create_gru_model(input_length, vocab_length):
    """Build and compile a GRU classifier with 3 output classes.

    Args:
        input_length: Length of each (padded) input sequence.
        vocab_length: Number of rows in the embedding table.

    Returns:
        A compiled ``tf.keras.Model`` using Adam and sparse categorical
        cross-entropy, reporting accuracy.
    """
    inputs = tf.keras.Input(shape=(input_length,))
    # `input_length` is not passed to Embedding: the Input layer already fixes
    # the sequence length and the argument is deprecated in Keras 3.
    x = tf.keras.layers.Embedding(input_dim=vocab_length, output_dim=128)(inputs)
    x = tf.keras.layers.GRU(256, return_sequences=True, activation='tanh')(x)
    # With return_sequences=True the GRU output is 3-D; without flattening,
    # the Dense layer would emit one prediction per timestep, which does not
    # match the one-label-per-sample targets used for training.
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(3, activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [None]:
# Create and train the GRU model
inputs = tf.keras.Input(shape=(train_sequences.shape[1],))
# Use the vocabulary size computed during preprocessing instead of a
# hard-coded 10123, so the embedding table stays in sync with the tokenizer
# if the dataset changes. `input_length` is not passed: the Input layer
# already fixes the sequence length and the argument is deprecated in Keras 3.
x = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=128
)(inputs)
x = tf.keras.layers.GRU(256, return_sequences=True, activation='tanh')(x)
# Collapse (timesteps, units) into one vector per sample for the classifier.
x = tf.keras.layers.Flatten()(x)
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)

gru_model = tf.keras.Model(inputs=inputs, outputs=outputs)
gru_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# epochs=100 is only an upper bound: EarlyStopping halts training once
# val_loss has not improved for 3 epochs and restores the best weights.
history = gru_model.fit(
    x=train_sequences,
    y=y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=3, restore_best_weights=True
        ),
    ],
)

Epoch 1/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6068 - loss: 0.8961 - val_accuracy: 0.6495 - val_loss: 0.8109
Epoch 2/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7791 - loss: 0.4914 - val_accuracy: 0.6951 - val_loss: 0.7637
Epoch 3/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9549 - loss: 0.1471 - val_accuracy: 0.7054 - val_loss: 0.9028
Epoch 4/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9867 - loss: 0.0536 - val_accuracy: 0.7099 - val_loss: 1.0840
Epoch 5/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9977 - loss: 0.0168 - val_accuracy: 0.6848 - val_loss: 1.3765


In [None]:
# Score the GRU model on the held-out test split
results = gru_model.evaluate(test_sequences, y_test, verbose=0)

# results[0] is the loss, results[1] the compiled 'accuracy' metric
print(f"GRU Model Test Loss: {results[0]:.5f}")
print(f"GRU Model Test Accuracy: {results[1] * 100:.2f}%")

GRU Model Test Loss: 0.64034
GRU Model Test Accuracy: 74.00%


In [None]:
# Persist the reworked LSTM model to disk as "lstm_model.keras".
# NOTE(review): in the recorded run the GRU model scored higher on the test
# set (74.00% vs 72.90%) — confirm the LSTM is the model intended for export.
lstm_model.save("lstm_model.keras")

In [None]:
import pickle

In [None]:
# Pickle the fitted tokenizer so the same word index can be reloaded later.
# NOTE: only unpickle this file from a trusted source — pickle.load can
# execute arbitrary code.
with open("tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file)


In [None]:
# google.colab is only importable inside a Colab runtime.
from google.colab import files
files.download("lstm_model.keras")  # must match the name passed to lstm_model.save(); no .h5 file is written in the cells shown


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the pickled tokenizer; `files` comes from the google.colab import
# in the previous cell, so that cell must have been run first.
files.download("tokenizer.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>