In [1]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [2]:
# Read the cleaned corpus from disk; later cells use its 'nlp_statements'
# and 'ltl_formulas' columns.
dataset_path = 'cleaned_dataset.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Fit a word-index vocabulary on the natural-language statements.
# num_words=5000 matches the Embedding input_dim used when the model is built.
statement_texts = df['nlp_statements']
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(statement_texts)

In [4]:
# Encode every statement as a list of word indices, then force each one to a
# fixed length of 100. padding/truncating are spelled out with their 'pre'
# defaults: short sequences get leading zeros, long ones keep their tail.
sequences = tokenizer.texts_to_sequences(df['nlp_statements'])
padded_sequences = pad_sequences(
    sequences,
    maxlen=100,
    padding='pre',
    truncating='pre',
)

In [5]:

# Treat every distinct LTL formula string as its own class: map formulas to
# integer ids, then expand the ids into one-hot rows (one column per class).
le = LabelEncoder()
labels = le.fit_transform(df['ltl_formulas'])
one_hot_labels = to_categorical(labels, num_classes=len(le.classes_))

In [6]:
# Split the dataset: hold out 20% of the samples. random_state pins the
# shuffle so the same split is produced on every run.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    padded_sequences,
    one_hot_labels,
    test_size=0.2,
    random_state=42,
)

# LSTM model: word indices -> 128-d embeddings -> 64-unit LSTM -> softmax
# with one output per one-hot label column.
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# the argument (it only emits a warning), and the sequence length is inferred
# from the data the first time the model is called.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    LSTM(units=64, dropout=0.2),
    Dense(one_hot_labels.shape[1], activation='softmax'),
])



In [7]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 75 epochs in mini-batches of 32, evaluating on the held-out split
# after every epoch.
# NOTE(review): the test split doubles as validation data here, so no
# untouched set remains for a final evaluation. In the logged epochs val_loss
# rises while val_accuracy stays at 0 -- confirm this setup is intended.
model.fit(
    train_sequences,
    train_labels,
    batch_size=32,
    epochs=75,
    validation_data=(test_sequences, test_labels),
)

Epoch 1/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 96ms/step - accuracy: 0.0000e+00 - loss: 6.5238 - val_accuracy: 0.0000e+00 - val_loss: 6.5301
Epoch 2/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.0138 - loss: 6.5128 - val_accuracy: 0.0000e+00 - val_loss: 6.5521
Epoch 3/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 72ms/step - accuracy: 0.0415 - loss: 6.4964 - val_accuracy: 0.0000e+00 - val_loss: 7.0720
Epoch 4/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.0000e+00 - loss: 6.4241 - val_accuracy: 0.0000e+00 - val_loss: 7.0359
Epoch 5/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step - accuracy: 0.0061 - loss: 6.3893 - val_accuracy: 0.0000e+00 - val_loss: 7.7027
Epoch 6/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.0127 - loss: 6.3159 - val_accuracy: 0.0000e+00 - val_loss: 8.2160
Ep

<keras.src.callbacks.history.History at 0x23b7d8c15d0>

In [8]:
# Use the trained model to predict LTL formulas for new NLP statements
def convert_nlp_to_ltl(nlp_statements):
    """Predict one LTL formula for each natural-language statement.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``le`` objects
    created in earlier cells.

    Parameters
    ----------
    nlp_statements : str or iterable of str
        A single statement, or a collection of statements. A bare string is
        treated as one statement.

    Returns
    -------
    numpy.ndarray
        One decoded label per input statement, in input order. Empty input
        yields an empty array without calling the model.
    """
    # A bare string would be iterated character by character by the
    # tokenizer, producing one bogus prediction per character.
    if isinstance(nlp_statements, str):
        statements = [nlp_statements]
    else:
        statements = list(nlp_statements)

    # Nothing to predict: skip the model, which is not guaranteed to accept
    # an empty batch.
    if not statements:
        return np.array([], dtype=object)

    encoded = tokenizer.texts_to_sequences(statements)
    # Same fixed length as the training inputs.
    padded = pad_sequences(encoded, maxlen=100)
    class_probabilities = model.predict(padded)
    # Highest-probability class per row, mapped back to its formula string.
    best_class_ids = np.argmax(class_probabilities, axis=1)
    return le.inverse_transform(best_class_ids)

print("done")

done


In [14]:
# Smoke-test the helper on one unseen request and show the decoded formula.
new_nlp_statements = ['can you check for new updates']
predicted_ltl_formulas = convert_nlp_to_ltl(new_nlp_statements)
first_formula = predicted_ltl_formulas[0]
print(first_formula)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
 "G(!software_update -> check)"


In [10]:
import pickle

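# Persist the fitted tokenizer so inference code can reuse the same vocabulary.
# Currently disabled; uncomment the two lines below to write tokenizer.pickle.
#with open('tokenizer.pickle', 'wb') as handle:
    #pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)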


In [11]:
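# Persist the fitted LabelEncoder so predicted class ids can be mapped back to
# LTL formulas outside this notebook.
# Currently disabled; uncomment the two lines below to write label_encoder.pickle.
#with open('label_encoder.pickle', 'wb') as handle:
    #pickle.dump(le, handle, protocol=pickle.HIGHEST_PROTOCOL)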


In [12]:
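# Save the trained LSTM model (currently disabled).
# Note: '.h5' is the legacy HDF5 format; recent Keras releases recommend the
# native '.keras' extension instead.
#model.save('lstm_model.h5')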
