# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Prepare the Data
# Load dataframe
data = pd.read_csv('your_dataframe.csv')

# A missing cell is read as NaN (a float), which the tokenizer cannot
# lower-case or split, so rows lacking either text are dropped up front.
data = data.dropna(subset=['input_text', 'output_text'])

# Preprocess input and output texts
input_texts = data['input_text'].astype(str).values
# target_texts stays raw (no markers): it is also used for similarity scoring.
target_texts = data['output_text'].astype(str).values

# The decoder needs explicit sentence boundaries: '<start>' is the first token
# fed at inference time and '<end>' tells the decoding loop when to stop.
# Angle brackets already present in the data are turned into spaces (which is
# what the default Tokenizer filters would have done) so they cannot be
# confused with the markers.
angle_brackets_to_space = str.maketrans('<>', '  ')
decoder_texts = [
    '<start> ' + text.translate(angle_brackets_to_space) + ' <end>'
    for text in target_texts
]

# Tokenize input and output texts
tokenizer_input = Tokenizer()
tokenizer_input.fit_on_texts(input_texts)
input_sequences = tokenizer_input.texts_to_sequences(input_texts)

# Same characters as the Tokenizer default filter minus '<' and '>', so the
# '<start>' / '<end>' markers survive tokenization as single tokens.
target_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
tokenizer_target = Tokenizer(filters=target_filters)
tokenizer_target.fit_on_texts(decoder_texts)
target_sequences = tokenizer_target.texts_to_sequences(decoder_texts)

# Pad sequences
# Every sequence is right-padded with 0 up to the longest one in its set.
max_encoder_seq_length = max(len(seq) for seq in input_sequences)
max_decoder_seq_length = max(len(seq) for seq in target_sequences)
encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')

# Step 2: Define the Model
# One size is shared by the embedding vectors, both LSTM layers and the
# Dense projection applied to the attention output.
latent_dim = 256

# Encoder
# shape=(None,) accepts sequences of any length; the arrays fed in during
# training are the padded integer id sequences built by the tokenizers.
encoder_inputs = Input(shape=(None,))
# input_dim is the vocabulary size plus one — presumably to leave room for
# id 0 used as padding (Keras Tokenizer ids start at 1); confirm.
# NOTE(review): mask_zero is not set, so padded positions are processed like
# real tokens by the LSTM and the attention layer.
encoder_embedding = Embedding(input_dim=len(tokenizer_input.word_index)+1, output_dim=latent_dim)
# return_sequences=True exposes the per-step outputs (used by attention below);
# return_state=True exposes the final hidden and cell states.
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding(encoder_inputs))
# The final encoder states become the decoder's initial state.
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=len(tokenizer_target.word_index)+1, output_dim=latent_dim)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder's own final states are discarded here (only the sequence of
# outputs is used to build the training graph).
decoder_outputs, _, _ = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=encoder_states)
# Keras Attention is called with [query, value]: the decoder outputs query
# the encoder outputs.
attention_layer = Attention()
attention_output = attention_layer([decoder_outputs, encoder_outputs])
# NOTE(review): despite the name, nothing is concatenated — only the attention
# output is projected; the decoder LSTM outputs do not reach the softmax layer
# directly. Confirm whether a concatenation with decoder_outputs was intended.
decoder_concat_input = Dense(latent_dim, activation='tanh')(attention_output)
# One probability per target vocabulary id (plus id 0) at every decoder step.
decoder_dense = Dense(len(tokenizer_target.word_index)+1, activation='softmax')
# decoder_outputs is rebound: from here on it is the softmax output.
decoder_outputs = decoder_dense(decoder_concat_input)

# Define model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# The sparse loss takes integer class ids as targets (no one-hot encoding).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Step 3: Train the Model
# Teacher forcing: at step t the decoder is fed token t and must predict token
# t + 1. The targets are therefore the decoder inputs shifted one position to
# the left; the freed last position keeps the padding id 0. Training against
# the unshifted inputs would only teach the model to copy its current input.
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

# All three arrays are split together so rows stay aligned; with the same
# random_state the encoder/decoder input split is unchanged.
(encoder_input_train, encoder_input_val,
 decoder_input_train, decoder_input_val,
 decoder_target_train, decoder_target_val) = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data,
    test_size=0.2, random_state=42)

model.fit([encoder_input_train, decoder_input_train], decoder_target_train,
          validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
          batch_size=64, epochs=10)

# Step 4: Evaluate the Model
def generate_output_sentence(input_sentence):
    """Greedily decode an output sentence for ``input_sentence``.

    The sentence is tokenized and padded like the training inputs, encoded
    with ``encoder_model``, and then decoded one token at a time with
    ``decoder_model``, always taking the most probable next token.

    NOTE(review): ``encoder_model`` and ``decoder_model`` are not defined in
    the visible part of this file; they are expected to exist at module level.
    ``encoder_model.predict`` must return the two LSTM states and
    ``decoder_model.predict`` must return ``(token_probabilities, h, c)``.

    Args:
        input_sentence: Raw input text.

    Returns:
        The decoded tokens joined by single spaces ('' if nothing was decoded).

    Raises:
        KeyError: If the target tokenizer has no '<start>' token.
    """
    start_index = tokenizer_target.word_index.get('<start>')
    if start_index is None:
        raise KeyError(
            "'<start>' is not in tokenizer_target.word_index; the target "
            "tokenizer must be fitted on texts wrapped in '<start>' / '<end>' "
            "with '<' and '>' removed from its filters")

    input_sequence = tokenizer_input.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, maxlen=max_encoder_seq_length, padding='post')
    states_value = encoder_model.predict(input_sequence)

    # The decoder is primed with a length-1 sequence holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_index

    decoded_tokens = []
    # Same bound as before: stop once more than max_decoder_seq_length tokens
    # have been produced, in case '<end>' is never predicted.
    while len(decoded_tokens) <= max_decoder_seq_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + list(states_value))
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # index_word is the tokenizer's own id -> word map. Id 0 (padding) has
        # no entry, so .get() avoids a KeyError when the model predicts it.
        sampled_token = tokenizer_target.index_word.get(sampled_token_index)
        if sampled_token is None or sampled_token == '<end>':
            break
        decoded_tokens.append(sampled_token)

        # Feed the prediction and the updated states back in for the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    return ' '.join(decoded_tokens)

# Step 5: Similarity Calculation
def calculate_similarity(sentence, target_texts):
    """Score the output generated for ``sentence`` against each target text.

    ``cosine_similarity`` only accepts numeric vectors, so the generated
    sentence and every target text are first turned into TF-IDF bag-of-words
    vectors with the already fitted ``tokenizer_target``. Words outside its
    vocabulary are ignored.

    Args:
        sentence: Raw input text passed to ``generate_output_sentence``.
        target_texts: Iterable of texts to compare the generated output with.

    Returns:
        A list with one cosine similarity (float) per entry of
        ``target_texts``, in the same order. An all-zero vector (e.g. an empty
        generated sentence) scores 0.0. Empty ``target_texts`` gives ``[]``.
    """
    candidates = [str(text) for text in target_texts]
    if not candidates:
        return []

    generated_output = generate_output_sentence(sentence)
    # Row 0 is the generated sentence; rows 1.. are the candidates.
    vectors = tokenizer_target.texts_to_matrix([generated_output] + candidates, mode='tfidf')
    # One call compares the generated sentence with every candidate at once.
    similarity_scores = cosine_similarity(vectors[:1], vectors[1:])[0]
    return similarity_scores.tolist()

# Example usage
# Scores one hand-written sentence against every target text in the dataframe
# and prints the resulting list of similarities.
random_input_sentence = "Your random input sentence here"
similarity_scores = calculate_similarity(
    sentence=random_input_sentence,
    target_texts=target_texts,
)
print(similarity_scores)