## Import Libraries

In [26]:
# Import necessary libraries
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Reshape, GlobalAveragePooling1D, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model


## Load Embedding Model

In [27]:
# Load the pre-trained embedding model
embedding_model = tf.keras.models.load_model('../saved_model/base_model_saved/base_model_02/flexible_embedding_model.h5')


# Display the model summary
embedding_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          1465950   
                                                                 
 global_average_pooling1d (G  (None, 50)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 64)                3264      
                                                                 
 dense_1 (Dense)             (None, 29319)             1905735   
                                                                 
Total params: 3,374,949
Trainable params: 3,374,949
Non-trainable params: 0
_________________________________________________________________


## Read and Clean Text Files

In [15]:
# Function to read and clean text files from a folder
def read_text_files(folder_path):
    texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                content = file.read().strip()
                if content:  # Check if the file is not empty
                    cleaned_content = clean_text(content)
                    texts.append(cleaned_content)
    return texts

def clean_text(text):
    # Remove unwanted characters
    unwanted_chars = ['*', '#', '_', ')', '(', '!', '?', '.', ',', '-']
    for char in unwanted_chars:
        text = text.replace(char, '')
    return text

# Read texts from the new dataset
folder_path = '../Dataset/nlp_dataset'
texts = read_text_files(folder_path)


## Preprocess Text Data

In [28]:
# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

def preprocess_text(text):
    # Convert text to sequence of numbers
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=100)  # Adjust maxlen as needed
    return padded_sequence


## Split Dataset

In [17]:
# Split dataset into train, validation, and test sets
train_texts = texts[:int(0.8 * len(texts))]
val_texts = texts[int(0.8 * len(texts)):int(0.9 * len(texts))]
test_texts = texts[int(0.9 * len(texts)):]

# Example labels (assuming all labels are 1, adjust as per your dataset)
train_labels = [1] * len(train_texts)
val_labels = [1] * len(val_texts)


## Define Attention Layer

In [31]:
# Define Attention Layer
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], input_shape[-1]), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[-1],), initializer='zeros', trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        score = tf.nn.tanh(tf.tensordot(inputs, self.W, axes=[2, 0]) + self.b)
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * inputs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector


## Define New Model

In [32]:
# Define new flexible model with LSTM and Attention using Functional API
input_layer = Input(shape=(100,), name='input_layer')  # Adjust input shape to match the padded sequence length

# Use the embedding layer from the loaded model
embedding_output = embedding_model.layers[0](input_layer)

# Add LSTM layers
lstm_output = LSTM(128, return_sequences=True, name='lstm_1')(embedding_output)
lstm_output = LSTM(64, return_sequences=True, name='lstm_2')(lstm_output)

# Add Attention layer
attention_output = AttentionLayer(name='attention_layer')(lstm_output)

# Add Global Average Pooling for more flexibility
global_avg_pool = GlobalAveragePooling1D(name='global_avg_pooling')(attention_output)

# Add Dense layer for classification task
dense_output = Dense(64, activation='relu', name='dense_1')(global_avg_pool)
output_layer = Dense(1, activation='sigmoid', name='output_layer')(dense_output)  # Example for binary classification

# Create model
flexible_model = Model(inputs=input_layer, outputs=output_layer)

# Compile model
flexible_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
flexible_model.summary()


ValueError: Input 0 of layer "global_avg_pooling" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 64)

## Convert Text to Embedding

In [24]:
# Function to convert text to embedding
def text_to_embedding(text, model):
    processed_text = preprocess_text(text)
    embedding = model.predict(processed_text)
    return embedding

# Convert dataset to embeddings
train_embeddings = [text_to_embedding(text, embedding_model) for text in train_texts]
val_embeddings = [text_to_embedding(text, embedding_model) for text in val_texts]

# Convert embeddings to numpy array and add sequence length dimension
train_embeddings = np.expand_dims(np.array(train_embeddings), axis=1)
val_embeddings = np.expand_dims(np.array(val_embeddings), axis=1)

# Convert labels to numpy array
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)


## Train New Model

In [25]:
# Train the new model
new_model.fit(train_embeddings, train_labels, epochs=10, validation_data=(val_embeddings, val_labels))


Epoch 1/10


ValueError: in user code:

    File "C:\Users\gabri\anaconda3\envs\myenv\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\gabri\anaconda3\envs\myenv\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\gabri\anaconda3\envs\myenv\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\gabri\anaconda3\envs\myenv\lib\site-packages\keras\engine\training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\gabri\anaconda3\envs\myenv\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\gabri\anaconda3\envs\myenv\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "model_1" is incompatible with the layer: expected shape=(None, 100), found shape=(None, 1, 1, 29319)


## Cell 10: Evaluate and Predict