<a href="https://colab.research.google.com/github/ArkS0001/RAG/blob/main/RAG_Memory_Hallucination.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Memorization:** The LSTM accurately reproduces sequences it has seen in training.
**Hallucination:** The LSTM generates plausible but novel sequences when given unseen or incomplete inputs.

In [7]:
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Step 1: Generate Synthetic Data (Memorization and Hallucination)
# Generate sequences of symbolic data
symbols = ["A", "B", "C", "D", "E"]
patterns = [
    "A B C D",  # Common sequence 1
    "B C D E",  # Common sequence 2
    "A E D C",  # Common sequence 3
    "C B A E",  # Rare sequence
]

# Augment data with variations and noise
def generate_synthetic_data(patterns, num_samples=1000):
    data = []
    for _ in range(num_samples):
        pattern = random.choice(patterns)
        # Introduce random hallucination or noise
        if random.random() > 0.8:  # 20% chance of hallucination
            hallucination = random.choices(symbols, k=random.randint(3, 5))
            data.append(" ".join(hallucination))
        else:
            data.append(pattern)
    return data

synthetic_data = generate_synthetic_data(patterns)

# Step 2: Preprocess Data
# Tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(synthetic_data)
vocab_size = len(tokenizer.word_index) + 1

# Convert to sequences
sequences = tokenizer.texts_to_sequences(synthetic_data)

# Pad sequences
max_sequence_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding="post")

# Create predictors (X) and labels (y)
X, y = sequences[:, :-1], sequences[:, -1]
y = np.eye(vocab_size)[y]

# Step 3: Build the LSTM Model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_len - 1),
    LSTM(150, return_sequences=False),
    Dense(100, activation="relu"),
    Dense(vocab_size, activation="softmax")
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Step 4: Train the Model
model.fit(X, y, epochs=50, batch_size=32, verbose=1)

# Step 5: Evaluate Memorization and Hallucination
# Generate new sequences based on a seed text
def generate_sequence(seed_text, next_words=5):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding="post")
        predicted = np.argmax(model.predict(token_list), axis=-1)[0]
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Example Usage
seed_text = "A B C"
generated_sequence = generate_sequence(seed_text)
print("Seed Text:", seed_text)
print("Generated Sequence:", generated_sequence)

# Evaluate memorization by providing common sequences
test_sequences = ["A B C D", "B C D E", "A E D C"]
for seq in test_sequences:
    print(f"Input: {seq} \nGenerated: {generate_sequence(seq)}\n")

# Evaluate hallucination by providing incomplete or unseen sequences
hallucination_tests = ["E A", "D C", "B E"]
for seq in hallucination_tests:
    print(f"Input: {seq} \nHallucinated: {generate_sequence(seq)}\n")


Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.8098 - loss: 1.4074
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9201 - loss: 0.4345
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9320 - loss: 0.3495
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9353 - loss: 0.3331
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9299 - loss: 0.3570
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9365 - loss: 0.3078
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9241 - loss: 0.3491
Epoch 8/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9250 - loss: 0.3442
Epoch 9/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━

1. Generate Synthetic Data:

    The code starts by defining a set of symbolic patterns and generates synthetic data by choosing patterns at random.
    Occasionally (20% of the time), a "hallucination" (random noise) is introduced, which deviates from the normal patterns. This simulates the model's tendency to hallucinate when it sees unfamiliar sequences.

2. Preprocess Data:

    Tokenization: The Tokenizer class is used to assign a unique integer to each word in the dataset.
    Padding: Sequences are padded so that all input sequences have the same length, ensuring consistency in LSTM input.
    Prepare Inputs and Labels: The sequences are split into predictors (X) and labels (y). The labels are one-hot encoded to fit the categorical nature of the output (a classification problem where each word is treated as a separate class).

3. Build the LSTM Model:

    The model is built with an Embedding layer to map words into a dense vector space, an LSTM layer to capture sequential dependencies, and Dense layers to predict the next word based on the learned patterns.
    The final layer uses a softmax activation function, which outputs a probability distribution over all possible words, and the model is trained using categorical cross-entropy as the loss function.

4. Train the Model:

    The model is trained on the synthetic data for 50 epochs, using a batch size of 32.

5. Evaluate Memorization and Hallucination:

    Memorization: By providing common patterns, the model is expected to accurately predict the continuation of these sequences.
    Hallucination: For unseen or incomplete sequences, the model might generate unexpected outputs or hallucinate words that deviate from the known patterns.

Example Test:

    The code then demonstrates the model's ability to generate sequences based on a seed text, and it shows how the model performs with both familiar patterns and rare or hallucinated sequences.

Improvements/Considerations:

    Data Augmentation: You could add more variations or increase the data size to improve generalization.
    Model Evaluation: To properly evaluate the model's performance on memorization and hallucination, a more structured evaluation (e.g., using perplexity, accuracy on unseen sequences) can be added.
    Hallucination Handling: You could adjust the model to handle hallucinations more effectively by introducing regularization or refining the training data.

In [9]:
import numpy as np
import random
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, Input, Attention, LayerNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam

# Step 1: Generate Complex Synthetic Data (Memorization and Hallucination)
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "Hello, how are you today?",
    "I am learning natural language processing",
    "This is a test sentence for sequence generation",
    "AI models are improving everyday with new architectures"
]
# Augment data with variations and noise (hallucinations)
def generate_synthetic_data(sentences, num_samples=1000):
    data = []
    for _ in range(num_samples):
        sentence = random.choice(sentences)
        # Introduce random hallucination or noise
        if random.random() > 0.8:  # 20% chance of hallucination
            hallucination = random.choices(["hello", "quick", "AI", "learning", "models"], k=random.randint(1, 4))
            data.append(" ".join(hallucination))
        else:
            data.append(sentence)
    return data

synthetic_data = generate_synthetic_data(sentences)

# Step 2: Preprocess Data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(synthetic_data)
vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(synthetic_data)

max_sequence_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding="post")

X, y = sequences[:, :-1], sequences[:, -1]
y = np.eye(vocab_size)[y]

# Step 3: Build a More Complex Model with BiLSTM and Attention
input_seq = Input(shape=(max_sequence_len - 1,))
embedding = Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_len - 1)(input_seq)
bi_lstm = Bidirectional(LSTM(150, return_sequences=True))(embedding)

# Apply Attention mechanism
attention = Attention()([bi_lstm, bi_lstm])
attention_output = LayerNormalization()(attention)
lstm_out = LSTM(150)(attention_output)

# Dense layers and output
dense_1 = Dense(100, activation="relu")(lstm_out)
output = Dense(vocab_size, activation="softmax")(dense_1)

# Create and compile model
model = Model(inputs=input_seq, outputs=output)
model.compile(optimizer=Adam(), loss="categorical_crossentropy", metrics=["accuracy"])

# Step 4: Train the Model
model.fit(X, y, epochs=50, batch_size=32, verbose=1)

# Step 5: Generate Sequences
def generate_sequence(seed_text, next_words=5):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding="post")
        predicted = np.argmax(model.predict(token_list), axis=-1)[0]
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Example Usage
seed_text = "The quick brown"
generated_sequence = generate_sequence(seed_text)
print("Seed Text:", seed_text)
print("Generated Sequence:", generated_sequence)

# Evaluate memorization by providing common sequences
test_sequences = ["The quick brown fox", "I am learning natural", "AI models are"]
for seq in test_sequences:
    print(f"Input: {seq} \nGenerated: {generate_sequence(seq)}\n")

# Evaluate hallucination by providing incomplete or unseen sequences
hallucination_tests = ["Hello world", "Natural language", "AI systems"]
for seq in hallucination_tests:
    print(f"Input: {seq} \nHallucinated: {generate_sequence(seq)}\n")


Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 72ms/step - accuracy: 0.8453 - loss: 1.2951
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 74ms/step - accuracy: 1.0000 - loss: 7.1084e-05
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 72ms/step - accuracy: 1.0000 - loss: 2.6733e-05
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 130ms/step - accuracy: 1.0000 - loss: 2.2997e-05
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 67ms/step - accuracy: 1.0000 - loss: 2.0797e-05
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 70ms/step - accuracy: 1.0000 - loss: 2.0255e-05
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 68ms/step - accuracy: 1.0000 - loss: 1.7926e-05
Epoch 8/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 74ms/step - accuracy: 1.0000 - loss: 1.6433e-05
Epoch 9/50
[1m32/

In [10]:
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Step 1: Generate Synthetic Data (Memorization and Hallucination)
# Generate sequences of sentences
patterns = [
    "The quick brown fox jumped over the lazy dog",
    "I am learning natural language processing",
    "Artificial intelligence is transforming industries",
    "Deep learning models are very powerful",
    "Natural language understanding is a key aspect of AI",
    "The quick brown fox is a classic typing test sentence",
    "Machine learning enables data-driven decisions",
]

# Augment data with variations and noise
def generate_synthetic_data(patterns, num_samples=1000):
    data = []
    for _ in range(num_samples):
        pattern = random.choice(patterns)
        # Introduce random hallucination or noise (random sentence fragments)
        if random.random() > 0.8:  # 20% chance of hallucination
            hallucination = random.choices(patterns, k=random.randint(1, 3))
            data.append(" ".join(hallucination))
        else:
            data.append(pattern)
    return data

synthetic_data = generate_synthetic_data(patterns)

# Step 2: Preprocess Data
# Tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(synthetic_data)
vocab_size = len(tokenizer.word_index) + 1

# Convert to sequences
sequences = tokenizer.texts_to_sequences(synthetic_data)

# Pad sequences
max_sequence_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding="post")

# Create predictors (X) and labels (y)
X, y = sequences[:, :-1], sequences[:, -1]
y = np.eye(vocab_size)[y]

# Step 3: Build the LSTM Model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_len - 1),
    LSTM(150, return_sequences=False),
    Dense(100, activation="relu"),
    Dense(vocab_size, activation="softmax")
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Step 4: Train the Model
model.fit(X, y, epochs=50, batch_size=32, verbose=1)

# Step 5: Generate Sequences with Sampling (Temperature)
def generate_sequence_with_sampling(seed_text, next_words=5, temperature=1.0):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding="post")

        # Predict probabilities
        predictions = model.predict(token_list)[0]

        # Adjust probabilities based on temperature
        predictions = np.asarray(predictions).astype("float64")
        predictions = np.log(predictions + 1e-7) / temperature
        predictions = np.exp(predictions) / np.sum(np.exp(predictions))  # Softmax

        # Sample from the predicted distribution
        predicted = np.random.choice(range(vocab_size), p=predictions)

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        seed_text += " " + output_word

    return seed_text

# Example Usage with Sampling:
seed_text = "The quick brown fox"
generated_sequence = generate_sequence_with_sampling(seed_text, temperature=0.8)
print("Generated Sequence:", generated_sequence)

# Evaluate memorization by providing common sequences
test_sequences = [
    "The quick brown fox jumped over the lazy dog",
    "I am learning natural language processing",
    "Artificial intelligence is transforming industries"
]
for seq in test_sequences:
    print(f"Input: {seq} \nGenerated: {generate_sequence_with_sampling(seq)}\n")

# Evaluate hallucination by providing incomplete or unseen sequences
hallucination_tests = ["The quick brown", "I am learning", "Deep learning"]
for seq in hallucination_tests:
    print(f"Input: {seq} \nHallucinated: {generate_sequence_with_sampling(seq)}\n")


Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 55ms/step - accuracy: 0.8756 - loss: 1.8458
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 84ms/step - accuracy: 0.9997 - loss: 0.0056
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.9994 - loss: 0.0085
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - accuracy: 0.9993 - loss: 0.0080
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.9997 - loss: 0.0025
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - accuracy: 0.9987 - loss: 0.0099
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 89ms/step - accuracy: 0.9993 - loss: 0.0055
Epoch 8/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 87ms/step - accuracy: 0.9997 - loss: 0.0030
Epoch 9/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━

In [11]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

# Step 1: Generate Synthetic Data (Memorization and Hallucination)
# Generate some sentences
sentences = [
    "The quick brown fox jumped over the lazy dog",
    "I am learning natural language processing",
    "Artificial intelligence is transforming industries",
    "Deep learning is revolutionizing AI applications",
    "Natural language processing is a field of AI",
    "Machine learning models can analyze large datasets",
    "AI systems are shaping the future of technology",
    "The cat sat on the mat",
    "The dog barked at the stranger",
    "We are building smarter AI systems"
]

# Augment data with variations and noise
def generate_synthetic_data(sentences, num_samples=1000):
    data = []
    for _ in range(num_samples):
        sentence = random.choice(sentences)
        # Introduce random hallucination or noise
        if random.random() > 0.8:  # 20% chance of hallucination
            hallucination = random.choices(sentence.split(), k=random.randint(3, 5))
            data.append(" ".join(hallucination))
        else:
            data.append(sentence)
    return data

synthetic_data = generate_synthetic_data(sentences)

# Step 2: Preprocess Data
# Tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(synthetic_data)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for padding token

# Convert to sequences
sequences = tokenizer.texts_to_sequences(synthetic_data)

# Pad sequences to ensure uniform length
max_sequence_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding="post")

# Create predictors (X) and labels (y)
X, y = sequences[:, :-1], sequences[:, -1]
y = np.eye(vocab_size)[y]

# Step 3: Build the LSTM Model with more complexity
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_len - 1),
    LSTM(300, return_sequences=False),  # Increased LSTM units
    Dense(150, activation="relu"),  # Increased units in Dense layer
    Dense(vocab_size, activation="softmax")  # Output layer
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Step 4: Train the Model
model.fit(X, y, epochs=100, batch_size=32, verbose=1)  # Increased epochs for better learning

# Step 5: Generate Sequences with Sampling (Increase temperature for randomness)
def generate_sequence_with_sampling(seed_text, next_words=20, temperature=1.2):  # Increased temperature
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding="post")

        # Predict probabilities
        predictions = model.predict(token_list)[0]

        # Adjust probabilities based on temperature
        predictions = np.asarray(predictions).astype("float64")
        predictions = np.log(predictions + 1e-7) / temperature
        predictions = np.exp(predictions) / np.sum(np.exp(predictions))  # Softmax

        # Sample from the predicted distribution
        predicted = np.random.choice(range(vocab_size), p=predictions)

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        seed_text += " " + output_word

    return seed_text

# Example Usage with Sampling (increased next_words for longer output):
seed_text = "The quick brown fox"
generated_sequence = generate_sequence_with_sampling(seed_text, next_words=20, temperature=1.2)
print("Generated Sequence:", generated_sequence)

# Evaluate memorization by providing common sequences
test_sequences = [
    "The quick brown fox jumped over the lazy dog",
    "I am learning natural language processing",
    "Artificial intelligence is transforming industries"
]

for seq in test_sequences:
    print(f"Input: {seq} \nGenerated: {generate_sequence_with_sampling(seq, next_words=20, temperature=1.2)}\n")

# Evaluate hallucination by providing incomplete or unseen sequences
hallucination_tests = [
    "The quick brown",
    "I am learning",
    "Deep learning"
]

for seq in hallucination_tests:
    print(f"Input: {seq} \nHallucinated: {generate_sequence_with_sampling(seq, next_words=20, temperature=1.2)}\n")


Epoch 1/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.7896 - loss: 2.5053
Epoch 2/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 64ms/step - accuracy: 0.9518 - loss: 0.0758
Epoch 3/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 76ms/step - accuracy: 1.0000 - loss: 0.0024
Epoch 4/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 1.0000 - loss: 4.2406e-04
Epoch 5/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 1.0000 - loss: 3.0114e-04
Epoch 6/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 1.0000 - loss: 2.7189e-04
Epoch 7/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 1.0000 - loss: 1.6656e-04
Epoch 8/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - accuracy: 1.0000 - loss: 1.1750e-04
Epoch 9/100
[1m32/3