In [2]:
!pip install gradio tensorflow numpy pandas



In [None]:
import os
import re
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gradio as gr
import time

# Function to clean the text
def clean_text(text):
    text = re.sub(r'\([^)]*\)', '', text)  # Remove anything between ( and )
    text = re.sub(r'\[[^]]*\]', '', text)  # Remove anything between [ and ]
    text = re.sub(r'[-"“”‘’\'„“]', '', text)
    text = re.sub(r'\d+', '', text)  # Remove all numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Reduce multiple spaces
    return text

# File paths
input_path_primary = 'Corpus.txt'
input_path_secondary = '/kaggle/input/corpus/Corpus.txt'
output_path = 'Corpus-cleaned.txt'
model_path = 'word_prediction_model.keras'  # For Keras
tokenizer_path = 'tokenizer.pickle'

# Clean and save the text if the cleaned file doesn't exist
if os.path.exists(input_path_primary):
    input_path = input_path_primary
elif os.path.exists(input_path_secondary):
    input_path = input_path_secondary
else:
    raise FileNotFoundError("None of the specified input paths exist.")

if not os.path.exists(output_path):
    with open(input_path, 'r', encoding='utf-8') as file:
        text = file.read()

    cleaned_text = clean_text(text)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)

# Train the model or load if already trained
if os.path.exists(model_path) and os.path.exists(tokenizer_path):
    model = tf.keras.models.load_model(model_path)
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
else:
    with open(output_path, 'r', encoding='utf-8') as file:
        cleaned_text = file.read()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([cleaned_text])
    total_words = len(tokenizer.word_index) + 1
    input_sequences = []

    for line in cleaned_text.split('.'):
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    X, y = input_sequences[:, :-1], input_sequences[:, -1]
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(total_words, 100, input_length=max_sequence_len-1),
        tf.keras.layers.LSTM(100, return_sequences=True),
        tf.keras.layers.LSTM(100),
        tf.keras.layers.Dense(total_words, activation='softmax')
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.fit(X, y, epochs=120, batch_size=32)

    model.save(model_path)

    with open(tokenizer_path, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

max_sequence_len = model.input_shape[1] + 1
stop_signal = False

def predict_next_words(prompt, top_k=5):
    tokens = tokenizer.texts_to_sequences([prompt])
    padded_seq = pad_sequences(tokens, maxlen=max_sequence_len-1, padding='pre')
    predictions = model.predict(padded_seq)
    top_indices = np.argsort(predictions[0])[-top_k:][::-1]
    top_words = [(tokenizer.index_word.get(i, ''), predictions[0][i]) for i in top_indices]
    return top_words

def format_predictions(predictions):
    formatted_predictions = []
    for word, prob in predictions:
        formatted_prob = f"{prob*100:.2f}".replace('.', ',')
        formatted_predictions.append([f"{word} ({formatted_prob} %)"])
    return formatted_predictions

def calculate_perplexity(predictions):
    perplexity = np.exp(-np.sum(np.log(predictions)) / len(predictions))
    return f"{perplexity:.2f}".replace('.', ',')

def generate_text(prompt, num_words=10):
    result = prompt
    used_words = set()
    for _ in range(num_words):
        next_word = predict_next_words(result, top_k=1)[0][0]
        if next_word not in used_words:
            result += ' ' + next_word
            used_words.add(next_word)
        else:
            break
    return result

def append_word(prompt):
    next_word = predict_next_words(prompt, top_k=1)[0][0]
    return prompt + ' ' + next_word

def auto_generate_text(prompt):
    global stop_signal
    stop_signal = False
    generated_text = prompt
    num_words_generated = 0

    while not stop_signal and num_words_generated < 10:
        next_word = predict_next_words(generated_text, top_k=1)[0][0]
        generated_text += ' ' + next_word
        num_words_generated += 1
        time.sleep(0.2)
        predictions = predict_next_words(generated_text)
        yield generated_text, calculate_perplexity([prob for word, prob in predictions]), format_predictions(predictions)

def stop_auto_generation(prompt, perplexity, probabilities):
    global stop_signal
    stop_signal = True
    return prompt, perplexity, probabilities

def append_clicked_word(evt, prompt):
    clicked_word = evt.value.split()[0]
    return prompt + ' ' + clicked_word

def predict_text(prompt):
    predictions = predict_next_words(prompt)
    top_words = format_predictions(predictions)
    perplexity = calculate_perplexity([prob for word, prob in predictions])
    return prompt, perplexity, top_words

def append_next_word(prompt):
    next_word = predict_next_words(prompt, top_k=1)[0][0]
    new_prompt = prompt + ' ' + next_word
    predictions = predict_next_words(new_prompt)
    top_words = format_predictions(predictions)
    perplexity = calculate_perplexity([prob for word, prob in predictions])
    return new_prompt, perplexity, top_words

def reset_text():
    return "", "", []

with gr.Blocks() as demo:
    gr.Markdown("## Language Model mit LSTM")
    input_text = gr.Textbox(label="Gib beliebige Wörter ein:", interactive=True)

    with gr.Row():

        with gr.Column():

          probabilities = gr.Dataframe(
          headers=["Wähle nächstes Wort aus:"],
          datatype=["str"],
          col_count=1
        )

        with gr.Column():
          predict_button = gr.Button("Vorhersage")
          next_button = gr.Button("Weiter")
          auto_button = gr.Button("Auto")
          stop_button = gr.Button("Stopp")

        with gr.Column():
          perplexity_text = gr.Textbox(label="Perplexity", interactive=False)
          reset_button = gr.Button("Reset")

          predict_button.click(fn=predict_text, inputs=input_text, outputs=[input_text, perplexity_text, probabilities])
          next_button.click(fn=append_next_word, inputs=input_text, outputs=[input_text, perplexity_text, probabilities])
          reset_button.click(fn=reset_text, outputs=[input_text, perplexity_text, probabilities])

    auto_button.click(
      fn=auto_generate_text,
      inputs=input_text,
      outputs=[input_text, perplexity_text, probabilities]
    )

    stop_button.click(
      fn=stop_auto_generation,
      inputs=[input_text, perplexity_text, probabilities],
      outputs=[input_text, perplexity_text, probabilities]
    )

    def select_predicted_word(evt: gr.SelectData, prompt):
        word = evt.value.split(' ')[0]  # Extract word from format "word (probability%)"
        new_prompt = prompt + ' ' + word
        next_words = predict_next_words(new_prompt)
        top_words = format_predictions(next_words)
        perplexity = calculate_perplexity([prob for word, prob in next_words])
        return new_prompt, perplexity, top_words

    probabilities.select(fn=select_predicted_word, inputs=[input_text], outputs=[input_text, perplexity_text, probabilities])

demo.launch()




Epoch 1/120
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 24ms/step - accuracy: 0.0460 - loss: 7.2144
Epoch 2/120
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - accuracy: 0.0672 - loss: 6.5390
Epoch 3/120
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.0952 - loss: 6.1511
Epoch 4/120
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.1059 - loss: 5.8740
Epoch 5/120
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 27ms/step - accuracy: 0.1189 - loss: 5.5495
Epoch 6/120
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.1293 - loss: 5.2682
Epoch 7/120
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.1397 - loss: 4.9636
Epoch 8/120
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.1494 - loss: 4.6706
Epoch 9/120
[1m



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1