In [None]:
import numpy as np
import tensorflow as tf
import warnings
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module='tensorflow')

# Read the text file
with open('/kaggle/input/sportss/sports.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Word-level Tokenization and Model
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts([text])

# Convert text to sequences of words
word_encoded = word_tokenizer.texts_to_sequences([text])[0]

# Create input sequences using list of n-gram sequences
input_seq_word = []
for i in range(1, len(word_encoded)):
    n_gram_sequence = word_encoded[:i+1]
    input_seq_word.append(n_gram_sequence)

# Pad sequences
max_sequence_len_word = max([len(seq) for seq in input_seq_word])
input_sequences_word = pad_sequences(input_seq_word, maxlen=max_sequence_len_word, padding='pre')

# Split input sequences into x and y
x_word = input_sequences_word[:, :-1]
y_word = input_sequences_word[:, -1]

# One-hot encode the labels
total_words = len(word_tokenizer.word_index) + 1
y_word = tf.keras.utils.to_categorical(y_word, num_classes=total_words)

# Define the word-level model
word_model = Sequential()
word_model.add(Embedding(total_words, 100, input_length=max_sequence_len_word-1))
word_model.add(SimpleRNN(150))
word_model.add(Dense(total_words, activation='softmax'))

# Build the model
word_model.build(input_shape=(None, max_sequence_len_word-1))
print(word_model.summary())

# Compile the model
word_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the word-level model
word_model.fit(x_word, y_word, epochs=100, verbose=1)

In [None]:
# Character-level Tokenization and Model
char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts([text])

# Convert text to sequences of characters
char_encoded = char_tokenizer.texts_to_sequences([text])[0]

# Create input sequences using list of n-gram sequences
input_seq_char = []
for i in range(1, len(char_encoded)):
    n_gram_sequence = char_encoded[:i+1]
    input_seq_char.append(n_gram_sequence)

# Pad sequences
max_sequence_len_char = max([len(seq) for seq in input_seq_char])
input_sequences_char = pad_sequences(input_seq_char, maxlen=max_sequence_len_char, padding='pre')

# Split input sequences into x and y
x_char = input_sequences_char[:, :-1]
y_char = input_sequences_char[:, -1]

# One-hot encode the labels
total_chars = len(char_tokenizer.word_index) + 1
y_char = tf.keras.utils.to_categorical(y_char, num_classes=total_chars)

# Define the character-level model
char_model = Sequential()
char_model.add(Embedding(total_chars, 50, input_length=max_sequence_len_char-1))
char_model.add(SimpleRNN(100))
char_model.add(Dense(total_chars, activation='softmax'))

# Build the model
char_model.build(input_shape=(None, max_sequence_len_char-1))
print(char_model.summary())

# Compile the model
char_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the character-level model
char_model.fit(x_char, y_char, epochs=10, verbose=1)

In [13]:
# Function to predict the next characters
def predict_next_chars(input_text, num_chars):
    for _ in range(num_chars):
        token_list = char_tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len_char-1, padding='pre')
        predicted = np.argmax(char_model.predict(token_list), axis=-1)
        output_char = ""
        for char, index in char_tokenizer.word_index.items():
            if index == predicted:
                output_char = char
                break
        input_text += output_char
    return input_text



# Function to predict the next words
def predict_next_words(input_text, num_words):
    for _ in range(num_words):
        token_list = word_tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len_word-1, padding='pre')
        predicted = np.argmax(word_model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in word_tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        input_text += " " + output_word
    return input_text

In [14]:
# Test the character-level model
input_text_char = "Sp"
num_predict_chars = int(input("Enter the number of characters to predict: "))
output_text_char = predict_next_chars(input_text_char, num_predict_chars)
print("Predicted next characters:", output_text_char)

Enter the number of characters to predict:  4


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step
Predicted next characters: Sports


In [15]:
# Test the word-level model
input_text_word = "activities with different"
num_predict_words = int(input("Enter the number of words to predict: "))
output_text_word = predict_next_words(input_text_word, num_predict_words)
print("Predicted next words:", output_text_word)

Enter the number of words to predict:  15


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38