<a href="https://colab.research.google.com/github/AI-Enthusiast-ATK/My-AI-Playground/blob/main/next_word_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from string import punctuation

import numpy as np
import pandas as pd
import spacy
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load data
def read_lines(filepath) -> pd.DataFrame:
    """Load a UTF-8 text file into a one-column DataFrame, one row per line.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A DataFrame with a single ``'text'`` column holding the file's lines
        in order, with line terminators removed.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    # splitlines() strips the terminators and yields no trailing empty entry
    # when the file ends with a newline.
    lines = contents.splitlines()
    return pd.DataFrame({'text': lines})

# Read the three ``wiki.*.tokens`` splits (test, train, valid — in that order)
# into one-column DataFrames.
test_data, train_data, valid_data = (
    read_lines('/content/wiki.test.tokens'),
    read_lines('/content/wiki.train.tokens'),
    read_lines('/content/wiki.valid.tokens'),
)

# Clean text
nlp = spacy.load('en_core_web_sm')

# Per-character lookup table for punctuation. Testing a token against the raw
# ``punctuation`` string is a substring check, which drops runs such as "()"
# but keeps "..." or "--"; checking each character is consistent.
_PUNCTUATION_CHARS = frozenset(punctuation)


def clean_text(text):
    """Lowercase ``text``, tokenize it with spaCy and drop punctuation tokens.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining token texts joined by single spaces. A token is removed
        when every one of its characters is in ``string.punctuation``.
    """
    # Only token texts are needed, so run the tokenizer alone instead of the
    # full pipeline (tagger, parser, NER, ...), which is far slower per line.
    doc = nlp.make_doc(text.lower())
    kept = [
        token.text
        for token in doc
        if not all(char in _PUNCTUATION_CHARS for char in token.text)
    ]
    return ' '.join(kept)

# Add a "cleaned" column to each split, processed in the order
# train, test, valid.
for split_frame in (train_data, test_data, valid_data):
    split_frame["cleaned"] = split_frame['text'].apply(clean_text)

# Prepare corpus
corpus = train_data['cleaned'].tolist()

# Tokenizer with limited vocab size
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
# Ids seen by the model are 0 (padding) plus the ids the tokenizer emits.
# With num_words set, only ids strictly below num_words are emitted (rarer
# words map to the OOV id), so the id space never exceeds max_words entries.
# A smaller vocabulary needs len(word_index) + 1 slots to include padding.
vocab_size = min(max_words, len(word_index) + 1)

# Create sequences
max_lines = 10000  # Limit lines to avoid memory issues
sequences = []
# Tokenize all selected lines in one call rather than one call per line.
for token_list in tokenizer.texts_to_sequences(corpus[:max_lines]):
    # Every prefix of at least two tokens is one training example: the last
    # token is the target and the tokens before it are the context.
    sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

if not sequences:
    raise ValueError(
        "No training sequences were produced: every processed line has "
        "fewer than two tokens."
    )

# Pad sequences
# NOTE: the padded matrix has one row per prefix and as many columns as the
# longest line, so its size grows quickly with long lines.
max_len = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

# Split into X and y: the final column is the word to predict.
X = sequences[:, :-1]
y = sequences[:, -1]

# No need to one-hot encode y
# Use sparse_categorical_crossentropy instead
model = Sequential([
    # Explicit input shape; replaces the deprecated ``input_length`` argument
    # of Embedding while still fixing the context length up front.
    Input(shape=(max_len - 1,)),
    Embedding(vocab_size, 50),
    LSTM(64),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train
model.fit(X, y, epochs=5, batch_size=64)

# Summary
model.summary()
