## Project Description: Next Word Prediction Using LSTM
#### Project Overview:

This project aims to develop a deep learning model for predicting the next word in a given sequence of words. The model is built using Long Short-Term Memory (LSTM) networks, which are well-suited for sequence prediction tasks. The project includes the following steps:

1- Data Collection: We use the text of Shakespeare's "Hamlet" as our dataset. This rich, complex text provides a good challenge for our model.

2- Data Preprocessing: The text data is tokenized, converted into sequences, and padded to ensure uniform input lengths. The sequences are then split into training and testing sets.

3- Model Building: An LSTM model is constructed with an embedding layer, two LSTM layers, and a dense output layer with a softmax activation function to predict the probability of the next word.

4- Model Training: The model is trained using the prepared sequences, with early stopping implemented to prevent overfitting. Early stopping monitors the validation loss and stops training when the loss stops improving.

5- Model Evaluation: The model is evaluated using a set of example sentences to test its ability to predict the next word accurately.

6- Deployment: A Streamlit web application is developed to allow users to input a sequence of words and get the predicted next word in real-time.

In [1]:
!pip install tensorflow



In [None]:
import os
import tokenize


def tokenize_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            
            content = file.read().replace('\t', '    ')  
            tokens = tokenize.generate_tokens(iter(content.splitlines()).__next__)
            token_list = []
            for token in tokens:
                if token.type not in (tokenize.COMMENT, tokenize.NL):
                    if token.string == '':
                        continue
                    token_list.append(token.string)

            return token_list
    except tokenize.TokenError as e:

        return []
    except Exception as e:

        return []

def tokenize_directory(directory_path):
    tokenized_files = {}
    
    for filename in os.listdir(directory_path):
        if filename.endswith('.py'):
            file_path = os.path.join(directory_path, filename)
            tokens = tokenize_file(file_path)
            tokenized_files[filename] = tokens
    
    return tokenized_files



directory_path = "reduced_set"

tokenized_data = tokenize_directory(directory_path)

print(len(tokenized_data))
print(type(tokenized_data))

for idx, (keys, values) in enumerate(tokenized_data.items()):
    if idx >= 5:  
        break
    print(keys, values)

1235
<class 'dict'>
test_label.py ['from', '__future__', 'import', 'division', ',', 'print_function', ',', 'unicode_literals', 'import', 'sys', 'import', 'os', 'sys', '.', 'path', '.', 'insert', '(', '0', ',', 'os', '.', 'path', '.', 'join', '(', 'os', '.', 'path', '.', 'dirname', '(', '__file__', ')', ',', "'..'", ')', ')', 'testinfo', '=', '"s, t 5, s, t 10.1, s, q"', 'tags', '=', '"Label, text, ScaleTo"', 'import', 'cocos', 'from', 'cocos', '.', 'director', 'import', 'director', 'from', 'cocos', '.', 'sprite', 'import', 'Sprite', 'from', 'cocos', '.', 'actions', 'import', '*', 'from', 'cocos', '.', 'text', 'import', '*', 'import', 'pyglet', 'class', 'TestLayer', '(', 'cocos', '.', 'layer', '.', 'Layer', ')', ':', '    ', 'def', '__init__', '(', 'self', ')', ':', '        ', 'super', '(', 'TestLayer', ',', 'self', ')', '.', '__init__', '(', ')', 'x', ',', 'y', '=', 'director', '.', 'get_window_size', '(', ')', 'self', '.', 'text', '=', 'Label', '(', '"hello"', ',', '(', 'x', '//', '2

In [None]:
def build_vocabulary(tokenized_data):
    """
    Build a vocabulary mapping tokens to unique IDs.
    """
    all_tokens = [token for tokens in tokenized_data.values() for token in tokens]
    vocab = {token: idx for idx, token in enumerate(set(all_tokens), start=1)}  
    return vocab


def convert_tokens_to_ids(tokenized_data, vocab):
    """
    Convert tokenized data into token IDs using the vocabulary.
    """
    token_ids_data = {
        filename: [vocab[token] for token in tokens if token in vocab]
        for filename, tokens in tokenized_data.items()
    }
    return token_ids_data

vocab = build_vocabulary(tokenized_data)
for idx, (keys, values) in enumerate(vocab.items()):
    if idx >= 5:  
        break
    print(keys, values)

token_ids_data = convert_tokens_to_ids(tokenized_data, vocab)
for idx, (keys, values) in enumerate(token_ids_data.items()):
    if idx >= 5:  
        break
    print(keys, values)


to_sql 1
'first' 2
'project' 3
value_to_name 4
'aaaa-bb-cc' 5
test_label.py [34558, 34404, 18017, 13959, 33166, 25037, 33166, 7651, 18017, 13197, 18017, 7844, 13197, 15467, 67, 15467, 616, 34818, 13840, 33166, 7844, 15467, 67, 15467, 17795, 34818, 7844, 15467, 67, 15467, 24403, 34818, 42033, 18470, 33166, 23120, 18470, 18470, 7689, 19052, 41055, 37346, 19052, 5004, 18017, 12209, 34558, 12209, 15467, 12325, 18017, 12325, 34558, 12209, 15467, 8490, 18017, 20911, 34558, 12209, 15467, 18602, 18017, 32224, 34558, 12209, 15467, 39537, 18017, 32224, 18017, 6769, 903, 41756, 34818, 12209, 15467, 43835, 15467, 42572, 18470, 2006, 21263, 9014, 39944, 34818, 8491, 18470, 2006, 33512, 37221, 34818, 41756, 33166, 8491, 18470, 15467, 39944, 34818, 18470, 7757, 33166, 7600, 19052, 12325, 15467, 25594, 34818, 18470, 8491, 15467, 39537, 19052, 15124, 34818, 3832, 33166, 34818, 7757, 28796, 36508, 33166, 7600, 28796, 36508, 18470, 18470, 8491, 15467, 39537, 15467, 37464, 34818, 20537, 34818, 28271, 3316

In [4]:
import pickle
with open("vocab_gru.pkl", "wb") as file:
    pickle.dump(vocab, file)

In [None]:
def prepare_sequences(token_ids_data, sequence_length=4):
    """
    Prepare input-output pairs for training a model.
    Each input is a sequence of token IDs, and the output is the next token ID.
    """
    x = []
    y = []
    for token_ids in token_ids_data.values():
        for i in range(len(token_ids) - sequence_length):
            input_seq = token_ids[i:i + sequence_length]
            output_token = token_ids[i + sequence_length]
            x.append(input_seq)
            y.append(output_token)
    return x, y

sequence_length = 4
x,y = prepare_sequences(token_ids_data, sequence_length)

print("Training Data:")
for i in range(5):  
    print(f"Input: {x[i]}, Output: {y[i]}")

Training Data:
Input: [34558, 34404, 18017, 13959], Output: 33166
Input: [34404, 18017, 13959, 33166], Output: 25037
Input: [18017, 13959, 33166, 25037], Output: 33166
Input: [13959, 33166, 25037, 33166], Output: 7651
Input: [33166, 25037, 33166, 7651], Output: 18017


In [None]:
print(len(x), len(y))

814010 814010


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

x_train = np.array(x_train)
x_test = np.array(x_test)

y_train = np.array(y_train)
y_test = np.array(y_test)

In [8]:
print(len(x_train), len(y_train), len(x_test), len(y_test))

651208 651208 162802 162802


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU

total_words = len(vocab) + 1
max_sequence_len = 4

model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model.add(GRU(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(100))
model.add(Dense(total_words,activation="softmax"))

model.compile(loss="sparse_categorical_crossentropy",optimizer='adam',metrics=['accuracy'])

2024-11-30 03:24:00.299688: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732917240.316134   85134 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732917240.321209   85134 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-30 03:24:00.339800: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1732917241.837545   85134 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 27

In [None]:
history=model.fit(x_train,y_train,epochs=10,validation_data=(x_test,y_test),verbose=1)
model.summary()

Epoch 1/10


I0000 00:00:1732917244.745343   85264 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m20351/20351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 16ms/step - accuracy: 0.2639 - loss: 5.0615 - val_accuracy: 0.4603 - val_loss: 3.7063
Epoch 2/10
[1m20351/20351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 16ms/step - accuracy: 0.4811 - loss: 3.4381 - val_accuracy: 0.5010 - val_loss: 3.4993
Epoch 3/10
[1m20351/20351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 16ms/step - accuracy: 0.5299 - loss: 3.1169 - val_accuracy: 0.5186 - val_loss: 3.4042
Epoch 4/10
[1m20351/20351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 16ms/step - accuracy: 0.5559 - loss: 2.9213 - val_accuracy: 0.5315 - val_loss: 3.3345
Epoch 5/10
[1m20351/20351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 16ms/step - accuracy: 0.5722 - loss: 2.7977 - val_accuracy: 0.5386 - val_loss: 3.2919
Epoch 6/10
[1m20351/20351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 16ms/step - accuracy: 0.5852 - loss: 2.6891 - val_accuracy: 0.5435 - val_loss: 3.26

In [None]:
def predict_next_word_custom(model, content):
    tokens = tokenize.generate_tokens(iter(content.splitlines()).__next__)
    token_list = []
    for token in tokens:
        if token.type not in (tokenize.COMMENT, tokenize.NL):
            if token.string == '':
                continue
            token_list.append(token.string)
    print(token_list)
    count=0
    print(len(vocab))
    for i in range(len(token_list)):
        if vocab.get(token_list[i])==None:
            token_list[i] = 1
        else:
            token_list[i] = vocab[token_list[i]]
    token_list = np.array(token_list).reshape(1, -1)
    print(token_list)
    prediction = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(prediction)
    print(predicted_word_index)
    flag = False
    for key, value in vocab.items():
        if value == predicted_word_index:
            flag = True
            return key
    if(flag == False):
        return "No word found"

In [12]:
input_text="for i in"
print(f"Input text:{input_text}")
next_word=predict_next_word_custom(model,input_text)
print(f"Next Word Prediction:{next_word}")

Input text:for i in
['for', 'i', 'in']
44156
[[25812 31504 39957]]
34818
Next Word Prediction:(


In [None]:
model.save("GRU_big_25.h5")

