In [31]:
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from torch.utils.data import DataLoader,TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch
import gc



gc.collect()

torch.cuda.empty_cache()

torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()

In [32]:

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abdob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abdob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdob\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and special characters
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    return ' '.join(tokens)  # Join words back into a single string

In [34]:
columns = ["Line ID", "Speaker ID", "Movie ID", "Character Name", "Text"]

# Read the file and process lines into a DataFrame
with open("movie_lines.txt", 'r', encoding='latin-1') as file:
    movie_lines_c = [
        line.split(' +++$+++ ') for line in file.read().splitlines() if line.strip()
    ]
# filtering out empty lines (if line.strip()), which reduces the chance of unexpected errors from blank lines.

# Create the DataFrame and drop unnecessary columns
movie_lines = pd.DataFrame(movie_lines_c, columns=columns)
movie_lines['processed_text']=movie_lines['Text'].apply(lambda x: preprocess_text(str(x)))
# Display a sample for verification
print(movie_lines.head())
print(movie_lines.isna().sum())

  Line ID Speaker ID Movie ID Character Name          Text processed_text
0   L1045         u0       m0         BIANCA  They do not!    they do not
1   L1044         u2       m0        CAMERON   They do to!     they do to
2    L985         u0       m0         BIANCA    I hope so.      i hope so
3    L984         u2       m0        CAMERON     She okay?       she okay
4    L925         u0       m0         BIANCA     Let's go.        lets go
Line ID            0
Speaker ID        69
Movie ID          69
Character Name    69
Text              69
processed_text     0
dtype: int64


In [35]:
movie_lines.dropna(inplace=True)
print(movie_lines.isna().sum())

Line ID           0
Speaker ID        0
Movie ID          0
Character Name    0
Text              0
processed_text    0
dtype: int64


In [36]:
movie_lines['processed_text']

0                                               they do not
1                                                they do to
2                                                 i hope so
3                                                  she okay
4                                                   lets go
                                ...                        
304777    lord chelmsford seems to want me to stay back ...
304778    im to take the sikali with the main column to ...
304779                               your orders mr vereker
304780    good ones yes mr vereker gentlemen who can rid...
304781    colonel durnford william vereker i hear you ve...
Name: processed_text, Length: 304713, dtype: object

In [37]:


# Check for CUDA availability and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
movie_lines=movie_lines.sample(frac =.10)
# Flatten the processed dialogues into a list of sentences
sentences = movie_lines['processed_text'].tolist()

# Tokenize and create a vocabulary (token-to-index mapping)
tokenizer = nltk.tokenize.word_tokenize
all_words = []
for sentence in sentences:
    all_words.extend(tokenizer(sentence.lower()))

# Create a vocabulary
vocab = sorted(set(all_words))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Convert sentences to sequences of indices
sequences = []
for sentence in sentences:
    tokenized = tokenizer(sentence.lower())
    sequence = [word_to_idx[word] for word in tokenized if word in word_to_idx]
    sequences.append(sequence)

# Create input-output pairs for training (X: input sequence, Y: next word)
sequence_length = 4  # We will use the previous 3 words to predict the next word
X_data = []
y_data = []
for seq in sequences:
    for i in range(len(seq) - sequence_length):
        X_data.append(seq[i:i + sequence_length])
        y_data.append(seq[i + sequence_length])

X_data = np.array(X_data)
y_data = np.array(y_data)


In [38]:

# Convert to PyTorch tensors and move them to CUDA
X_data = torch.tensor(X_data, dtype=torch.long).to(device)
y_data = torch.tensor(y_data, dtype=torch.long).to(device)


# **RNN**

In [39]:

class RNN_Language_Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(RNN_Language_Model, self).__init__()

        # Embedding layer to map token indices to GloVe embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # RNN layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)

        # Fully connected layer to predict the next word
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        # Get embeddings for the input tokens
        x = self.embedding(x)

        # Pass through the RNN
        rnn_out, _ = self.rnn(x)

        # Use the output of the last time step (we're doing a sequence-to-one task)
        out = self.fc(rnn_out[:, -1, :])

        return out

# Model parameters
embedding_dim = 100  # Using GloVe 100-dimensional vectors
hidden_dim = 128  # Hidden layer dimension
vocab_size = len(vocab)  # Size of the vocabulary
output_size = vocab_size  # Output size is the size of the vocabulary (for next word prediction)

# Instantiate the model and move it to CUDA
Rnn_model = RNN_Language_Model(vocab_size, embedding_dim, hidden_dim, output_size).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(Rnn_model.parameters(), lr=0.001)

# Create a DataLoader for batching
train_dataset = TensorDataset(X_data, y_data)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)


# Training RNN (Skip to load models if you want to test)

In [40]:

# Train the model
# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    Rnn_model.train()
    total_loss = 0
    for batch in train_loader:
        X_batch, y_batch = batch

        # Move batches to CUDA
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        # Forward pass
        output = Rnn_model(X_batch)

        # Calculate loss
        loss = criterion(output, y_batch)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}")


Epoch [1/100], Loss: 6.823441997351727
Epoch [2/100], Loss: 6.054068527082457
Epoch [3/100], Loss: 5.757985485151156
Epoch [4/100], Loss: 5.52889604522074
Epoch [5/100], Loss: 5.3305794258767385
Epoch [6/100], Loss: 5.151155008886852
Epoch [7/100], Loss: 4.984238798600914
Epoch [8/100], Loss: 4.828195547535472
Epoch [9/100], Loss: 4.678806004443018
Epoch [10/100], Loss: 4.542465238965631
Epoch [11/100], Loss: 4.415323822457715
Epoch [12/100], Loss: 4.295245250646215
Epoch [13/100], Loss: 4.184222373359105
Epoch [14/100], Loss: 4.0775206628507075
Epoch [15/100], Loss: 3.9761316521034336
Epoch [16/100], Loss: 3.8812913755430793
Epoch [17/100], Loss: 3.7901107690630167
Epoch [18/100], Loss: 3.705016172715347
Epoch [19/100], Loss: 3.622527702591425
Epoch [20/100], Loss: 3.5447393902316873
Epoch [21/100], Loss: 3.4702983917691124
Epoch [22/100], Loss: 3.3983372719618528
Epoch [23/100], Loss: 3.3310231129908505
Epoch [24/100], Loss: 3.266016656754951
Epoch [25/100], Loss: 3.204941131192692
E

# **LSTM**

In [41]:
class LSTM_Language_Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(LSTM_Language_Model, self).__init__()

        # Embedding layer to map token indices to embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # Fully connected layer to predict the next word
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        # Get embeddings for the input tokens
        x = self.embedding(x)

        # Pass through the LSTM
        lstm_out, _ = self.lstm(x)

        # Use the output of the last time step
        out = self.fc(lstm_out[:, -1, :])

        return out

# Instantiate the LSTM model
lstm_model = LSTM_Language_Model(vocab_size, embedding_dim, hidden_dim, output_size).to(device)

optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)


# Training LSTM (Skip to load models if you want to test)

In [42]:

# Train the LSTM model
num_epochs = 100
for epoch in range(num_epochs):
    lstm_model.train()
    total_loss = 0
    for batch in train_loader:
        X_batch, y_batch = batch

        # Move batches to device
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        # Forward pass
        output = lstm_model(X_batch)

        # Calculate loss
        loss = criterion(output, y_batch)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}")





Epoch [1/100], Loss: 6.936704494077214
Epoch [2/100], Loss: 6.173497096755499
Epoch [3/100], Loss: 5.8945350472944495
Epoch [4/100], Loss: 5.6951301741774065
Epoch [5/100], Loss: 5.523401037619932
Epoch [6/100], Loss: 5.364690924502927
Epoch [7/100], Loss: 5.213489062594672
Epoch [8/100], Loss: 5.067951236038023
Epoch [9/100], Loss: 4.925725925287771
Epoch [10/100], Loss: 4.788286377333667
Epoch [11/100], Loss: 4.654340253259144
Epoch [12/100], Loss: 4.525081947772172
Epoch [13/100], Loss: 4.400665670705828
Epoch [14/100], Loss: 4.282407865617107
Epoch [15/100], Loss: 4.170887894874071
Epoch [16/100], Loss: 4.063412028797641
Epoch [17/100], Loss: 3.961555724596455
Epoch [18/100], Loss: 3.865112796607099
Epoch [19/100], Loss: 3.772506737651036
Epoch [20/100], Loss: 3.6830970927746627
Epoch [21/100], Loss: 3.6001997417487077
Epoch [22/100], Loss: 3.517418704183722
Epoch [23/100], Loss: 3.4398145640853546
Epoch [24/100], Loss: 3.36478088952039
Epoch [25/100], Loss: 3.293509612118241
Epoch

# **LSTM with attention**

In [43]:

class LSTM_Attention_Language_Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(LSTM_Attention_Language_Model, self).__init__()

        # Embedding layer to map token indices to embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # Attention layer
        self.attention = nn.Linear(hidden_dim, 1)

        # Fully connected layer to predict the next word
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        # Get embeddings for the input tokens
        x = self.embedding(x)

        # Pass through the LSTM
        lstm_out, _ = self.lstm(x)

        # Calculate attention scores and apply attention
        attention_scores = torch.tanh(self.attention(lstm_out))  # Shape: (batch_size, seq_length, 1)
        attention_weights = torch.softmax(attention_scores, dim=1)  # Normalize across the sequence
        context_vector = torch.sum(attention_weights * lstm_out, dim=1)  # Weighted sum of LSTM outputs

        # Pass through the fully connected layer
        out = self.fc(context_vector)

        return out

# Instantiate the LSTM with Attention model
attention_model = LSTM_Attention_Language_Model(vocab_size, embedding_dim, hidden_dim, output_size).to(device)

# Define the loss function and optimizer for the LSTM with Attention model
optimizer = torch.optim.Adam(attention_model.parameters(), lr=0.001)


# Training LSTM with attention (Skip to load models if you want to test)

In [44]:

# Train the LSTM with Attention model
num_epochs = 100
for epoch in range(num_epochs):
    attention_model.train()
    total_loss = 0
    for batch in train_loader:
        X_batch, y_batch = batch

        # Move batches to device
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        # Forward pass
        output = attention_model(X_batch)

        # Calculate loss
        loss = criterion(output, y_batch)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}")





Epoch [1/100], Loss: 7.092240381124826
Epoch [2/100], Loss: 6.535681319642821
Epoch [3/100], Loss: 6.18769973211915
Epoch [4/100], Loss: 5.939977040256027
Epoch [5/100], Loss: 5.745075912661216
Epoch [6/100], Loss: 5.575710743303136
Epoch [7/100], Loss: 5.419562386190224
Epoch [8/100], Loss: 5.273941019155683
Epoch [9/100], Loss: 5.1314756156754315
Epoch [10/100], Loss: 4.993332444026232
Epoch [11/100], Loss: 4.859609408970297
Epoch [12/100], Loss: 4.727086298077066
Epoch [13/100], Loss: 4.598809710964379
Epoch [14/100], Loss: 4.473517100016276
Epoch [15/100], Loss: 4.349426546816118
Epoch [16/100], Loss: 4.2312031546358355
Epoch [17/100], Loss: 4.117983337736478
Epoch [18/100], Loss: 4.009963662084872
Epoch [19/100], Loss: 3.909590666601548
Epoch [20/100], Loss: 3.814821043153749
Epoch [21/100], Loss: 3.725282434709461
Epoch [22/100], Loss: 3.641453137943054
Epoch [23/100], Loss: 3.5613864651561653
Epoch [24/100], Loss: 3.4848032687992365
Epoch [25/100], Loss: 3.4126677756761983
Epoch

# saving models (if needed after training):

In [45]:

torch.save(Rnn_model.state_dict(), 'rnn_model.pth')

torch.save(lstm_model.state_dict(), 'lstm_model.pth')

torch.save(attention_model.state_dict(), 'att_model.pth')

# Loading models:

In [46]:
Rnn_model.load_state_dict(torch.load('rnn_model.pth'))
lstm_model.load_state_dict(torch.load('lstm_model.pth'))
attention_model.load_state_dict(torch.load('att_model.pth'))

<All keys matched successfully>

In [47]:

# Function to preprocess the input string
def preprocess_input(text, word_to_idx, sequence_length=5):
    tokens = nltk.tokenize.word_tokenize(text.lower())  # Tokenize and convert to lowercase
    sequence = [word_to_idx[word] for word in tokens if word in word_to_idx]

    # If the sequence is shorter than the required length, pad it with zeros
    if len(sequence) < sequence_length:
        sequence = [0] * (sequence_length - len(sequence)) + sequence

    # Use only the last 'sequence_length' tokens (to keep the input size fixed)
    sequence = sequence[-sequence_length:]

    # Convert to tensor and return
    return torch.tensor(sequence, dtype=torch.long).unsqueeze(0)  # Add batch dimension

# Function to predict the next word
def predict_next_word(model, text, word_to_idx, idx_to_word, device, sequence_length=5):
    model.eval()  # Set the model to evaluation mode
    input_tensor = preprocess_input(text, word_to_idx, sequence_length).to(device)

    with torch.no_grad():
        # Get the output from the model
        output = model(input_tensor)

        # Get the index of the predicted word (the word with the highest probability)
        predicted_idx = torch.argmax(output, dim=1).item()

        # Map the index to the word
        predicted_word = idx_to_word[predicted_idx]

    return predicted_word


In [48]:

# Example usage:
text_input = "hi how are"

RNN_predicted_word = predict_next_word(Rnn_model, text_input, word_to_idx, idx_to_word, device)
print(f"Next word prediction (RNN): {RNN_predicted_word}")

lstm_predicted_word = predict_next_word(lstm_model, text_input, word_to_idx, idx_to_word, device)
print(f"Next word prediction (LSTM): {lstm_predicted_word}")


attention_predicted_word = predict_next_word(attention_model, text_input, word_to_idx, idx_to_word, device)
print(f"Next word prediction (LSTM with Attention): {attention_predicted_word}")


Next word prediction (RNN): you
Next word prediction (LSTM): trained
Next word prediction (LSTM with Attention): you


In [49]:
text_input=input("enter an incomplete sentence")
print("your sentence is:",text_input)
print("the next predicted word:")
RNN_predicted_word = predict_next_word(Rnn_model, text_input, word_to_idx, idx_to_word, device)
print(f"Next word prediction (RNN): {RNN_predicted_word}")

lstm_predicted_word = predict_next_word(lstm_model, text_input, word_to_idx, idx_to_word, device)
print(f"Next word prediction (LSTM): {lstm_predicted_word}")


attention_predicted_word = predict_next_word(attention_model, text_input, word_to_idx, idx_to_word, device)
print(f"Next word prediction (LSTM with Attention): {attention_predicted_word}")


your sentence is: i want to
the next predicted word:
Next word prediction (RNN): get
Next word prediction (LSTM): know
Next word prediction (LSTM with Attention): make


# **Conclusion**

The three models work well and both LSTM and LSTM with attention work alot better than the RNN but the attention doesnt improve the results in this scope(num of epochs,task,etc...)