In [None]:
import tarfile
!wget http://wstyler.ucsd.edu/files/enronsentv1.tar.gz
my_tar = tarfile.open('/content/enronsentv1.tar.gz')
my_tar.extractall('/content/') # specify which folder to extract to
my_tar.close()

--2023-05-20 19:26:40--  http://wstyler.ucsd.edu/files/enronsentv1.tar.gz
Resolving wstyler.ucsd.edu (wstyler.ucsd.edu)... 132.239.147.75
Connecting to wstyler.ucsd.edu (wstyler.ucsd.edu)|132.239.147.75|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://wstyler.ucsd.edu/files/enronsentv1.tar.gz [following]
--2023-05-20 19:26:41--  https://wstyler.ucsd.edu/files/enronsentv1.tar.gz
Connecting to wstyler.ucsd.edu (wstyler.ucsd.edu)|132.239.147.75|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26406491 (25M) [application/x-gzip]
Saving to: ‘enronsentv1.tar.gz’


2023-05-20 19:26:42 (24.3 MB/s) - ‘enronsentv1.tar.gz’ saved [26406491/26406491]



In [None]:
import os
import numpy as np
data_dir = '/content/enronsent'
train_files = 10
val_files = 5


def _read_corpus_files(directory, first, last):
    """Return the concatenated contents of enronsent<first> .. enronsent<last - 1>.

    File names use a two-digit, zero-padded index (e.g. ``enronsent03``) and
    are read in increasing index order.
    """
    pieces = []
    for number in range(first, last):
        path = os.path.join(directory, f'enronsent{number:02d}')
        with open(path, 'r') as handle:
            pieces.append(handle.read())
    return "".join(pieces)


# The first `train_files` files form the training text, the next `val_files`
# files form the validation text.
train_text = _read_corpus_files(data_dir, 0, train_files)
val_text = _read_corpus_files(data_dir, train_files, train_files + val_files)

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import string
# Download the NLTK data packages used by the tokenizer, lemmatizer and
# stopword imports above. The value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Preprocess the data: lowercase, strip punctuation, collapse whitespace,
# remove digits, tokenize, and drop English stopwords.
# NOTE(review): stemming and lemmatization were also listed here originally,
# but preprocessingAndgettokens below never calls `lemmatizer` or `stemmer`.

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocessingAndgettokens(text):
    """Clean and tokenize every document in ``text``.

    Each document is lowercased, stripped of ASCII punctuation, whitespace
    normalized, stripped of digit characters, tokenized with
    ``word_tokenize`` and finally filtered against the English stopword list.

    Args:
        text: Iterable of strings, one per document.

    Returns:
        A list with one list of tokens per input document, in input order.
    """
    # Both helpers are loop-invariant, so build them once instead of once per
    # document (the stopword list is read from disk on every call).
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words("english"))

    cleaned_data = []
    for document in text:
        document = document.lower()
        document = document.translate(punctuation_table)
        document = " ".join(document.split())
        document = ''.join(c for c in document if not c.isdigit())
        word_tokens = word_tokenize(document)
        cleaned_data.append([word for word in word_tokens if word not in stop_words])
    return cleaned_data

# One call handles both corpora: the function cleans each document
# independently and returns the token lists in input order.
train_data, val_data = preprocessingAndgettokens([train_text, val_text])

In [None]:
def create_dataset(words, window_size, stride):
    """Slice ``words`` into fixed-size windows and split each into (context, target).

    Windows start at positions 0, stride, 2 * stride, ... and only full
    windows are kept. The last element of a window is its target; the
    elements before it are the context.

    Args:
        words: Sequence of tokens.
        window_size: Number of tokens per window (context plus target).
        stride: Step between consecutive window starts.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding the contexts and the targets.
    """
    start_limit = len(words) - window_size + 1
    windows = [words[start:start + window_size] for start in range(0, start_limit, stride)]
    contexts = [window[:-1] for window in windows]
    targets = [window[-1] for window in windows]
    return np.array(contexts), np.array(targets)

In [None]:
# Non-overlapping windows (stride equals window size): 9 context tokens plus
# 1 target token per sample, built the same way for both splits.
(X_train, y_train), (X_val, y_val) = (
    create_dataset(tokens, window_size=10, stride=10)
    for tokens in (train_data, val_data)
)

In [None]:
# Release the token lists; the windowed arrays built from them are what the
# following cells use.
del train_data, val_data

In [None]:
# Sanity check: shapes of the windowed inputs and targets for both splits.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(181763, 9) (181763,) (84542, 9) (84542,)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
import gensim.downloader as api
from tqdm import tqdm
from gensim.models import Word2Vec
import numpy as np

# Load the pretrained fastText vectors through gensim's downloader.
# NOTE: max_sequence_length, embedding_dim and embedding_matrix are all
# reassigned by the Word2Vec cell that follows.
pretreained_model = api.load("fasttext-wiki-news-subwords-300")
max_sequence_length = 10
embedding_dim = pretreained_model.vector_size

# Row 0 stays all-zero; the key with gensim index i is stored in row i + 1.
embedding_matrix = np.zeros((len(pretreained_model.key_to_index) + 1, embedding_dim))
for row, token in enumerate(pretreained_model.index_to_key, start=1):
    embedding_matrix[row] = pretreained_model.get_vector(token)

In [None]:
# Train Word2Vec on every context window, training and validation together.
corpus = np.concatenate((X_train, X_val)).tolist()
trained_model = Word2Vec(
    sentences=tqdm(corpus),
    vector_size=300,
    window=10,
    min_count=1,
    workers=4,
    epochs=50,
)

max_sequence_length = 10
embedding_dim = trained_model.vector_size

# Row 0 stays all-zero; the key with gensim index i is stored in row i + 1.
embedding_matrix = np.zeros((len(trained_model.wv.key_to_index) + 1, embedding_dim))
for row, token in enumerate(trained_model.wv.index_to_key, start=1):
    embedding_matrix[row] = trained_model.wv[token]


100%|██████████| 266305/266305 [00:00<00:00, 467881.10it/s]


In [None]:
def preprocess_sequences(sequences, model, max_sequence_length):
    """Encode token sequences as padded rows of embedding-matrix indices.

    The embedding matrix built in this notebook keeps row 0 all-zero and
    stores the vector of the word with gensim index ``i`` in row ``i + 1``.
    Words are therefore encoded as ``key_to_index[word] + 1``. Id 0 is left
    for padding and for words missing from the vocabulary. Any other encoder
    that feeds the same network must use this same offset.

    Args:
        sequences: Iterable of token sequences.
        model: Object exposing ``model.wv.key_to_index`` (a word -> index dict).
        max_sequence_length: Length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` for the encoded sequences.
    """
    key_to_index = model.wv.key_to_index
    sequences_encoded = [
        # The +1 shift keeps the ids aligned with the embedding-matrix rows.
        [key_to_index[word] + 1 if word in key_to_index else 0 for word in sublist]
        for sublist in sequences
    ]
    sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(sequences_encoded, maxlen=max_sequence_length)
    return sequences_padded



def encode_targets(target_words):
    """Map target words to integer class ids.

    Ids are assigned in sorted order of the distinct words, so the mapping is
    the same on every run. Iterating a ``set`` of strings directly does not
    guarantee that across interpreter sessions.

    Args:
        target_words: Iterable of words (list or 1-D NumPy array).

    Returns:
        Tuple ``(target_encoded, word_to_int, num_classes)``: an integer
        NumPy array with one id per input word, the word -> id dictionary,
        and the number of distinct words.
    """
    vocabulary = sorted(set(target_words))
    word_to_int = {word: i for i, word in enumerate(vocabulary)}
    num_classes = len(word_to_int)
    # dtype=int keeps an empty input from turning into a float array.
    target_encoded = np.array([word_to_int[word] for word in target_words], dtype=int)

    return target_encoded, word_to_int, num_classes


In [None]:
# Build ONE label vocabulary over the training and validation targets.
# Encoding each split with its own mapping would give the same integer a
# different word in each split, making the validation loss and accuracy
# meaningless.
y_all_encoded, word_to_int_train, num_classes_train = encode_targets(np.concatenate((y_train, y_val)))
y_train_encoded = y_all_encoded[:len(y_train)]
y_val_encoded = y_all_encoded[len(y_train):]
# Both splits share the mapping; the *_val names are kept for later cells.
word_to_int_val, num_classes_val = word_to_int_train, num_classes_train
X_train_padded = preprocess_sequences(X_train, trained_model, max_sequence_length)
X_val_padded = preprocess_sequences(X_val, trained_model, max_sequence_length)

In [None]:
# Release the raw string arrays; the encoded and padded versions created in
# the previous cell are what the model consumes.
del X_train, X_val, y_train, y_val

In [None]:
# Frozen embedding layer initialised from the Word2Vec embedding matrix.
embedding_layer = Embedding(
    input_dim=len(trained_model.wv.key_to_index) + 1,
    output_dim=embedding_dim,
    input_length=max_sequence_length,
    weights=[embedding_matrix],
    trainable=False
)

# Two stacked LSTMs followed by a softmax over the target vocabulary.
AutoComplete = Sequential()
AutoComplete.add(embedding_layer)
AutoComplete.add(LSTM(units=256, return_sequences=True))
AutoComplete.add(LSTM(units=256))
# Dropout regularises the LSTM features. It must come BEFORE the output
# layer: applied after the softmax it zeroes and rescales the predicted
# probabilities during training, so they no longer form a distribution.
AutoComplete.add(Dropout(0.2))
AutoComplete.add(Dense(units=num_classes_train, activation='softmax'))

AutoComplete.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 50 epochs in mini-batches of 64; the validation split is passed
# through `validation_data`.
AutoComplete.fit(
    X_train_padded,
    y_train_encoded,
    validation_data=(X_val_padded, y_val_encoded),
    epochs=50,
    batch_size=64,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fcb0d4b9e40>

In [None]:
def preprocess_input_sequence(sequence, model, max_sequence_length):
    """Encode and pad a single token sequence for prediction.

    Delegates to ``preprocess_sequences`` so that inference always uses
    exactly the same word -> id encoding as training. It also avoids
    rebuilding a full vocabulary dictionary on every call.

    Args:
        sequence: List of tokens typed or accepted so far.
        model: Word2Vec model whose vocabulary defines the ids.
        max_sequence_length: Length the encoded sequence is padded to.

    Returns:
        The padded batch, containing one sequence, returned by
        ``preprocess_sequences``.
    """
    return preprocess_sequences([sequence], model, max_sequence_length)

In [None]:
int_to_word_train = {value: key for key, value in word_to_int_train.items()}
int_to_word_val = {value: key for key, value in word_to_int_val.items()}

# The network's output units follow the training mapping: the Dense layer has
# num_classes_train units and is fitted on y_train_encoded. Training entries
# must therefore win whenever both maps define the same id. Applying the
# validation map last would decode predictions to the wrong words.
int_to_word_combined = {}
int_to_word_combined.update(int_to_word_val)
int_to_word_combined.update(int_to_word_train)

In [None]:
def _predict_next_word(sequence, model, max_sequence_length, int_to_word):
    """Return the word the AutoComplete network ranks highest after ``sequence``."""
    # The training text was lowercased during preprocessing, so typed words
    # are normalised the same way before they are looked up in the vocabulary.
    normalized = [token.strip().lower() for token in sequence]
    encoded = preprocess_input_sequence(normalized, model, max_sequence_length)
    probabilities = AutoComplete.predict(encoded)
    return int_to_word[np.argmax(probabilities)]


def SentenceAutoComplete(pretreained_model, max_sequence_length, int_to_word_combined):
    """Interactively build a sentence with next-word suggestions.

    The user types a word and the model proposes the next one. Answering
    "yes" appends the proposal and asks for another; "no" returns to manual
    entry. Typing "-1" ends the session and prints the sentence.

    Args:
        pretreained_model: Word2Vec model whose vocabulary encodes the words.
        max_sequence_length: Length the encoded sequence is padded to.
        int_to_word_combined: Mapping from predicted class id to word.
    """
    predicted_sequence = []  # Sequence of correct predictions and user inputs

    def suggest():
        # Predict from everything accepted so far and show the proposal.
        candidate = _predict_next_word(
            predicted_sequence, pretreained_model, max_sequence_length, int_to_word_combined
        )
        print(f"Is your next word: '{candidate}'")
        return candidate

    while True:
        word = input("Enter Next word (-1 to terminate): ").strip()
        if word == "-1":
            break
        if not word:
            # Ignore blank input instead of adding an empty token.
            continue

        predicted_sequence.append(word)
        predicted_word = suggest()

        while True:
            user_input = input("Yes/No: ")
            if user_input.lower() == "no":
                break
            elif user_input.lower() == 'yes':
                predicted_sequence.append(predicted_word)
                predicted_word = suggest()
            else:
                print("wrong Answer")

    print("Your final Sentence is:", ' '.join(predicted_sequence))

In [None]:
# Interactive demo session; the recorded prompts and answers follow as cell output.
SentenceAutoComplete(trained_model, max_sequence_length, int_to_word_combined)

Enter Next word (-1 to terminate): hello
Is your next word: 'yere'
Yes/No: no
Enter Next word (-1 to terminate): my
Is your next word: 'good'
Yes/No: no
Enter Next word (-1 to terminate): name
Is your next word: 'forcing'
Yes/No: no
Enter Next word (-1 to terminate): is
Is your next word: 'parameters'
Yes/No: yes
Is your next word: 'appl'
Yes/No: no
Enter Next word (-1 to terminate): -1
Your final Sentence is: hello my name is parameters


In [None]:
# Second interactive demo session; the recorded prompts and answers follow as cell output.
SentenceAutoComplete(trained_model, max_sequence_length, int_to_word_combined)

Enter Next word (-1 to terminate): the
Is your next word: 'ogc'
Yes/No: no
Enter Next word (-1 to terminate): world
Is your next word: 'chicfila'
Yes/No: no
Enter Next word (-1 to terminate): is
Is your next word: 'suffered'
Yes/No: no
Enter Next word (-1 to terminate): -1
Your final Sentence is: the world is


In [None]:
# Third interactive demo session; the recorded prompts and answers follow as cell output.
SentenceAutoComplete(trained_model, max_sequence_length, int_to_word_combined)

Enter Next word (-1 to terminate): hey
Is your next word: 'dc'
Yes/No: no
Enter Next word (-1 to terminate): there
Is your next word: 'suffered'
Yes/No: no
Enter Next word (-1 to terminate): mate
Is your next word: 'doc'
Yes/No: no
Enter Next word (-1 to terminate): -1
Your final Sentence is: hey there mate
