Importing required files

In [2]:
import tensorflow as tf
# Ask TensorFlow which GPU device it sees and stop immediately unless it is GPU 0.
device_name = tf.test.gpu_device_name()
if not device_name == '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [3]:
import tensorflow as tf
import timeit

# Same GPU check as above, but print setup guidance before aborting so the
# reader knows how to enable a GPU runtime.
device_name = tf.test.gpu_device_name()
if not device_name == '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  """Convolve a random (100, 100, 100, 3) tensor with a fresh Conv2D(32, 7) on the CPU.

  Returns:
    The reduce_sum of the convolution output.
  """
  with tf.device('/cpu:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    return tf.math.reduce_sum(conv_layer(images))

def gpu():
  """Convolve a random (100, 100, 100, 3) tensor with a fresh Conv2D(32, 7) on GPU 0.

  Returns:
    The reduce_sum of the convolution output.
  """
  with tf.device('/device:GPU:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    return tf.math.reduce_sum(conv_layer(images))

# Warm-up: execute each op once before timing; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Time ten runs of each op and report the totals plus the CPU/GPU ratio.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', setup="from __main__ import cpu", number=10)
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', setup="from __main__ import gpu", number=10)
print(gpu_time)
speedup = int(cpu_time / gpu_time)  # truncated to a whole-number factor
print('GPU speedup over CPU: {}x'.format(speedup))

Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images (batch x height x width x channel). Sum of ten runs.
CPU (s):
6.100990279000001
GPU (s):
0.1592407299999934
GPU speedup over CPU: 38x


In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding,Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

Upload the corpus

In [5]:
# Colab-only step: files.upload() prompts for a local file and stores it in the
# runtime's working directory (the recorded output shows 5000lines.txt being
# saved). `uploaded` holds the call's return value and is not used again in
# the cells shown here.
from google.colab import files
uploaded = files.upload()

Saving 5000lines.txt to 5000lines.txt


# New section

Open and pre-process the data


In [6]:
# Read the whole corpus. The context manager guarantees the file handle is
# closed even if reading fails.
with open("5000lines.txt", "r", encoding="utf8") as file:
    # One list entry per line of the file (trailing newlines included).
    lines = file.readlines()

# Join the lines into one string in a single pass; the space separator keeps
# the last word of one line apart from the first word of the next.
data = ' '.join(lines)

# Strip characters that carry no meaning for the model: newline, carriage
# return, the BOM (\ufeff) and curly double quotes. These are removed, not
# replaced by a space -- the join above already inserted the separators.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“', '').replace('”', '')

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())

# Preview the first 500 characters of the cleaned text.
data[:500]

'’ میرے بھائی کا ای میل آیاہے ۔ بلال بھائی ، ہم آپ کی محنت کے معترف اور قائل ہیں ۔ اکیس ستمبر دو ہزار آٹھ کو میریئٹ ہوٹل پر ٹرک خود کش حملے میں چون افراد ہلاک ہوئے جبکہ اڑھائی سو سے زائد زخمی ہوئے ۔ محفوظات برائے خون کے آنسو ٹیگ شمشاد بھائ مجھے تو پتہ ہی نہیں تھا کہ محفل میں شادی دفتر بھی ہے اگر آپ کو زیادہ امیدوار کی ضرورت ہے تو میں بھی حاضر ہوں آخر ایک اور ایک گیارہ ہوتے ہیں ! يہ پچھلے پہر کا زرد رو چاند لفظ ندا عموماً اول آتا ہے لیکن زور دینے کے لیے آخر مین بھی آجاتا ہے ۔ " مخدوم محمد حسین ماد'

In [7]:
# Number of characters in the cleaned corpus string.
len(data)

371493

Apply tokenization and some other changes

In [8]:
# Build the word -> integer-id vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer so the prediction cell can reload the exact
# same vocabulary. The context manager makes sure the pickle is flushed and
# the handle closed before anything tries to read it back.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of word ids ([0] unwraps the single
# text that was passed in) and preview the first fifteen.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[99, 79, 73, 9, 583, 1182, 5126, 1, 1055, 73, 8, 46, 23, 5, 1056]

In [9]:
# Number of word tokens in the encoded corpus.
len(sequence_data)

85214

In [10]:
# Size of the id space used by the Embedding and the softmax output layer.
# +1 because Keras' Tokenizer numbers words from 1, leaving id 0 unassigned.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

11982


In [11]:
# Slide a 4-token window over the corpus one token at a time: the first three
# ids of each window are the context, the fourth is the word to predict.
sequences = [
    sequence_data[start:start + 4]
    for start in range(len(sequence_data) - 3)
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  85211


array([[  99,   79,   73,    9],
       [  79,   73,    9,  583],
       [  73,    9,  583, 1182],
       [   9,  583, 1182, 5126],
       [ 583, 1182, 5126,    1],
       [1182, 5126,    1, 1055],
       [5126,    1, 1055,   73],
       [   1, 1055,   73,    8],
       [1055,   73,    8,   46],
       [  73,    8,   46,   23]])

In [12]:
# Split every 4-token window into its 3-token context (X) and its target id (y).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [13]:
# Sanity check: show the first ten context windows next to their target ids.
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[  99   79   73]
 [  79   73    9]
 [  73    9  583]
 [   9  583 1182]
 [ 583 1182 5126]
 [1182 5126    1]
 [5126    1 1055]
 [   1 1055   73]
 [1055   73    8]
 [  73    8   46]]
Response:  [   9  583 1182 5126    1 1055   73    8   46   23]


In [14]:
# One-hot encode the target ids so they match the softmax output used with
# categorical_crossentropy in the training cell.
# NOTE(review): this materialises a dense (len(y), vocab_size) float32 array --
# roughly 85211 x 11982 x 4 bytes, about 4 GB, for the sizes printed above.
# Keeping the integer labels and training with sparse_categorical_crossentropy
# would avoid that; it needs a matching change in the compile() call.
# NOTE(review): the cell overwrites `y` in place, so running it twice one-hot
# encodes the already one-hot array -- re-run the previous cell first.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

 Creating the model

In [15]:
# Next-word model: 3 word ids -> 10-d embeddings -> LSTM(1000) -> Dense(1000, relu)
# -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [16]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             119820    
                                                                 
 lstm (LSTM)                 (None, 1000)              4044000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 11982)             11993982  
                                                                 
Total params: 17158802 (65.46 MB)
Trainable params: 17158802 (65.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
# from tensorflow import keras
# from keras.utils.vis_utils import plot_model

# keras.utils.plot_model(model, to_file='plot.png', show_layer_names=True)

ModuleNotFoundError: ignored

Train the model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the model to next_words.h5 after every epoch (save_best_only=False), so
# the file always holds the most recent epoch -- this is what the prediction
# cell loads. `monitor` has to name a key Keras actually logs: the metric
# compiled below is reported as 'accuracy' (lower-case); 'Accuracy' matches
# nothing and would make the callback skip saving if save_best_only were
# ever switched on.
checkpoint = ModelCheckpoint("next_words.h5", monitor='accuracy', verbose=1, save_best_only=False)

# The labels are one-hot encoded, hence categorical_crossentropy.
model.compile(loss="categorical_crossentropy", metrics=['accuracy'], optimizer=Adam(learning_rate=0.001))

# Hold out 20% of the samples for validation; `history` keeps the per-epoch
# metrics that the plotting cells read.
history = model.fit(X, y, validation_split=0.2, epochs=60, batch_size=64, callbacks=[checkpoint])



In [None]:
import matplotlib.pyplot as plt

# Training accuracy per epoch, as recorded by model.fit().
plt.plot(history.history['accuracy'])
plt.title('Model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epochs')
plt.legend(['accuracy'], loc='upper left')
# Call show(): a bare `plt.show` only echoes the function object as the cell
# output and never renders the figure explicitly.
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training loss per epoch, as recorded by model.fit().
plt.plot(history.history['loss'])
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(['loss'], loc='upper left')
# Call show(): a bare `plt.show` only echoes the function object as the cell
# output and never renders the figure explicitly.
plt.show()

Prediction

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the trained model and the tokenizer it was trained with.
model = load_model('next_words.h5')

# Close the pickle file deterministically instead of leaking the handle.
# NOTE: pickle.load executes code embedded in the file -- only load a
# token.pkl that this notebook produced itself.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
  """Predict, print and return the word that follows `text`.

  Args:
    model: object whose predict() returns scores indexed by word id.
    tokenizer: object providing texts_to_sequences() and a word_index
      mapping of word -> id.
    text: the context; it is wrapped in a one-element list and passed to
      tokenizer.texts_to_sequences().

  Returns:
    The word whose id has the highest score, or "" when no word in
    word_index carries that id.
  """
  token_ids = np.array(tokenizer.texts_to_sequences([text]))
  best_id = np.argmax(model.predict(token_ids))

  # Reverse lookup: first vocabulary entry whose id equals the prediction.
  predicted_word = next(
      (word for word, word_id in tokenizer.word_index.items() if word_id == best_id),
      "")

  print(predicted_word)
  return predicted_word

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch index.

    Args:
        history: object whose `.history` mapping holds per-epoch metric lists
            (the value returned by model.fit()).
        string: key of the metric to plot; also used as the y-axis label.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    plt.ylabel(string)
    plt.xlabel("Epochs")
    plt.show()

In [None]:
# Interactive prompt: keep predicting the next word until the user enters "0".
while True:
  text = input("Enter your line: ")

  # "0" is the sentinel that ends the session.
  if text == "0":
      print("Execution completed.....")
      break

  try:
      # The model was trained on 3-word contexts, so feed it the three words
      # that immediately precede the one to predict, i.e. the LAST three.
      # split() without an argument also ignores repeated or trailing spaces,
      # which split(" ") would turn into empty tokens.
      words = text.split()[-3:]
      print(words)

      Predict_Next_Words(model, tokenizer, words)

  except Exception as e:
      # Best effort: report the problem (e.g. too few known words) and
      # prompt again instead of ending the session.
      print("Error occurred: ", e)