<a href="https://colab.research.google.com/github/ArpanChaudhary/TextGenerationModel/blob/main/TextGenerationModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import pickle
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import LSTM,Dense,Activation
from tensorflow.keras.optimizers import RMSprop

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('/content/ds_data.csv')

In [3]:
df.head()

Unnamed: 0,Text
0,Introduction to
1,Statistical and
2,Machine Learning
3,Methods for
4,Data Science


In [4]:
text = list(df.Text.values)

joined_text = " ".join(text)

In [5]:
len(joined_text)

400025

In [6]:
partial_text = joined_text[1:]

In [7]:
tokenizer = RegexpTokenizer(r"\w+")

tokens = tokenizer.tokenize(partial_text.lower())

In [8]:
unique_tokens = np.unique(tokens)

unique_tokens_index = {token: idx for idx, token in enumerate(unique_tokens)}

In [9]:
n_words = 10

input_words = []

next_words = []


for i in range(len(tokens) - n_words):
  input_words.append(tokens[i:i+n_words])
  next_words.append(tokens[i+n_words])

In [10]:
input_words

[['ntroduction',
  'to',
  'statistical',
  'and',
  'machine',
  'learning',
  'methods',
  'for',
  'data',
  'science'],
 ['to',
  'statistical',
  'and',
  'machine',
  'learning',
  'methods',
  'for',
  'data',
  'science',
  'carlos'],
 ['statistical',
  'and',
  'machine',
  'learning',
  'methods',
  'for',
  'data',
  'science',
  'carlos',
  'andre'],
 ['and',
  'machine',
  'learning',
  'methods',
  'for',
  'data',
  'science',
  'carlos',
  'andre',
  'reis'],
 ['machine',
  'learning',
  'methods',
  'for',
  'data',
  'science',
  'carlos',
  'andre',
  'reis',
  'pinheiro'],
 ['learning',
  'methods',
  'for',
  'data',
  'science',
  'carlos',
  'andre',
  'reis',
  'pinheiro',
  'mike'],
 ['methods',
  'for',
  'data',
  'science',
  'carlos',
  'andre',
  'reis',
  'pinheiro',
  'mike',
  'patettathe'],
 ['for',
  'data',
  'science',
  'carlos',
  'andre',
  'reis',
  'pinheiro',
  'mike',
  'patettathe',
  'correct'],
 ['data',
  'science',
  'carlos',
  'andre',

In [11]:
next_words

['carlos',
 'andre',
 'reis',
 'pinheiro',
 'mike',
 'patettathe',
 'correct',
 'bibliographic',
 'citation',
 'for',
 'this',
 'manual',
 'is',
 'as',
 'follows',
 'pinheiro',
 'carlos',
 'andre',
 'reis',
 'and',
 'mike',
 'patetta',
 '2021',
 'introduction',
 'to',
 'statistical',
 'and',
 'machine',
 'learning',
 'methods',
 'for',
 'data',
 'science',
 'cary',
 'nc',
 'sas',
 'institute',
 'inc',
 'introduction',
 'to',
 'statistical',
 'and',
 'machine',
 'learning',
 'methods',
 'for',
 'data',
 'sciencecopyright',
 '2021',
 'sas',
 'institute',
 'inc',
 'cary',
 'nc',
 'usaisbn',
 '978',
 '1',
 '953329',
 '64',
 '6',
 'hardcover',
 'isbn',
 '978',
 '1',
 '953329',
 '60',
 '8',
 'paperback',
 'isbn',
 '978',
 '1',
 '953329',
 '61',
 '5',
 'web',
 'pdf',
 'isbn',
 '978',
 '1',
 '953329',
 '62',
 '2',
 'epub',
 'isbn',
 '978',
 '1',
 '953329',
 '63',
 '9',
 'kindle',
 'all',
 'rights',
 'reserved',
 'produced',
 'in',
 'the',
 'united',
 'states',
 'of',
 'america',
 'for',
 'a',


In [12]:
X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)

y = np.zeros((len(next_words), len(unique_tokens)), dtype=bool)

In [13]:
for i, words in enumerate(input_words):
  for j, word in enumerate(words):
    X[i, j, unique_tokens_index[word]] = 1
  y[i, unique_tokens_index[next_words[i]]] = 1

In [14]:
model = Sequential()

model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation('softmax'))

In [15]:
model.compile(loss='categorical_crossentropy',optimizer=RMSprop(learning_rate=0.01), metrics=['accuracy'])
model.fit(X,y, batch_size=128, epochs=10, shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d4f4809d6f0>

In [16]:
model.save("my_model.h5")

In [17]:
model = load_model("my_model.h5")

In [18]:
def predict_next_word(input_text,n_best):
  input_text = input_text.lower()
  X = np.zeros((1, n_words, len(unique_tokens)))
  for i, word in enumerate(input_text.split()):
    X[0, i, unique_tokens_index[word]] = 1

  predictions = model.predict(X)[0]
  return np.argpartition(predictions, -n_best)[-n_best:]

In [19]:
possible = predict_next_word(" was working on a project", 5)



In [20]:
print([unique_tokens[idx] for idx in possible])

['or', 'chapter', 'use', 'data', 'text']


In [21]:
def generate_text(input_text, text_length, creativity=3):
  word_sequence = input_text.split()
  current = 0

  for _ in range(text_length):
    sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
    try:
      choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
    except:
      choice = random.choice(unique_tokens)
    word_sequence.append(choice)
    current+=1
  return " ".join(word_sequence)

In [22]:
generate_text("Linear regression is most powerful and most widely used algorithm for supervised machine learning.", 10, 5)



'Linear regression is most powerful and most widely used algorithm for supervised machine learning. in data machine to models the analysis learning solve and'

In [23]:
generate_text("I love you.",5,3)



'I love you. predicts misclassification use use information'

In [24]:
generate_text("I am learning data science.",5,5)



'I am learning data science. bsc spot model text text'

In [25]:
generate_text("working on a natural language model",5,3)



'working on a natural language model chapter text approach introduction introduction'

In [26]:
generate_text("Logistic regression is used for classification problem in supervised machine learning", 50, 5)



'Logistic regression is used for classification problem in supervised machine learning models models and understand understand the a role goal in this that process containing that a best key level in of figure the 6 7 training hierarchical decision clusters trees for we new can model add this important model network a optimization optimization model a will fit a a set'

In [27]:
generate_text("decision tree is an algorithm which works as the rule based algorithm", 50, 3)



'decision tree is an algorithm which works as the rule based algorithm a to on neural find a networks evaluate classification a the and and rule b the this or is is b calculated the to if lift positive this is b is going the greater the false cycle performance curve or and can a its learn predictive number this models to'