<a href="https://colab.research.google.com/github/Aayush360/Natural_langauge_processing/blob/main/NLP_Text_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

import re

from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout,Embedding

In [2]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
df_bkp = pd.read_csv('DataSetNew.txt',sep="\n", header=None)

In [4]:
df_bkp.head()

Unnamed: 0,0
0,hello
1,"hi, how are you"
2,hi
3,"hello, welcome to naulo restaurant"
4,hey


In [5]:
df_bkp.tail()

Unnamed: 0,0
565,santosh shah who dedicated his life to cooking...
566,what restaurant does santosh shah saha work
567,santosh shah works in cinnamon kitchen as head...
568,what are the upcoming events of santosh shah saha
569,there are no upcoming events of santosh shah a...


In [6]:
data = df_bkp.copy()

In [7]:
data.columns=['Sentences']

In [8]:
data['Sentences']

0                                                  hello
1                                        hi, how are you
2                                                     hi
3                     hello, welcome to naulo restaurant
4                                                    hey
                             ...                        
565    santosh shah who dedicated his life to cooking...
566          what restaurant does santosh shah saha work
567    santosh shah works in cinnamon kitchen as head...
568    what are the upcoming events of santosh shah saha
569    there are no upcoming events of santosh shah a...
Name: Sentences, Length: 570, dtype: object

In [9]:
## Pre-Process the data

In [10]:
stop = set(stopwords.words('english'))

def stopword_removal(data_point):
  data= [x for x in data_point.split() if x not in stop]
  return data

In [11]:
def clean_data(data): 
  '''works for single sentence and returns cleaned tokens of words in the sentence along with unique_words'''
  cleaned_data =[]
  all_unique_words_in_each_description=[]
  for entry in data:
    entry = re.sub(pattern='[^a-zA-Z]',repl=" ", string=entry) # check if alphabet, remove non-alphabet
    entry = re.sub(r'\b\w{0,1}\b', repl=" ", string=entry)  # remove single word or blank character/word
    entry =entry.lower()
    entry = stopword_removal(entry)
    cleaned_data.append(entry) # list with in list 0(1) time-complexity
    unique = list(set(entry))
    all_unique_words_in_each_description.extend(unique) # appended list O(K) time-complexity, k- lenght of list need to be added
  return cleaned_data,all_unique_words_in_each_description

In [12]:
# function to make vocabulary

def unique_words(data):
  unique_words = set(data)
  return unique_words, len(unique_words)

In [13]:
## applying data cleaning

In [14]:
cleaned_data, all_unique_words_in_each_description = clean_data(data['Sentences'])


In [15]:
unique_words, len_unique_words = unique_words(all_unique_words_in_each_description)

In [None]:
unique_words

In [17]:
len_unique_words  # this is our vocab_size


847

In [18]:

## peeking at one of the cleaned data

len(cleaned_data[0])

1

In [19]:
cleaned_data[3]

['hello', 'welcome', 'naulo', 'restaurant']

In [20]:
## Building a Mapper

In [21]:
def build_indices(unique_words):
  word_to_index = {}
  index_to_word = {}
  for i,word in enumerate(unique_words):
    word_to_index[word]=i
    index_to_word[i]=word
  return word_to_index, index_to_word

In [22]:
word_to_index, index_to_word = build_indices(unique_words)

In [23]:
## prepare training corpus

In [24]:
def prepare_corpus(corpus,word_to_index):
  sequences= []
  for line in corpus:
    tokens = line
    for i in range(1, len(tokens)):
      i_gram_sequence = tokens[:i+1]
      i_gram_sequence_ids = []

      for j,token in enumerate(i_gram_sequence):
        i_gram_sequence_ids.append(word_to_index[token])
      sequences.append(i_gram_sequence_ids)
  return sequences

In [25]:
sequences=prepare_corpus(cleaned_data,word_to_index)
max_sequence_len = max([len(x) for x in sequences]) # find sentence with maximum length

In [26]:
len(sequences) # our sequences contains 2828 list with colleciton of n-grams for each sentence staritng from length 2


2828

In [27]:
max_sequence_len # so the max len of sentece after cleaning and tokenizing is 308


38

In [28]:
sequences[0] # looking at the first sentence and its i-gram indices (first sentence with 2 words indexes)


[75, 102]

In [29]:
sequences[1]


[75, 102, 89]

In [30]:
# let us see which words are mapped to these indices

print(index_to_word[640])
print(index_to_word[286])
print(index_to_word[733])

putting
assist
mexicanna


In [31]:
# Finally, we have correctly build our sequences


In [32]:

## Preparing the data for trainng the model
# we should pad the input to the longest words in the sequences 38 in this case
# also we need to split the data into dependent and independent variable

In [33]:
def build_input_data(sequences,max_sequence_len, len_unique_words):
  sequences = np.array(pad_sequences(sequences,maxlen=max_sequence_len, padding='pre'))
  X = sequences[:,:-1]
  y = sequences[:,-1]
  y = np_utils.to_categorical(y,len_unique_words) # convert correspoding row index to 1, and len of each word is vector of length len_unique_words
  return X,y

In [34]:
X,y = build_input_data(sequences,max_sequence_len,len_unique_words)


In [35]:
### Building the model

input_dim: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words. output_dim: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem. input_length: This is the length of input sequences, as you would define for any input layer of a Keras model. For example, if all of your input documents are comprised of 1000 words, this would be 1000.

In [36]:
# 10 indicates that we want a Dense embedding of size 10 as output from our model.
# 128 LSTM units in the hidden layer, dimensionality of the inner cells in the LSTM layer.
# randomly drop off 20% of neurons from the network using the Dropout layer.
#  multi-class classification problem, and so the softmax activation function is used.

In [37]:

def create_model(max_sequence_len, len_unique_words):
  model = Sequential()
  model.add(Embedding(len_unique_words,10,input_length=max_sequence_len-1))
  model.add(LSTM(128))
  model.add(Dropout(0.2))
  model.add(Dense(len_unique_words, activation='softmax'))
  model.compile(loss='categorical_crossentropy',optimizer='adam')
  return model

In [38]:

model = create_model(max_sequence_len, len_unique_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 37, 10)            8470      
_________________________________________________________________
lstm (LSTM)                  (None, 128)               71168     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 847)               109263    
Total params: 188,901
Trainable params: 188,901
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X,y,batch_size=128, epochs=100)


In [40]:
## testing and text generation

In [41]:

def generate_text(seed_text, next_words, model, max_seq_len):
  for _ in range(next_words):
    cleaned_data = clean_data([seed_text])
    sequences = prepare_corpus(cleaned_data[0],word_to_index)
    sequences = pad_sequences([sequences[-1]],maxlen=max_seq_len-1,padding='pre')
    predicted = model.predict_classes(sequences, verbose=0)
    output_word=''
    output_word = index_to_word[predicted[0]]
    seed_text = seed_text+" "+output_word
  return seed_text.title()

In [51]:
print(generate_text("Hello do you like",8,model,max_sequence_len))



Hello Do You Like Suggest Favorite Cheese Brust Pizza Done Pizza Thank
