In [11]:
import pandas as pd
import numpy as np

import tensorflow as tf
# from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense,Dropout
from keras.preprocessing.text import Tokenizer
import keras.utils as ku
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import re
import string

In [12]:
df = pd.read_csv('/content/transcripts.csv.zip')

In [13]:
df.shape

(2467, 2)

In [14]:
fraction = 0.03
df = df.sample(frac=fraction, random_state=42)

In [15]:
df.shape

(74, 2)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74 entries, 1711 to 1173
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   transcript  74 non-null     object
 1   url         74 non-null     object
dtypes: object(2)
memory usage: 1.7+ KB


In [17]:
df['transcript'] = df['transcript'].astype(str)

In [18]:
# text=''
# for i in df['transcript']:
#     text=text + i

In [19]:
# train_text = text[:500000]

In [20]:
# cleaned_text = []
# for i in train_text.split():
#     i = i.lower()
#     i = re.sub('https?://\S+|www\.\S+', '', i)

#     #now stemming all the words that are not stop words
#     # i = i.split()
#     # review = [ps.stem(word) for word in review if not word in stop_words]
#     # review = ' '.join(review)
#     i = re.sub('\\W', ' ', i)
#     i = re.sub('\n', '', i)
#     i = re.sub(' +', ' ', i)
#     i = re.sub('^ ', '', i)
#     i = re.sub(' $', '', i)
#     cleaned_text.append(i)

In [21]:
# train_text[:500]

In [22]:
def clean(text):
    return re.sub("([\(\[]).*?([\)\]])", "\g<1>\g<2>",text)

In [23]:
df['transcript'] = df['transcript'].apply(lambda x:clean(x))

In [24]:
df['transcript']

1711    I would like to share with you a new model of ...
1557    Mobility in developing world cities is a very ...
1870    I dedicated the past two years to understandin...
1703    When I was preparing for this talk, I went to ...
1857    We are built out of very small stuff, and we a...
                              ...                        
1851    Today I'm going to speak to you about the last...
296     What I want to tell you about today is how I s...
435     This is a sculpture I made, which is a way of,...
1993    Interpreter: Piano, "p," is my favorite musica...
1173                                     ()()()()()()()()
Name: transcript, Length: 74, dtype: object

In [25]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [26]:
def clean_text(txt):
    return txt.translate(str.maketrans('', '', string.punctuation))

In [27]:
df['transcript'] = df['transcript'].apply(lambda x:clean_text(x))

In [28]:
df['transcript']

1711    I would like to share with you a new model of ...
1557    Mobility in developing world cities is a very ...
1870    I dedicated the past two years to understandin...
1703    When I was preparing for this talk I went to s...
1857    We are built out of very small stuff and we ar...
                              ...                        
1851    Today Im going to speak to you about the last ...
296     What I want to tell you about today is how I s...
435     This is a sculpture I made which is a way of k...
1993    Interpreter Piano p is my favorite musical sym...
1173                                                     
Name: transcript, Length: 74, dtype: object

In [29]:
token = Tokenizer()
token.fit_on_texts(df['transcript'])

In [30]:
token.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'that': 6,
 'in': 7,
 'i': 8,
 'is': 9,
 'you': 10,
 'we': 11,
 'it': 12,
 'this': 13,
 'was': 14,
 'for': 15,
 'so': 16,
 'are': 17,
 'have': 18,
 'on': 19,
 'they': 20,
 'with': 21,
 'its': 22,
 'can': 23,
 'what': 24,
 'but': 25,
 '—': 26,
 'about': 27,
 'be': 28,
 'at': 29,
 'do': 30,
 'as': 31,
 'not': 32,
 'all': 33,
 'my': 34,
 'were': 35,
 'people': 36,
 'from': 37,
 'there': 38,
 'one': 39,
 'if': 40,
 'like': 41,
 'our': 42,
 'or': 43,
 'just': 44,
 'these': 45,
 'me': 46,
 'very': 47,
 'he': 48,
 'had': 49,
 'because': 50,
 'an': 51,
 'now': 52,
 'out': 53,
 'how': 54,
 'when': 55,
 'thats': 56,
 'more': 57,
 'see': 58,
 'which': 59,
 'by': 60,
 'them': 61,
 'up': 62,
 'know': 63,
 'their': 64,
 'would': 65,
 'going': 66,
 'think': 67,
 'really': 68,
 'your': 69,
 'years': 70,
 'who': 71,
 'us': 72,
 'get': 73,
 'time': 74,
 'some': 75,
 'world': 76,
 'here': 77,
 'then': 78,
 'has': 79,
 'said': 80,
 'little': 81,
 'dont':

In [31]:
len(token.word_index)

12152

In [32]:
len(token.word_counts)

12152

In [33]:
token.document_count

74

In [34]:
sequence = token.texts_to_sequences(df['transcript'])

In [35]:
sequence

[[8,
  65,
  41,
  3,
  428,
  21,
  10,
  5,
  124,
  387,
  4,
  418,
  374,
  5,
  387,
  6,
  358,
  3949,
  23,
  5789,
  1,
  1675,
  1501,
  4,
  553,
  4,
  1150,
  2,
  3067,
  910,
  6,
  1222,
  65,
  28,
  287,
  685,
  117,
  29,
  1,
  76,
  718,
  5,
  217,
  2,
  911,
  19,
  12,
  10,
  87,
  181,
  606,
  3068,
  418,
  374,
  214,
  719,
  75,
  4,
  5790,
  2535,
  14,
  1151,
  7,
  5791,
  3,
  5,
  328,
  4,
  359,
  190,
  554,
  1,
  859,
  503,
  48,
  2,
  111,
  328,
  35,
  2170,
  3,
  3069,
  3,
  5792,
  38,
  7,
  5793,
  4,
  111,
  1152,
  48,
  5794,
  439,
  292,
  21,
  860,
  686,
  3950,
  48,
  211,
  3,
  912,
  3,
  418,
  374,
  25,
  1347,
  3,
  111,
  328,
  329,
  19,
  1,
  913,
  522,
  48,
  14,
  687,
  1348,
  3,
  607,
  914,
  3,
  138,
  2,
  1502,
  121,
  242,
  3,
  1349,
  111,
  328,
  2535,
  154,
  419,
  62,
  111,
  808,
  4,
  418,
  374,
  658,
  29,
  591,
  168,
  138,
  48,
  5795,
  1,
  2171,
  212,
  15,
  429,
  

In [36]:
def flatten_list(nested_list):
    flattened = []
    for item in nested_list:
        if isinstance(item, list):
            flattened.extend(flatten_list(item))
        else:
            flattened.append(item)
    return flattened


sequence_list = flatten_list(sequence)



In [37]:
len(sequence_list)

135982

In [38]:
sequences = []

for i in range(3, len(sequence_list)):
    words = sequence_list[i-3:i+1]
    sequences.append(words)

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  135979


array([[  8,  65,  41,   3],
       [ 65,  41,   3, 428],
       [ 41,   3, 428,  21],
       [  3, 428,  21,  10],
       [428,  21,  10,   5],
       [ 21,  10,   5, 124],
       [ 10,   5, 124, 387],
       [  5, 124, 387,   4],
       [124, 387,   4, 418],
       [387,   4, 418, 374]])

In [39]:
x = []
y = []

for i in sequences:
    x.append(i[0:3])
    y.append(i[3])

x = np.array(x)
y = np.array(y)

In [40]:
print("Data: ", x[:10])
print("Response: ", y[:10])

Data:  [[  8  65  41]
 [ 65  41   3]
 [ 41   3 428]
 [  3 428  21]
 [428  21  10]
 [ 21  10   5]
 [ 10   5 124]
 [  5 124 387]
 [124 387   4]
 [387   4 418]]
Response:  [  3 428  21  10   5 124 387   4 418 374]


In [41]:
vocab_size = len(token.word_index) + 1
print(vocab_size)

12153


In [42]:
y = to_categorical(y, num_classes=vocab_size)

In [43]:
len(y)

135979

In [44]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [45]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((91105, 3), (44874, 3), (91105, 12153), (44874, 12153))

In [46]:
model1 = Sequential()
model1.add(Embedding(vocab_size, 10, input_length=3))
model1.add(LSTM(100, return_sequences=True))
model1.add(LSTM(100))
model1.add(Dense(100, activation="relu"))
model1.add(Dense(vocab_size, activation="softmax"))

In [47]:
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             121530    
                                                                 
 lstm (LSTM)                 (None, 3, 100)            44400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 12153)             1227453   
                                                                 
Total params: 1,483,883
Trainable params: 1,483,883
Non-trainable params: 0
_________________________________________________________________


In [48]:
# callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=10,verbose=1,mode='auto')

In [49]:
model1.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001),metrics=["accuracy"])
history = model1.fit(x_train, y_train, epochs=50, validation_split=0.1,batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [50]:
def Predict_Words(model, tokenizer, text, n_word):
    result=text
    for _ in range(n_word):
        sequence = tokenizer.texts_to_sequences([text])
        sequence = np.array(sequence)
        preds = np.argmax(model.predict(sequence))
        predicted_word = ""

        for key, value in tokenizer.word_index.items():
            if value == preds:
                predicted_word = key
                break
        result+=" "+predicted_word
        # text=str(text)
        # text = text.split(" ")
        # text = text[-3:]
        # text+=" "+predicted_word

    return result

In [55]:
print(Predict_Words(model1, token, "I am a",1))
print(Predict_Words(model1, token, "dogs are ",1))
print(Predict_Words(model1, token, "I'm really grateful",3))
print(Predict_Words(model1, token, "human creativity in",2))

I am a very
dogs are  livingston
I'm really grateful now and lot
human creativity in human world


In [56]:
print(Predict_Words(model1, token, "God please help",1))

God please help that
