In [24]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential , load_model
from tensorflow.keras.layers import Dense , Dropout , LSTM , Embedding , SpatialDropout1D , Bidirectional
from tensorflow.keras.callbacks import EarlyStopping , ModelCheckpoint , ReduceLROnPlateau
import pickle 

In [25]:
with open("/kaggle/input/dialogues-text/dialogues_text.txt", "r", encoding="utf-8") as f:
    data = f.readlines()


In [26]:
all_utterances = []

for line in data:
    line = line.strip() 
    if not line:  
        continue
    
    
    utterances = line.split("__eou__")
    
    
    for u in utterances:
        u = u.strip()
        if u:
            words = u.split() 
            words = " ".join(words).lower() 
            all_utterances.append(words)


In [5]:
all_utterances

['the kitchen stinks .',
 "i'll throw out the garbage .",
 'so dick , how about getting some coffee for tonight ?',
 'coffee ? i don ’ t honestly like that kind of stuff .',
 'come on , you can at least try a little , besides your cigarette .',
 'what ’ s wrong with that ? cigarette is the thing i go crazy for .',
 'not for me , dick .',
 'are things still going badly with your houseguest ?',
 'getting worse . now he ’ s eating me out of house and home . i ’ ve tried talking to him but it all goes in one ear and out the other . he makes himself at home , which is fine . but what really gets me is that yesterday he walked into the living room in the raw and i had company over ! that was the last straw .',
 'leo , i really think you ’ re beating around the bush with this guy . i know he used to be your best friend in college , but i really think it ’ s time to lay down the law .',
 'you ’ re right . everything is probably going to come to a head tonight . i ’ ll keep you informed .',
 'w

In [27]:
token = Tokenizer(num_words=7000 , oov_token="UNK")
token.fit_on_texts(all_utterances)

In [28]:
with open("token.pkl","wb") as f:
    pickle.dump(token,f)

In [7]:
df = []
for line in all_utterances:
    token_data = token.texts_to_sequences([line])[0]
    for i in range(1 ,len(token_data)):
        idx = token_data[ : i+1]
        df.append(idx)


In [8]:
df

[[4, 761],
 [4, 761, 6925],
 [89, 1322],
 [89, 1322, 79],
 [89, 1322, 79, 4],
 [89, 1322, 79, 4, 3070],
 [32, 2416],
 [32, 2416, 31],
 [32, 2416, 31, 34],
 [32, 2416, 31, 34, 289],
 [32, 2416, 31, 34, 289, 62],
 [32, 2416, 31, 34, 289, 62, 383],
 [32, 2416, 31, 34, 289, 62, 383, 15],
 [32, 2416, 31, 34, 289, 62, 383, 15, 295],
 [383, 3],
 [383, 3, 110],
 [383, 3, 110, 10],
 [383, 3, 110, 10, 56],
 [383, 3, 110, 10, 56, 2230],
 [383, 3, 110, 10, 56, 2230, 27],
 [383, 3, 110, 10, 56, 2230, 27, 14],
 [383, 3, 110, 10, 56, 2230, 27, 14, 167],
 [383, 3, 110, 10, 56, 2230, 27, 14, 167, 11],
 [383, 3, 110, 10, 56, 2230, 27, 14, 167, 11, 794],
 [105, 26],
 [105, 26, 2],
 [105, 26, 2, 20],
 [105, 26, 2, 20, 36],
 [105, 26, 2, 20, 36, 558],
 [105, 26, 2, 20, 36, 558, 164],
 [105, 26, 2, 20, 36, 558, 164, 6],
 [105, 26, 2, 20, 36, 558, 164, 6, 165],
 [105, 26, 2, 20, 36, 558, 164, 6, 165, 618],
 [105, 26, 2, 20, 36, 558, 164, 6, 165, 618, 18],
 [105, 26, 2, 20, 36, 558, 164, 6, 165, 618, 18, 2368

In [9]:
lengths = [len(seq) for seq in df]
print("max:", max(lengths))
print("95th percentile:", np.percentile(lengths, 95))
print("99th percentile:", np.percentile(lengths, 99))

max: 256
95th percentile: 30.0
99th percentile: 50.0


In [10]:
padded_data = pad_sequences(df , maxlen=30 , padding="pre")

In [11]:
padded_data

array([[   0,    0,    0, ...,    0,    4,  761],
       [   0,    0,    0, ...,    4,  761, 6925],
       [   0,    0,    0, ...,    0,   89, 1322],
       ...,
       [   0,    0,    0, ...,   25,  631,   57],
       [   0,    0,    0, ...,  631,   57,  317],
       [   0,    0,    0, ...,    0,   84,    2]], dtype=int32)

In [12]:
X = padded_data[ : ,:-1]
y = padded_data[ : , -1]

In [13]:
X

array([[  0,   0,   0, ...,   0,   0,   4],
       [  0,   0,   0, ...,   0,   4, 761],
       [  0,   0,   0, ...,   0,   0,  89],
       ...,
       [  0,   0,   0, ...,  47,  25, 631],
       [  0,   0,   0, ...,  25, 631,  57],
       [  0,   0,   0, ...,   0,   0,  84]], dtype=int32)

In [14]:
y

array([ 761, 6925, 1322, ...,   57,  317,    2], dtype=int32)

In [15]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size= 0.2 , random_state=12)

In [16]:
X_test

array([[  0,   0,   0, ..., 119,   3, 352],
       [  0,   0,   0, ...,  24, 408,  11],
       [  0,   0,   0, ...,   0,  59,  13],
       ...,
       [  0,   0,   0, ...,   2,   3,  13],
       [  0,   0,   0, ..., 488,   6, 144],
       [  0,   0,   0, ...,  21,  33,  75]], dtype=int32)

In [17]:
X_train

array([[   0,    0,    0, ...,    0,    0,  624],
       [   0,    0,    0, ...,    9,   44,  155],
       [   0,    0,    0, ...,   35,  117, 1542],
       ...,
       [   0,    0,    0, ...,   15, 1505,   36],
       [  75,   70,    1, ...,  981,    3,   42],
       [   0,    0,    0, ...,    1,  889,    3]], dtype=int32)

In [18]:
"""model = Sequential()
model.add(Embedding(7001, 200))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(300, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(150))
model.add(Dense(7001, activation='softmax'))


opt = tf.keras.optimizers.Adam(learning_rate=0.001)
losses = tf.keras.losses.sparse_categorical_crossentropy
top_10 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k = 10 , name= "Top_10")

model.compile(optimizer = opt , loss = losses , metrics = ["accuracy", top_10])"""

'model = Sequential()\nmodel.add(Embedding(7001, 200))\nmodel.add(SpatialDropout1D(0.2))\nmodel.add(Bidirectional(LSTM(300, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(LSTM(150))\nmodel.add(Dense(7001, activation=\'softmax\'))\n\n\nopt = tf.keras.optimizers.Adam(learning_rate=0.001)\nlosses = tf.keras.losses.sparse_categorical_crossentropy\ntop_10 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k = 10 , name= "Top_10")\n\nmodel.compile(optimizer = opt , loss = losses , metrics = ["accuracy", top_10])'

In [19]:
early = EarlyStopping(monitor="val_loss" , patience= 10 , restore_best_weights=True)
checkpoint = ModelCheckpoint(filepath= "/kaggle/working/modelchats.keras" , monitor="val_loss", save_best_only=True, save_weights_only=False, verbose=1)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',  
    factor=0.5,          
    patience=2,           
    verbose=1,
    min_lr=1e-5          
)


In [20]:
model = load_model("/kaggle/working/modelchats.keras")

I0000 00:00:1756904113.822425    1468 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [None]:
history = model.fit(X_train,y_train, validation_split=0.2 , epochs=40 , callbacks=[early,checkpoint,reduce_lr] , batch_size = 32)

In [None]:
history = model.fit(X_train,y_train, validation_split=0.2 , epochs=40 , callbacks=[early,checkpoint,reduce_lr] , batch_size = 32)

In [22]:
model.evaluate(X_test,y_test)

[1m6775/6775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 7ms/step - Top_10: 0.5493 - accuracy: 0.2421 - loss: 4.5106


[4.525643348693848, 0.2413627654314041, 0.5490142703056335]

In [23]:
perplexity = np.exp(4.5106)
perplexity

90.9763879708369