In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the preprocessed data from the CSV file
preprocessed_data_file = 'datasets/preprocessed_data.csv'
data = pd.read_csv(preprocessed_data_file)

In [3]:
data.shape

(63136, 12)

In [5]:
# Tokenization
tokenizer = Tokenizer()
data['Body/Text'] = data['Body/Text'].fillna('')  # Replace missing values with empty strings
tokenizer.fit_on_texts(data['Body/Text'])

In [6]:
# Create Input-Output Pairs
inputs = tokenizer.texts_to_sequences(data['Body/Text'])
outputs = tokenizer.texts_to_sequences(data['Body/Text'])

In [7]:
# Numerical Encoding
vocab_size = len(tokenizer.word_index) + 1
max_seq_length = 50  # Maximum sequence length for padding/truncation

In [8]:
# Padding and Truncation
inputs = pad_sequences(inputs, maxlen=max_seq_length, padding='post', truncating='post')
outputs = pad_sequences(outputs, maxlen=max_seq_length, padding='post', truncating='post')

In [9]:
# Data Split
train_inputs, val_test_inputs, train_outputs, val_test_outputs = train_test_split(inputs, outputs, test_size=0.2, random_state=42)
val_inputs, test_inputs, val_outputs, test_outputs = train_test_split(val_test_inputs, val_test_outputs, test_size=0.5, random_state=42)

In [10]:
# Print the shapes of the data arrays
print("Train Inputs shape:", train_inputs.shape)
print("Train Outputs shape:", train_outputs.shape)
print("Validation Inputs shape:", val_inputs.shape)
print("Validation Outputs shape:", val_outputs.shape)
print("Test Inputs shape:", test_inputs.shape)
print("Test Outputs shape:", test_outputs.shape)

Train Inputs shape: (50508, 50)
Train Outputs shape: (50508, 50)
Validation Inputs shape: (6314, 50)
Validation Outputs shape: (6314, 50)
Test Inputs shape: (6314, 50)
Test Outputs shape: (6314, 50)


In [32]:
# Create Input-Output Pairs
sequences = tokenizer.texts_to_sequences(data['Body/Text'])

# Split sequences into input and output
inputs = sequences[:-1]
outputs = sequences[1:]

# Padding and Truncation
inputs = pad_sequences(inputs, maxlen=max_seq_length, padding='post', truncating='post')
outputs = pad_sequences(outputs, maxlen=max_seq_length, padding='post', truncating='post')

# Data Split
train_inputs, val_test_inputs, train_outputs, val_test_outputs = train_test_split(inputs, outputs, test_size=0.2, random_state=42)
val_inputs, test_inputs, val_outputs, test_outputs = train_test_split(val_test_inputs, val_test_outputs, test_size=0.5, random_state=42)

# Print the shapes of the data arrays
print("Train Inputs shape:", train_inputs.shape)
print("Train Outputs shape:", train_outputs.shape)
print("Validation Inputs shape:", val_inputs.shape)
print("Validation Outputs shape:", val_outputs.shape)
print("Test Inputs shape:", test_inputs.shape)
print("Test Outputs shape:", test_outputs.shape)


Train Inputs shape: (50508, 50)
Train Outputs shape: (50508, 50)
Validation Inputs shape: (6313, 50)
Validation Outputs shape: (6313, 50)
Test Inputs shape: (6314, 50)
Test Outputs shape: (6314, 50)


In [33]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define the model architecture
model = Sequential()
model.add(Embedding(vocab_size, 256, input_length=max_seq_length))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256))
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(train_inputs, train_outputs, validation_data=(val_inputs, val_outputs), batch_size=32, epochs=10)

# Generate responses
def generate_response(input_text):
    input_sequence = tokenizer.texts_to_sequences([input_text])
    padded_sequence = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post', truncating='post')
    predicted_sequence = model.predict(padded_sequence)
    predicted_token_id = tf.argmax(predicted_sequence, axis=-1).numpy()[0]
    response = tokenizer.sequences_to_texts([[predicted_token_id]])[0]
    return response

# Test the chatbot
user_input = "Hello"
response = generate_response(user_input)
print(f"Chatbot: {response}")


Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "C:\Users\neele\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\neele\anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\neele\anaconda3\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Users\neele\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\neele\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "C:\Users\neele\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\neele\anaconda3\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "C:\Users\neele\anaconda3\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "C:\Users\neele\anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\neele\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "C:\Users\neele\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "C:\Users\neele\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "C:\Users\neele\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "C:\Users\neele\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 390, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\neele\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\neele\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2914, in run_cell
      result = self._run_cell(
    File "C:\Users\neele\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2960, in _run_cell
      return runner(coro)
    File "C:\Users\neele\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\neele\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3185, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\neele\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3377, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\neele\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\neele\AppData\Local\Temp\ipykernel_22240\3304134584.py", line 16, in <module>
      model.fit(train_inputs, train_outputs, validation_data=(val_inputs, val_outputs), batch_size=32, epochs=10)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\engine\training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\engine\training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\engine\training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\engine\training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\engine\training.py", line 1024, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\engine\training.py", line 1082, in compute_loss
      return self.compiled_loss(
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\losses.py", line 284, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\losses.py", line 2098, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "C:\Users\neele\anaconda3\lib\site-packages\keras\backend.py", line 5633, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
logits and labels must have the same first dimension, got logits shape [32,62709] and labels shape [1600]
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_44445]