In [19]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import pandas as pd
import numpy as np

# Location of the dataset CSV on the mounted Drive.
path = '/content/drive/MyDrive/SE-Chatting.csv'

# Load the CSV into a DataFrame and preview its first rows.
data_frame = pd.read_csv(path)
data_frame.head()

Unnamed: 0,input,output,context
0,What is 1 - 91?,Hello! The answer is -90.,helpful assistant
1,What is 16 - 79?,Hello! The answer is -63.,helpful assistant
2,What is 67 * 40?,Hello! The answer is 2680.,helpful assistant
3,What is 5460 / 60?,Hello! The answer is 91.,helpful assistant
4,What is 3848 / 52?,Hello! The answer is 74.,helpful assistant


In [24]:
# Non-null entry count for each column.
data_frame.count()

Unnamed: 0,0
input,10000140
output,10000140
context,10000140


### 1. Data Preparation

In [6]:
### Drop the `context` column, which is the same for all the entries.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.

data_frame = data_frame.drop(columns=['context'], errors='ignore')
data_frame.head()


Unnamed: 0,input,output
0,What is 1 - 91?,Hello! The answer is -90.
1,What is 16 - 79?,Hello! The answer is -63.
2,What is 67 * 40?,Hello! The answer is 2680.
3,What is 5460 / 60?,Hello! The answer is 91.
4,What is 3848 / 52?,Hello! The answer is 74.


In [21]:
### Lower-case both text columns of the dataset

for column_name in ('input', 'output'):
    data_frame[column_name] = data_frame[column_name].str.lower()
data_frame.head()

Unnamed: 0,input,output,context
0,what is 1 - 91?,hello! the answer is -90.,helpful assistant
1,what is 16 - 79?,hello! the answer is -63.,helpful assistant
2,what is 67 * 40?,hello! the answer is 2680.,helpful assistant
3,what is 5460 / 60?,hello! the answer is 91.,helpful assistant
4,what is 3848 / 52?,hello! the answer is 74.,helpful assistant


In [26]:
### Drop rows that are exact duplicates across all columns (the first occurrence is kept).
### The surviving rows keep their original index labels; the index is not renumbered.

data_frame = data_frame.drop_duplicates()
data_frame.head()

Unnamed: 0,input,output,context
0,What is 1 - 91?,Hello! The answer is -90.,helpful assistant
1,What is 16 - 79?,Hello! The answer is -63.,helpful assistant
2,What is 67 * 40?,Hello! The answer is 2680.,helpful assistant
3,What is 5460 / 60?,Hello! The answer is 91.,helpful assistant
4,What is 3848 / 52?,Hello! The answer is 74.,helpful assistant


In [30]:
### Cleaning the data by removing unwanted symbols and preserving the symbols which are important for the math problems
import re

In [31]:
# Wrap every prompt/response pair in marker tokens so that a single string
# carries one complete example, then discard the two source columns.
prompt_part = "[input] " + data_frame['input']
response_part = " [output] " + data_frame['output'] + " [end]"
data_frame['text'] = prompt_part + response_part
data_frame.drop(columns=['input', 'output'], inplace=True)
data_frame.head()

Unnamed: 0,context,text
0,helpful assistant,[input] What is 1 - 91? [output] Hello! The an...
1,helpful assistant,[input] What is 16 - 79? [output] Hello! The a...
2,helpful assistant,[input] What is 67 * 40? [output] Hello! The a...
3,helpful assistant,[input] What is 5460 / 60? [output] Hello! The...
4,helpful assistant,[input] What is 3848 / 52? [output] Hello! The...


In [32]:
# Non-null entry count for each remaining column.
data_frame.count()

Unnamed: 0,0
context,40070
text,40070


In [33]:
# Preview the last rows of the frame.
data_frame.tail()

Unnamed: 0,context,text
10000065,helpful assistant,[input] How do black holes form? [output] Hell...
10000066,helpful assistant,[input] What is the theory of relativity? [out...
10000067,helpful assistant,[input] How does photosynthesis work? [output]...
10000068,helpful assistant,[input] What is the structure of DNA? [output]...
10000069,helpful assistant,[input] What is the role of the mitochondria? ...


In [11]:
# Join the `text` value of every row into one space-separated string,
# then show its length in characters.
full_text = data_frame['text'].str.cat(sep=' ')
len(full_text)

2642205

In [12]:
# Make lower case and split into individual words.
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import tensorflow as tf

### Custom filter list: the Keras default would also strip * + - / < = > ?, which
### the math questions need, so those characters are left out of `filters`.
### '[' and ']' are left out as well so that the "[input]", "[output]" and "[end]"
### markers survive as single tokens instead of collapsing into the ordinary
### words "input", "output" and "end".
text = text_to_word_sequence(full_text, filters='!"#$%&(),.:;@\\^_`{|}~\t\n', lower=True, split=' ')



In [13]:
# Create training examples.
# Slide a fixed-size window over the word list: each fragment holds
# SEQUENCE_LENGTH consecutive words and its target is the single word that
# immediately follows the fragment.
# The former `tf.device` wrapper was removed: the loop only slices a Python
# list and creates no TensorFlow ops, so device placement had no effect on it.
SEQUENCE_LENGTH = 40  # words per input fragment
WINDOW_STEP = 3       # offset between the starts of consecutive fragments

window_starts = range(0, len(text) - SEQUENCE_LENGTH, WINDOW_STEP)
fragments = [text[start: start + SEQUENCE_LENGTH] for start in window_starts]
targets = [text[start + SEQUENCE_LENGTH] for start in window_starts]

In [14]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
import logging
# Only let TensorFlow log records of level ERROR or higher through.
tf.get_logger().setLevel(logging.ERROR)

In [15]:
# Number of distinct tokens in the word list `text`.
vocab_size = len(set(text))

In [16]:
# Display the computed vocabulary size.
vocab_size

3594

In [17]:
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders


# NOTE: `Tokenizer` here is the Hugging Face `tokenizers` class imported in this
# cell; it shadows the Keras `Tokenizer` imported earlier in the notebook.
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# "[PAD]" has to be registered with the trainer; otherwise
# token_to_id("[PAD]") returns None and every short sequence gets padded with
# None instead of an integer id. Special tokens receive their ids in list
# order, so "[PAD]" is placed first to obtain id 0.
special_tokens = ["[PAD]", "[input]", "[output]", "[end]"]
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)

# Train on the very strings that are encoded later. The word list `text` was
# lower-cased and had punctuation filtered out, so a tokenizer trained on it
# would never see characters (e.g. '.', '!', ',') that occur in
# data_frame['text'] and could not encode them.
bpe_tokenizer.train_from_iterator(data_frame['text'].dropna().tolist(), trainer=trainer)

# NOTE(review): a ByteLevel decoder is paired with a Whitespace pre-tokenizer;
# confirm that bpe_tokenizer.decode(...) produces the expected spacing.
bpe_tokenizer.decoder = decoders.ByteLevel()

max_length = 100
pad_token_id = bpe_tokenizer.token_to_id("[PAD]")
if pad_token_id is None:
    raise ValueError('"[PAD]" is missing from the tokenizer vocabulary; padding would insert None.')

def tokenize_and_pad(text):
    """Encode ``text`` with ``bpe_tokenizer`` and force the result to ``max_length`` ids.

    Encodings longer than ``max_length`` are cut off after ``max_length`` ids;
    shorter or exactly fitting ones are right-filled with ``pad_token_id``.

    Args:
        text: The string to encode.

    Returns:
        A list with exactly ``max_length`` entries.
    """
    ids = bpe_tokenizer.encode(text).ids
    missing = max_length - len(ids)
    if missing < 0:
        return ids[:max_length]
    return ids + [pad_token_id] * missing

# Tokenize and pad input and output
# data_frame['text'] already has the form "[input] <question> [output] <answer> [end]".
# Wrapping it in the markers a second time duplicated them and put the answer
# into the model input, so the string is split at the " [output] " marker instead:
#   prompt   -> "[input] <question> [output]"
#   response -> "<answer> [end]"
text_parts = data_frame['text'].str.partition(' [output] ')
prompt_text = text_parts[0] + ' [output]'
response_text = text_parts[2]

data_frame['input_tok'] = prompt_text.apply(tokenize_and_pad)
data_frame['output_tok'] = response_text.apply(tokenize_and_pad)

# Stack the per-row id lists into 2-D arrays with one row per example.
input_tokens = np.stack(data_frame['input_tok'].values)
output_tokens = np.stack(data_frame['output_tok'].values)