# ChatBot-v1

In [1]:
### All imports
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import nn
#import 

## Data manipulation

### Get data

In [2]:
### Set up dataset variables and methods for data import
import os
current_dir = os.getcwd()  # Current working directory at the time this cell runs
data_dir = os.path.join(os.path.dirname(current_dir), "datasets", "plain_text_wikipedia", "AllCombined.txt")  # Path to the single text file holding all data (a file path, despite the "_dir" name), resolved under the parent of current_dir

def get_lines(file_path, encoding: str = "utf-8"):
    """
    Reads a text file and returns its lines.

    Args:
        file_path: path to file (.txt file only)
        encoding: text encoding used to decode the file. Defaults to "utf-8"
            so the result does not depend on the platform's locale encoding.
    Returns:
        all the lines found in given file, as a list of strings
        (each line keeps its line ending, as returned by readlines())
    """
    # Explicit encoding: without it open() falls back to a locale-dependent
    # default, which can mis-decode or fail on non-ASCII text.
    with open(file_path, "r", encoding=encoding) as f:
        return f.readlines()

In [3]:
### Importing and storing the data into a variable
all_data = get_lines(data_dir)  # One string per line of the file
print(f"Random line from the dataset:\n{all_data[60]}")  # Not actually random: fixed at index 60, because many lines are just "\n"
print(f"Number of lines: {len(all_data)}")

Random line from the dataset:
Some people say that art is a product or item that is made with the intention of stimulating the human senses as well as the human mind, spirit and soul. An artwork is normally judged by how much impact it has on people, the number of people who can relate to it, and how much they appreciate it. Some people also get inspired.

Number of lines: 2052699


### Split and validate data

In [4]:
### Split the data into train-test-validation splits
def get_splits(target_data,
               train_size: float = 0.7,
               test_size: float = 0.15,
               validation_size: float = 0.15,
               shuffle: bool = True,
               seed: int = 42):
    """
    Splits a dataset into train, test and validation parts.

    Args:
        target_data: dataset to split on (a list or other sliceable sequence).
            It is never modified; shuffling is done on a copy.
        train_size: fraction of the data that goes to the train split
        test_size: fraction of the data that goes to the test split
        validation_size: fraction of the data that goes to the validation split
        shuffle: shuffle a copy of target_data before splitting
        seed: seed given to random.seed(); the global RNG is seeded even
            when shuffle is False
    Returns:
        train_split, test_split, validation_split
    Raises:
        AssertionError: if a size is negative or the sizes do not sum to 1
    """
    import math
    import random

    # Setup
    _sum = train_size + test_size + validation_size
    assert min(train_size, test_size, validation_size) >= 0, "Split sizes must not be negative"
    # Tolerant comparison: e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999
    assert math.isclose(_sum, 1.0), f"Sum of all sizes must be 1 got {_sum}"
    random.seed(seed)
    if shuffle:
        # Shuffle a copy so the caller's dataset keeps its original order
        target_data = list(target_data)
        random.shuffle(target_data)

    # Get split idx's from cumulative fractions, so rounding never drops or
    # duplicates a sample and no split steals items from the next one
    total_length = len(target_data)
    train_end_idx = round(total_length * train_size)
    test_end_idx = round(total_length * (train_size + test_size))

    # Split the data into train-test-validation; validation takes the remainder
    train_split = target_data[:train_end_idx]
    test_split = target_data[train_end_idx:test_end_idx]
    validation_split = target_data[test_end_idx:]

    return train_split, test_split, validation_split

def validate_splits(train_data, test_data, validation_data, leak_ok: bool = False):
    """
    Reports how many distinct samples are shared between each pair of splits.

    Args:
        train_data: samples of the train split
        test_data: samples of the test split
        validation_data: samples of the validation split
        leak_ok: when False, an assertion fails if any overlap is found;
            when True, overlaps are only printed and returned
    Returns:
        overlap_train_test, overlap_train_validation, overlap_validation_test
        (three sets holding the shared samples)
    """
    # De-duplicate each split so a repeated sample is counted once
    unique_train = set(train_data)
    unique_test = set(test_data)
    unique_validation = set(validation_data)

    # Pairwise intersections
    overlap_train_test = unique_train & unique_test
    overlap_train_validation = unique_train & unique_validation
    overlap_validation_test = unique_validation & unique_test

    print(f"Overlap train-test: {len(overlap_train_test)}")
    print(f"Overlap train-validation: {len(overlap_train_validation)}")
    print(f"Overlap validation-test: {len(overlap_validation_test)}")

    leaked_total = sum(
        len(overlap)
        for overlap in (overlap_train_test, overlap_train_validation, overlap_validation_test)
    )
    # Only enforced when leaks are not tolerated
    assert leak_ok or leaked_total == 0, f"There exists a data-leakage. Found '{leaked_total}' samples overlaping."
    return overlap_train_test, overlap_train_validation, overlap_validation_test



# Get splits
train_data, test_data, validation_data = get_splits(
    target_data=all_data,
    # Leaving all other arguments at their defaults
)

# Get some info about the splits
_sum = len(train_data)+len(test_data)+len(validation_data)  # combined size of the three splits
print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
print(f"Validation samples: {len(validation_data)}")
print(f"{_sum==len(all_data)}")  # True when the split sizes add up to the full dataset size
# Check for data leakages
overlaps = validate_splits(
    train_data=train_data,
    test_data=test_data,
    validation_data=validation_data,
    leak_ok=True, # Overlaps are tolerated for now: identical lines such as "\n" can land in several splits
)

Train samples: 1436889
Test samples: 307905
Validation samples: 307905
True
Overlap train-test: 2454
Overlap train-validation: 2520
Overlap validation-test: 988


### Getting the data ready

Creating the **Tokenizer** and **TextDataset** classes. \
These will be used to create PyTorch-friendly datasets.

In [8]:
### Check the data splits' lengths (displayed as a tuple: train, test, validation)
len(train_data), len(test_data), len(validation_data)

(1436889, 307905, 307905)

In [20]:
### Get vocab
# Create a general purpose tokenizer
class Tokenizer():
    """
    Word-level tokenizer: groups words into chunks of 'token_size' words and
    maps every distinct chunk seen by build_vocab to an integer index.
    """

    # Compiled once at class creation instead of on every call
    _ALL_PUNCTUATION_RE = re.compile(r'[{}]'.format(re.escape(string.punctuation)))
    _SPACED_PUNCTUATION_RE = re.compile(r"([.,!?;:])")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self, token_size):
        """
        Creates an instance of a tokenizer.
        You can use this object to convert raw data into tokens.

        Args:
            token_size: size of each token (number of words per token), at least 1
        Raises:
            ValueError: if token_size is smaller than 1
        """
        # Fail early: 0 would break range() in _token_split and a negative
        # value would silently produce no tokens at all
        if token_size < 1:
            raise ValueError(f"token_size must be at least 1, got {token_size}")
        self.token_size = token_size
        self._vocab = None  # sorted list of unique tokens, set by build_vocab
        self._stoi = None   # token string -> index, set by build_vocab
        self._itos = None   # index -> token string, set by build_vocab

    def _preprocess(self,
                    text: str,
                    strip_punctuation: bool = False) -> str:
        """
        Preprocesses a string to handle punctuation and normalize tokens.

        Args:
            text: a string to preprocess
            strip_punctuation: whether to strip punctuation during preprocessing
        Returns:
            Preprocessed string
        """
        # Strip punctuation if required
        if strip_punctuation:
            text = self._ALL_PUNCTUATION_RE.sub('', text)
        else:
            # Add spaces around punctuation so each mark becomes its own word
            text = self._SPACED_PUNCTUATION_RE.sub(r" \1 ", text)
            text = self._WHITESPACE_RE.sub(" ", text).strip()  # Remove extra spaces
        return text

    def _token_split(self,
                     text: str,
                     strip_punctuation: bool = False) -> list:
        """
        Tokenize a single input string into chunks of token_size words.

        Args:
            text: a string to tokenize
            strip_punctuation: whether to strip punctuation during preprocessing
        Returns:
            A list, where each entry has 'token_size' number of words
            (the last entry may have fewer)
        """
        # Preprocess the string
        text = self._preprocess(text=text, strip_punctuation=strip_punctuation)
        words = text.split()
        return [
            " ".join(words[start:start + self.token_size])
            for start in range(0, len(words), self.token_size)
        ]

    def build_vocab(self,
                   data: list,
                   strip_punctuation: bool = False):
        """
        Produces a vocab list, on the given sentences.

        Args:
            data: list of sentences to build the vocab on
            strip_punctuation: whether to strip punctuation during preprocessing
        Returns:
             vocab: sorted list of unique tokens
             stoi: dict mapping strings to indices
             itos: dict mapping indices to strings
        """
        # Collect unique tokens incrementally; keeping every token of a large
        # corpus in one list before de-duplicating wastes memory
        unique_tokens = set()
        for sentence in data:
            unique_tokens.update(self._token_split(sentence, strip_punctuation=strip_punctuation))
        self._vocab = sorted(unique_tokens)
        self._stoi = {token: idx for idx, token in enumerate(self._vocab)}
        self._itos = {idx: token for token, idx in self._stoi.items()}

        return self._vocab, self._stoi, self._itos

    def tokenize_string(self,
                 text: str,
                 strip_punctuation: bool = False) -> list:
        """
        Tokenizes the given string into tokens based on the prebuilt vocab.
        Tokens that are not in the vocab are silently skipped.

        Args:
            text: string to tokenize
            strip_punctuation: whether to strip punctuation during preprocessing
        Returns:
            List of tokens converted to indices
        Raises:
            AssertionError: if build_vocab has not been called yet
        """
        assert self._stoi is not None, "Vocab is not built yet. Call .build_vocab first."
        tokens = self._token_split(text=text, strip_punctuation=strip_punctuation)
        token_indices = [self._stoi[token] for token in tokens if token in self._stoi]

        return token_indices

In [58]:
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    def __init__(self,
                 data,
                 tokenizer,
                 strip_punctuation = False,
                 max_len = None):
        """
        Custom dataset for tokenized text.
        Builds a tokenized dataset on provided data, using the specified tokenizer.
        All samples are tokenized eagerly, when the dataset is created.

        Args:
            data: list of sentences
            tokenizer: instance of the Tokenizer class
            strip_punctuation: whether to strip punctuation during preprocessing.
            max_len: optional, maximum length of tokenized sequences. Longer
                sequences are truncated; shorter ones are NOT padded, so
                samples can have different lengths.
        Raises:
            ValueError: if max_len is negative
        """
        # A negative value would slice from the end and silently drop tail tokens
        if max_len is not None and max_len < 0:
            raise ValueError(f"max_len must be None or non-negative, got {max_len}")
        self.data = data
        self.tokenizer = tokenizer
        self.strip_punctuation = strip_punctuation
        self.max_len = max_len
        # Tokenize and truncate in a single pass over the data
        self.tokenized_data = [self._encode(line) for line in self.data]

    def _encode(self, line):
        """Tokenizes one sentence and truncates it to max_len when a limit is set."""
        tokens = self.tokenizer.tokenize_string(line, self.strip_punctuation)
        # 'is not None' so that max_len=0 is honoured instead of being ignored
        if self.max_len is not None:
            tokens = tokens[:self.max_len]
        return tokens

    def __len__(self):
        """
        Number of samples in "data"
        """
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """
        Returns a tokenized sample from "data" at "idx".

        Args:
            idx: index of sample to retrieve
        Returns:
            Sample at "idx", tokenized, as a 1-D torch.long tensor.
        """
        tokens = self.tokenized_data[idx]
        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        return tokens_tensor

In [46]:
### Tokenizer hyperparams
TOKEN_SIZE = 1  # Number of words grouped into a single token
STRIP_PUNCTUATION = False  # Keep punctuation; the tokenizer then splits . , ! ? ; : off into their own words

# Tokenizer instance (and setup)
tokenizer = Tokenizer(TOKEN_SIZE)
# Build the vocab from the train split only
vocab, stoi, itos = tokenizer.build_vocab(
    data=train_data,
    strip_punctuation=STRIP_PUNCTUATION,
)

In [59]:
# Sample text data/strings
sample_data = [
    "This is the first sentence.",
    "Another example, with punctuation!",
    "Let's see if this works well."
]

# NOTE(review): the string literal below is disabled code kept for reference;
# it is evaluated and discarded, so nothing inside it runs.
"""
# Tokenize sample_data
sample_data_tokenized = tokenizer.tokenize_string(text=sample_data[0])
print(sample_data_tokenized)
for i in sample_data_tokenized:
    print(itos[i])
"""

# Build a dataset from the samples using the tokenizer; max_len=None means no truncation
dataset = TextDataset(data=sample_data, tokenizer=tokenizer, strip_punctuation=False, max_len=None)

In [60]:
len(dataset)  # Number of samples; len() dispatches to TextDataset.__len__

3

In [61]:
dataset[1]  # Token indices of the second sample; indexing dispatches to TextDataset.__getitem__

tensor([162081, 534850, 120790, 633093, 593438,      0])

In [67]:
# Map every token index of the second sample back to its string form
for token_idx in dataset[1]:
    print(itos[token_idx.item()])  # .item() turns the 0-d tensor into a plain int dict key

Another
example
,
with
punctuation
!
