# ChatBot-v1

In [1]:
### All imports
import torch
from torch import nn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
#import 

## Data manipulation

### Get data

In [2]:
### Set up dataset variables and methods for data import
import os

# The dataset is expected next to the notebook's working directory:
#   <parent of cwd>/datasets/plain_text_wikipedia/AllCombined.txt
current_dir = os.getcwd()
data_dir = os.path.join(
    os.path.dirname(current_dir),
    "datasets",
    "plain_text_wikipedia",
    "AllCombined.txt",
)  # Path to the single file holding all data

def get_lines(file_path, encoding: str = "utf-8"):
    """
    Read a text file and return its lines.

    Args:
        file_path: path to file (.txt file only)
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.
    Returns:
        all the lines found in given file (each line keeps its trailing newline)
    """
    # An explicit encoding avoids locale-dependent decoding errors / mojibake
    # on systems whose default encoding is not UTF-8.
    with open(file_path, "r", encoding=encoding) as f:
        return f.readlines()

In [3]:
### Load the whole dataset into memory, one list entry per line
all_data = get_lines(data_dir)

# Index 60 is pinned on purpose: a truly random pick is very often just "\n"
print("Random line from the dataset:\n{}".format(all_data[60]))
print("Number of lines: {}".format(len(all_data)))

Random line from the dataset:
Some people say that art is a product or item that is made with the intention of stimulating the human senses as well as the human mind, spirit and soul. An artwork is normally judged by how much impact it has on people, the number of people who can relate to it, and how much they appreciate it. Some people also get inspired.

Number of lines: 2052699


### Split and validate data

In [4]:
### Split the data into train-test-validation splits
def get_splits(target_data,
               train_size: float = 0.7,
               test_size: float = 0.15,
               validation_size: float = 0.15,
               shuffle: bool = True,
               seed: int = 42):
    """
    Split a dataset into train, test and validation partitions.

    The input sequence is never modified: the function works on a copy, and
    shuffling uses a private random generator so the global `random` state is
    left untouched.

    Args:
        target_data: dataset to split on (any iterable of samples)
        train_size: fraction of the samples assigned to the train split
        test_size: fraction of the samples assigned to the test split
        validation_size: fraction of the samples assigned to the validation split
        shuffle: shuffle a copy of target_data before splitting
        seed: seed of the random generator used for shuffling
    Returns:
        train_split, test_split, validation_split (lists). Every sample lands
        in exactly one split; the validation split receives whatever remains
        after the train and test boundaries.
    Raises:
        AssertionError: if a size is negative or the sizes do not sum to 1
            (within floating point tolerance).
    """
    import math
    import random

    # Setup
    sizes = (train_size, test_size, validation_size)
    _sum = sum(sizes)
    assert all(size >= 0 for size in sizes), f"Split sizes must be non-negative got {sizes}"
    # Fractions such as 0.7 + 0.2 + 0.1 do not add up to exactly 1.0 in
    # floating point, so compare with a tolerance instead of `==`.
    assert math.isclose(_sum, 1.0, abs_tol=1e-9), f"Sum of all sizes must be 1 got {_sum}"

    # Work on a copy so the caller's data keeps its original order.
    samples = list(target_data)
    if shuffle:
        random.Random(seed).shuffle(samples)

    # Get split idx's from cumulative fractions; rounding each boundary once
    # keeps the split sizes proportional and the boundaries non-decreasing.
    total_length = len(samples)
    train_end_idx = min(total_length, round(total_length * train_size))
    test_end_idx = min(total_length, round(total_length * (train_size + test_size)))

    # Split the data into train-test-validation
    train_split = samples[:train_end_idx]
    test_split = samples[train_end_idx:test_end_idx]
    validation_split = samples[test_end_idx:]  # remainder: no sample is dropped

    return train_split, test_split, validation_split

def validate_splits(train_data, test_data, validation_data, leak_ok: bool = False):
    """
    Report how many distinct samples are shared between each pair of splits.

    Args:
        train_data: samples of the train split
        test_data: samples of the test split
        validation_data: samples of the validation split
        leak_ok: when False, any shared sample raises an AssertionError;
            when True, overlaps are only printed and returned
    Returns:
        Three sets with the shared samples, in the order
        (train-test, train-validation, validation-test)
    """
    # Duplicates inside a single split are irrelevant here; only distinct values matter.
    train_unique = set(train_data)
    test_unique = set(test_data)
    validation_unique = set(validation_data)

    overlap_train_test = train_unique & test_unique
    overlap_train_validation = train_unique & validation_unique
    overlap_validation_test = validation_unique & test_unique

    print(f"Overlap train-test: {len(overlap_train_test)}")
    print(f"Overlap train-validation: {len(overlap_train_validation)}")
    print(f"Overlap validation-test: {len(overlap_validation_test)}")

    found = (overlap_train_test, overlap_train_validation, overlap_validation_test)
    if leak_ok:
        return found

    _sum = sum(len(shared) for shared in found)
    assert _sum == 0, f"There exists a data-leakage. Found '{_sum}' samples overlaping."
    return found



# Get splits (sizes, shuffling and seed are all left at their defaults)
train_data, test_data, validation_data = get_splits(target_data=all_data)

# Report the split sizes and confirm that together they cover the whole dataset
_sum = sum(len(split) for split in (train_data, test_data, validation_data))  # sum of splits
print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
print(f"Validation samples: {len(validation_data)}")
print(f"{_sum==len(all_data)}")

# Check for data leakages
# Overlaps are tolerated for now: lines such as "\n" occur many times in the data
overlaps = validate_splits(
    train_data=train_data,
    test_data=test_data,
    validation_data=validation_data,
    leak_ok=True,
)

Train samples: 1436889
Test samples: 307905
Validation samples: 307905
True
Overlap train-test: 2454
Overlap train-validation: 2520
Overlap validation-test: 988


### Understand the data

In [5]:
### Show the number of samples in each split (train, test, validation)
tuple(len(split) for split in (train_data, test_data, validation_data))

(1436889, 307905, 307905)

In [6]:
### Get vocab
# Create a general purpose tokenizer
class Tokenizer():
    """
    Word-group tokenizer: every `token_size` consecutive words of a line form
    one token. A vocabulary has to be built with `build_vocab` before
    `tokenize_string` can map text to indices.
    """

    def __init__(self, token_size):
        """
        Creates an instance of a tokenizer.
        You can use this object to convert raw data into tokens.

        Args:
            token_size: size of each token (number of words per token, >= 1)
        Raises:
            ValueError: if token_size is smaller than 1
        """
        # A step of 0 would crash later inside range(), and a negative step
        # would silently produce no tokens at all, so reject both up front.
        if token_size < 1:
            raise ValueError(f"token_size must be >= 1, got {token_size}")
        self.token_size = token_size
        self._vocab = None  # sorted list of unique tokens, set by build_vocab
        self._stoi = None   # token -> index mapping, set by build_vocab
        self._itos = None   # index -> token mapping, set by build_vocab

    def _preprocess(self,
                    data: list,
                    strip_punctuation = False) -> list:
        """
        Preprocesses the data to handle punctuation and normalize tokens.

        Args:
            data: list of sentences or lines to preprocess; a single string
                is treated as one line
            strip_punctuation: if True, remove every character found in
                `string.punctuation`; otherwise surround `. , ! ? ; :` with
                spaces and collapse repeated whitespace
        Returns:
            A list of preprocessed lines, one per input line.
        """
        import re
        import string

        # A bare string would otherwise be iterated character by character.
        if isinstance(data, str):
            data = [data]

        # Get rid of punctuation if needed
        if strip_punctuation:
            punctuation = re.compile(r'[{}]'.format(re.escape(string.punctuation)))
            return [punctuation.sub('', line) for line in data]

        # Otherwise, add spaces around punctuation and remove extra spaces
        separators = re.compile(r"([.,!?;:])")
        whitespace = re.compile(r"\s+")
        return [
            whitespace.sub(" ", separators.sub(r" \1 ", line)).strip()
            for line in data
        ]

    def _token_split(self,
                     data: list,
                     strip_punctuation):
        """
        Tokenizes the data on the token_size and returns a list of the data, where each entry has token_size number of words.

        Args:
            data: list of sentences, words, etc to tokenize (or a single string)
            strip_punctuation: forwarded to `_preprocess`
        Returns:
            A list, where each entry has 'token_size' number of words
            (the last token of a line may hold fewer words)
        """
        tokens = []
        for line in self._preprocess(data=data, strip_punctuation=strip_punctuation):
            words = line.split()
            tokens.extend(
                " ".join(words[i:i + self.token_size])
                for i in range(0, len(words), self.token_size)
            )

        return tokens

    def build_vocab(self,
                   data: list,
                   strip_punctuation: bool = False):
        """
        Produces a vocab list, on the given tokens.

        Args:
            data: list of sentences to build the vocab on (or a single string)
            strip_punctuation: whether to strip punctuation during preprocessing
        Returns:
             vocab: sorted list of unique tokens
             stoi: dict mapping strings to indices
             itos: dict mapping indices to strings
        """
        tokens = self._token_split(data, strip_punctuation=strip_punctuation)
        self._vocab = sorted(set(tokens))
        self._stoi = {token: idx for idx, token in enumerate(self._vocab)}
        self._itos = {idx: token for token, idx in self._stoi.items()}

        return self._vocab, self._stoi, self._itos

    def tokenize_string(self,
                 data: list,
                 strip_punctuation = False) -> list:
        """
        Tokenizes the given data into tokens based on the prebuilt vocab.

        Tokens that are missing from the vocabulary are silently skipped.

        Args:
            data: data to tokenize (list of lines, or a single string)
            strip_punctuation: whether to strip punctuation during
                preprocessing; should normally match the value that was used
                in `build_vocab`, otherwise the produced tokens differ from
                the vocabulary entries
        Returns:
            List of tokens converted to indices
        Raises:
            AssertionError: if `build_vocab` has not been called yet
        """
        assert self._stoi is not None, "Vocab is not built yet. Call .build_vocab first."
        tokens = self._token_split(data, strip_punctuation=strip_punctuation)
        token_indices = [self._stoi[token] for token in tokens if token in self._stoi]

        return token_indices

In [7]:
### Setup and run the tokenizer
TOKEN_SIZE = 1  # Word level (token_size: number of words per token)
# Shared by build_vocab and tokenize_string so both produce the same kind of
# tokens; with different settings, tokens would be missing from the vocab and
# get dropped silently.
STRIP_PUNCTUATION = False

# Manual smoke test for the tokenizer (uncomment to run):
# test_tokenizer = Tokenizer(1)
# sample_data = ["This is the first set of tokens.", "And this is the second set of tokens"]
# sample_vocab, sample_stoi, sample_itos = test_tokenizer.build_vocab(data=sample_data, strip_punctuation=False)
# sample_tokens = test_tokenizer.tokenize_string(sample_data, strip_punctuation=False)
# print(sample_tokens, sample_vocab, sample_stoi, sample_itos, sep="\n------\n")

# The vocabulary is built on the train split only
tokenizer_token_size_1 = Tokenizer(TOKEN_SIZE)
vocab, stoi, itos = tokenizer_token_size_1.build_vocab(
    data=train_data,
    strip_punctuation=STRIP_PUNCTUATION,
)
train_tokens = tokenizer_token_size_1.tokenize_string(
    train_data,
    strip_punctuation=STRIP_PUNCTUATION,
)