# ChatBot-v1

In [2]:
### All imports
import torch
from torch import nn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
#import 

## Data manipulation

### Get data

In [3]:
### Set up the dataset variables and helper methods for the data import
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run on another machine without editing it.
data_dir = "/Users/dhruvnandigam/Desktop/Dhruv/Programing/NN/ChatBot/datasets/plain_text_wikipedia/AllCombined.txt"  # Path to the single .txt file holding all data (a file, despite the `_dir` name)

def get_lines(file_path, encoding: str = "utf-8"):
    """
    Read a text file and return all of its lines.

    Args:
        file_path: path to file (.txt file only)
        encoding: text encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding (the dataset
            contains non-ASCII characters).
    Returns:
        A list with every line found in the given file; each line keeps its
        trailing newline character, if it has one.
    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file content is not valid for `encoding`.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(file_path, "r", encoding=encoding) as f:
        return f.readlines()

In [18]:
### Import the data and store every line of the file in a list
all_data = get_lines(data_dir)
print(f"Random line from the dataset:\n{all_data[60]}")  # Not actually random: index is fixed to 60 because many lines are just "\n"
print(f"Number of lines: {len(all_data)}")

Random line from the dataset:
Some people say that art is a product or item that is made with the intention of stimulating the human senses as well as the human mind, spirit and soul. An artwork is normally judged by how much impact it has on people, the number of people who can relate to it, and how much they appreciate it. Some people also get inspired.

Number of lines: 2052699


### Split and validate data

In [45]:
### Split the data into train-test-validation splits
def get_splits(target_data,
               train_size: float = 0.7,
               test_size: float = 0.15,
               validation_size: float = 0.15,
               shuffle: bool = True,
               seed: int = 42):
    """
    Split a dataset into train, test and validation subsets.

    The input is never modified: the function works on a shallow copy, so
    re-running the cell always produces the same splits. The three splits are
    contiguous, non-overlapping slices that together cover every sample.

    Args:
        target_data: dataset to split on (any iterable of samples, e.g. a list of lines)
        train_size: fraction of the samples that goes to the train split
        test_size: fraction of the samples that goes to the test split
        validation_size: fraction of the samples that goes to the validation split
        shuffle: shuffle a copy of target_data before splitting
        seed: seed for the shuffle, so the result is reproducible
    Returns:
        train_split, test_split, validation_split (three lists)
    Raises:
        AssertionError: if any size is negative or the sizes do not sum to 1
    """
    import math
    import random

    # Setup
    assert min(train_size, test_size, validation_size) >= 0, "Split sizes must not be negative"
    _sum = train_size + test_size + validation_size
    # Compare with a tolerance: valid ratios such as 0.6 + 0.3 + 0.1 do not add up to exactly 1.0 in floating point.
    assert math.isclose(_sum, 1.0, abs_tol=1e-9), f"Sum of all sizes must be 1 got {_sum}"

    # Work on a copy so the caller's data keeps its original order.
    samples = list(target_data)
    if shuffle:
        # A private generator yields the same permutation as seeding the global one,
        # without changing the global random state as a side effect.
        random.Random(seed).shuffle(samples)

    # Get split idx's from cumulative fractions; rounding the running boundary keeps
    # each split within one sample of its requested share.
    total_length = len(samples)
    train_end_idx = round(total_length * train_size)
    test_end_idx = round(total_length * (train_size + test_size))

    # Split the data into train-test-validation; validation takes everything that is left,
    # so no sample is dropped.
    train_split = samples[:train_end_idx]
    test_split = samples[train_end_idx:test_end_idx]
    validation_split = samples[test_end_idx:]

    return train_split, test_split, validation_split

def validate_splits(train_data, test_data, validation_data, leak_ok: bool = False):
    """
    Report how many distinct samples are shared between each pair of splits.

    Args:
        train_data: samples of the train split
        test_data: samples of the test split
        validation_data: samples of the validation split
        leak_ok: when False, any shared sample raises an AssertionError;
            when True, the overlaps are only printed and returned
    Returns:
        Three sets: the train-test, train-validation and validation-test overlaps
    """
    # Duplicates inside a single split collapse here; only distinct samples are compared.
    unique_train = set(train_data)
    unique_test = set(test_data)
    unique_validation = set(validation_data)

    overlap_train_test = unique_train & unique_test
    overlap_train_validation = unique_train & unique_validation
    overlap_validation_test = unique_validation & unique_test

    print(f"Overlap train-test: {len(overlap_train_test)}")
    print(f"Overlap train-validation: {len(overlap_train_validation)}")
    print(f"Overlap validation-test: {len(overlap_validation_test)}")

    if not leak_ok:
        _sum = len(overlap_train_test) + len(overlap_train_validation) + len(overlap_validation_test)
        assert _sum == 0, f"There exists a data-leakage. Found '{_sum}' samples overlaping."

    return overlap_train_test, overlap_train_validation, overlap_validation_test



# Split the whole dataset, keeping every get_splits default (sizes, shuffle, seed)
train_data, test_data, validation_data = get_splits(target_data=all_data)

# Report the size of each split and whether together they account for every line
_sum = sum(len(split) for split in (train_data, test_data, validation_data))  # sum of splits
print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
print(f"Validation samples: {len(validation_data)}")
print(f"{_sum==len(all_data)}")

# Check for data leakages; leak_ok=True only reports them, because
# lines such as "\n" may currently appear in more than one split
overlaps = validate_splits(
    train_data,
    test_data,
    validation_data,
    leak_ok=True,
)

Train samples: 1436889
Test samples: 307905
Validation samples: 307905
True
Overlap train-test: 2439
Overlap train-validation: 2447
Overlap validation-test: 962


In [47]:
# Distinct lines found in both the train and the test split (first set returned by validate_splits)
overlaps[0]

{'General Orders No. 32, W.D., 1919\n',
 '161||4||7||0||19||0||187||4\n',
 'Slavia Prague\n',
 'The team has some notable achievements.\n',
 '304||10||29||0||45||3||3||0||381||13\n',
 'The arrondissements are further divided into cantons and communes.\n',
 '"Main article: Xiphactinus"\n',
 '"Oh how I want to be with St Kilda."\n',
 'On 1 January 2015, the former municipality of St. Martin merged into the municipality of Vals.\n',
 '1914\n',
 'The player can play as Jill Valentine and defeat zombies including the titular Nemesis.\n',
 'On May 3, 2017, Puerto Rico filed for bankruptcy after a massive debt and weak economy. It is the largest bankruptcy case in American history.\n',
 '30||4\n',
 'FM\n',
 '!Total||13||0\n',
 'Bayern Munich II\n',
 'Toronto FC\n',
 'The earliest organized group of Japanese emigrants settled in Mexico in 1897.\n',
 'Air\n',
 '157||8\n',
 'VfL Osnabrück\n',
 '314||35\n',
 '!Total||48||1\n',
 '<section begin=PR />\n',
 'Right tributaries:\n',
 'A. "Church of th

In [48]:
# Distinct lines found in both the train and the validation split (second set returned by validate_splits)
overlaps[1]

{'General Orders No. 32, W.D., 1919\n',
 '61||5||3||0||2||0||66||5\n',
 '79||2\n',
 'Towns are in boldface; market towns in "italics"; suburbs, hamlets and other subdivisions of a municipality are in .\n',
 'The arrondissements are further divided into cantons and communes.\n',
 "The person who first did twin studies was Francis Galton, Darwin's half-cousin, who was a founder of statistics. His method was to trace twins through their life-history, making many kinds of measurement. Unfortunately, though he knew about mono and dizygotic twins, he did not appreciate the real genetic difference. Twin studies of the modern kind did not appear until the 1920s.\n",
 '168||32||10||6||1||0||179||38\n',
 '45||1||0||0||9||1||54||2\n',
 'Italy U19\n',
 'progress {\n',
 '100||11||6||1||1||0||107||12\n',
 'CD single (1995)\n',
 'With the national team she won the gold medal at the 2022 World Games in Birmingham, Alabama.\n',
 '!Total||13||0\n',
 'Bayern Munich II\n',
 'Toronto FC\n',
 'Subaru\n',
 '

In [49]:
# Distinct lines found in both the validation and the test split (third set returned by validate_splits)
overlaps[2]

{'\n',
 ' \n',
 ' Season in progress.\n',
 '! 0\n',
 '! 13\n',
 '! 18th \n',
 '! colspan=14 style=background:#dcdcdc; text-align:center| Midfielders\n',
 '! scope="row" | 2021\n',
 '!EMI\n',
 '!HUN\n',
 '!Source\n',
 '!Total||10||0\n',
 '!Total||10||1\n',
 '!Total||11||0\n',
 '!Total||11||1\n',
 '!Total||11||2\n',
 '!Total||12||0\n',
 '!Total||12||1\n',
 '!Total||12||2\n',
 '!Total||13||0\n',
 '!Total||14||0\n',
 '!Total||15||0\n',
 '!Total||17||1\n',
 '!Total||18||0\n',
 '!Total||19||0\n',
 '!Total||19||1\n',
 '!Total||1||0\n',
 '!Total||21||2\n',
 '!Total||21||4\n',
 '!Total||22||0\n',
 '!Total||22||1\n',
 '!Total||22||6\n',
 '!Total||23||0\n',
 '!Total||23||2\n',
 '!Total||24||1\n',
 '!Total||25||0\n',
 '!Total||26||2\n',
 '!Total||28||0\n',
 '!Total||29||3\n',
 '!Total||2||0\n',
 '!Total||2||2\n',
 '!Total||31||0\n',
 '!Total||32||0\n',
 '!Total||32||9\n',
 '!Total||38||1\n',
 '!Total||38||3\n',
 '!Total||3||0\n',
 '!Total||47||1\n',
 '!Total||4||0\n',
 '!Total||4||1\n',
 '!Total||

### Understand the data

In [52]:
### Check the lengths of the train, test and validation splits
len(train_data), len(test_data), len(validation_data)

(1436889, 307905, 307905)

In [None]:
### Get vocab
# Create a general purpose tokenizer
class Tokenizer():
    """
    General-purpose tokenizer that groups whitespace-separated words into tokens.
    """

    def __init__(self, token_size):
        """
        Creates an instance of a tokenizer.
        You can use this object to convert raw data into tokens.

        Args:
            token_size: size of each token (number of words per token, at least 1)
        Raises:
            ValueError: if token_size is not an integer >= 1
        """
        # Reject bad sizes up front; a size of 0 would otherwise only fail later, while chunking.
        if isinstance(token_size, bool) or not isinstance(token_size, int) or token_size < 1:
            raise ValueError(f"token_size must be an integer >= 1, got {token_size!r}")
        self.token_size = token_size

    def _token_split(self, data):
        """
        Tokenizes the data on the token_size and returns a list of the data, where each entry has token_size number of words.

        Words are the whitespace-separated pieces of each entry, so newlines and
        repeated spaces never end up inside a token. Tokens do not span two
        entries: the last token of an entry holds fewer than token_size words
        when the entry's word count is not a multiple of token_size.

        Args:
            data: list of sentences, words, etc to tokenize (a single string is treated as one entry)
        Returns:
            A flat list of strings, where each string is up to 'token_size' words joined by single spaces
        """
        # Iterating a bare string would yield characters, so wrap it as a one-entry list.
        if isinstance(data, str):
            data = [data]

        words_per_token = self.token_size
        tokens = []
        for entry in data:
            words = entry.split()  # entries made only of whitespace (e.g. "\n") contribute nothing
            tokens.extend(
                " ".join(words[start:start + words_per_token])
                for start in range(0, len(words), words_per_token)
            )
        return tokens




### Setup and run the tokenizer
TOKEN_SIZE = 1  # Word level (token_size: number of words per token)