In [1]:
### All imports
import torch
from torch import nn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from pathlib import Path

current_dir = Path(os.getcwd()).parent  # Get current dir (.parent, for root dir).

In [2]:
# Load the current state of the dataset into a df
data_dir = current_dir / "datasets" / "preprocessing"
data_path = data_dir / "english_lang.csv"
all_data_df = pd.read_csv(data_path)
all_data_df.drop(["Unnamed: 0.1", "Unnamed: 0"], axis=1, inplace=True)# Droping some cols (due to a bug in dataset creation)

all_data_df.head()

Unnamed: 0,text,word_count
0,April (Apr.) is the fourth month of the year i...,31
1,April always begins on the same day of the wee...,30
2,"April comes between March and May, making it t...",40
3,April begins on the same day of the week as Ju...,51
4,"In common years, April starts on the same day ...",103


# Data preparation

## Data splits

### Create splits

In [3]:
### Split the data into train-test-validation splits
def get_splits(target_data: list,
               train_size: float = 0.7,
               test_size: float = 0.15,
               validation_size: float = 0.15,
               shuffle: bool = True,
               seed: int = None) -> tuple:
    """
    Splits target_data into train-val-test datasets.
    
    Args:
        target_data: dataset to split on
        train_size: size of the train dataset
        test_size: size of the test dataset
        validation_size: size of the validation dataset
        shuffle: shuffle the target_dataset before spliting
        seed: control randomness in shuffle (default = None)
    Returns:
        train_split, test_split, validation_split
    """
    _total = train_size+test_size+validation_size
    assert _total == 1, f"Total size must be 1 (100%), got: {_total} ({_total*100}%)"
    import random

    # Setup
    _sum = train_size+test_size+validation_size
    assert _sum == 1, f"Sum of all sizes must be 1 got {sum}"
    if seed is not None:
        random.seed(seed)
    if shuffle:
        random.shuffle(target_data)

    # Get split idx's
    total_length = len(target_data)
    train_end_idx = int(total_length * train_size)
    test_end_idx = train_end_idx + int(total_length * test_size) + 1
    validation_end_idx = test_end_idx + int(total_length * validation_size) + 1

    # Split the data into train-test-validation
    train_split = target_data[:train_end_idx]
    test_split = target_data[train_end_idx:test_end_idx]
    validation_split = target_data[test_end_idx:validation_end_idx]

    return train_split, test_split, validation_split

def validate_splits(train_data: list,
                    test_data: list,
                    validation_data: list,
                    leak_ok: bool = False) -> tuple:
    """
    Validates the datasplits by counting the number of overlapping/shared samples between datasets.
    
    Args:
        leak_ok: set to False to check for data-leakage, set to True otherwise
        *All other arguments have same meaning as their name
    Returns:
        samples that overlap
    """
    
    _train_set = set(train_data)
    _test_set = set(test_data)
    _validation_set = set(validation_data)
    
    overlap_train_test = _train_set.intersection(_test_set)
    overlap_train_validation = _train_set.intersection(_validation_set)
    overlap_validation_test = _validation_set.intersection(_test_set)
    
    print(f"Overlap train-test: {len(overlap_train_test)}")
    print(f"Overlap train-validation: {len(overlap_train_validation)}")
    print(f"Overlap validation-test: {len(overlap_validation_test)}")
    
    _sum = len(overlap_train_test)+len(overlap_train_validation)+len(overlap_validation_test)
    
    if leak_ok == False and _sum > 0:
        print(f"[!!!CRITICAL!!!] There exists a data-leakage. Found '{_sum}' samples overlaping.")
    
    return overlap_train_test, overlap_train_validation, overlap_validation_test


In [4]:
# Store all lines from df into arr
all_data_list = all_data_df["text"].tolist()

# Get splits
train_data, test_data, validation_data = get_splits(
    target_data=all_data_list,
    seed=42
    # Leaving all as default
)

# Get some info about the splits
_sum = len(train_data)+len(test_data)+len(validation_data)  # sum of splits
print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
print(f"Validation samples: {len(validation_data)}")
print(f"{_sum==len(all_data_list)}")

# Check for data leakages
overlaps = validate_splits(
    train_data=train_data,
    test_data=test_data,
    validation_data=validation_data,
    leak_ok=False,  # We may have some leaks atm from chars like "\n"
)


Train samples: 556689
Test samples: 119291
Validation samples: 119291
True
Overlap train-test: 1661
Overlap train-validation: 1683
Overlap validation-test: 665
[!!!CRITICAL!!!] There exists a data-leakage. Found '4009' samples overlaping.


In [5]:
### Understanding the overlaping texts
# Get overlap counts
print(f"Train-test: {len(overlaps[0])} | Train-val: {len(overlaps[1])} | Val-test: {len(overlaps[2])}")

# Checking the overlaps
train_val_overlaps_df = pd.DataFrame(data=overlaps[1], columns=["text"])
train_val_overlaps_df.head()

Train-test: 1661 | Train-val: 1683 | Val-test: 665


Unnamed: 0,text
0,<section begin=qf3 /><section end=qf3 />\n
1,A wave is a kind of oscillation (disturbance) ...
2,319||100\n
3,7||0||3||0||0||0||10||0\n
4,14||0\n


I think the issue is caused with my implementation of the `get_splits` function, so ill try to verify it using the **train-test split** function from **sklearn**.\
Though "my" implementation shouldn't cause any problems (from a logical perspective), lets just verify for the samke of it.

In [6]:
from sklearn.model_selection import train_test_split

# Split for train_data
train_data, test_data = train_test_split(all_data_list, train_size=0.7, test_size=0.3, random_state=42)
# Split for val and test
validation_data, test_data = train_test_split(test_data, train_size=0.5, random_state=42)

overlaps = validate_splits(
    train_data=train_data,
    test_data=test_data,
    validation_data=[],
)

Overlap train-test: 1754
Overlap train-validation: 0
Overlap validation-test: 0
[!!!CRITICAL!!!] There exists a data-leakage. Found '1754' samples overlaping.


So... looks like there are some samples that are leaking b/w the train-val-test datasets.\
The validate splits function isn't a problem here, as it just compares the "set" of all values from its args. I.e. there exists some kind of overlaping samples.

### Analyse splits

Since the train-test-split function from sklearn resulted in a smaller number of overlaps, let use that as out reference to analyse the overlaping samples.

*This has little to do with the way sklearn splits the data, its due to the way we have to use it to split the data into train, _ first then split _ into test and val. (couldn't think of a good var name for _)*

In [7]:
### Analysis

# Covert overlaps from tuple to a dict of df's
if type(overlaps) == tuple:
    overlaps = {
        "train_val": pd.DataFrame(overlaps[0], columns=["text"]),
        "train_test": pd.DataFrame(overlaps[1], columns=["text"]),
        "val_test": pd.DataFrame(overlaps[2], columns=["text"]),
    }

# View few overlaping samples
print(overlaps["train_val"].head(10))

                                                text
0         <section begin=qf3 /><section end=qf3 />\n
1  In medicine "epidemiology" is the study of wha...
2                                         319||100\n
3  Scientists say this frog is related to the bla...
4                    139||8||13||2||23||1||175||11\n
5                                          524||33\n
6                                            14||0\n
7                             National Film Awards\n
8                                      Southampton\n
9                       Most people speak Finnish.\n


In [8]:
### Check number of occurences of samples in all_data_df

# Check sample overlap
overlaping_text = overlaps["train_val"].loc[1]["text"]
matches = all_data_df["text"] == overlaping_text

# Check number of overlaps
count_matches = matches.sum() if isinstance(matches, pd.Series) else sum(matches)
print(f"Occurrences: {count_matches}")

# Get idx of overlap entry to understand context
matches[matches].index

Occurrences: 2


Index([39947, 201048], dtype='int64')

In [9]:
for i in range(524911-3, 524911+3):
    print(all_data_df.loc[i]["text"])
print("---"*5)
for i in range(524913-3, 524913+3):
    print(all_data_df.loc[i]["text"])
print("---"*5)
for i in range(525016-3, 525016+3):
    print(all_data_df.loc[i]["text"])
print("---"*5)
for i in range(525021-3, 525021+3):
    print(all_data_df.loc[i]["text"])

1729 = 7*247, further check if 247 is a prime or not , if its not prime, continue the same steps of prime factorization for 247.

1729 = 7*247 = 7*13*19, Notice that all the factors for the 1729 at this point are prime and no further factors are possible except 1, so further factorization should be stopped.

"Encore une fois" is a 1997 song by the German DJ group Sash! and it features German singer Sabine Ohmes. It is the second single from Sash!'s first studio album "It's My Life – The Album" which was released on 25 August 1997.

It was played a lot in clubs around the world and is one of Sash!'s biggest songs.

"Ecuador" is a 1997 song by the German DJ group Sash! and it features German DJ Adrian Rodriguez. It is the third single from Sash!'s first studio album "It's My Life – The Album" which was released on 25 August 1997.

It was played a lot in clubs around the world and is one of Sash!'s biggest songs.

---------------
"Encore une fois" is a 1997 song by the German DJ group Sas

---

This shows that there are many places where the entries are repeating. This can be a problem with the dataset, or the way the data lines were split. But for now, since the samples are repeating around areas with similar context. We can drop the repeating samples.
So the next step would be to clean the data again from repeating samples.

---

In [10]:
### Getting rid of all repeating samples in the dataset
unique_df = all_data_df.drop_duplicates(subset="text", keep="first",)
len_unique_df = len(unique_df)
unique_df.reset_index(drop=True, inplace=True)
print(f"Number of samples droped: {len(all_data_df) - len_unique_df}")

unique_df.head(7)

Number of samples droped: 13511


Unnamed: 0,text,word_count
0,April (Apr.) is the fourth month of the year i...,31
1,April always begins on the same day of the wee...,30
2,"April comes between March and May, making it t...",40
3,April begins on the same day of the week as Ju...,51
4,"In common years, April starts on the same day ...",103
5,"In years immediately before common years, Apri...",70
6,April is a spring month in the Northern Hemisp...,30


In [11]:
### Saving this df as a '.csv'
save_path = data_dir / "unique_samples.csv"
unique_df.to_csv(save_path)
print(f"[INFO] Saved to: {save_path}")

[INFO] Saved to: /Users/dhruvnandigam/Desktop/Dhruv/Programing/NN/Inqueropedia/datasets/preprocessing/unique_samples.csv


### Final splits

In [35]:
# Create splits
unique_list = unique_df["text"].tolist()
train_data, test_data, validation_data = get_splits(
    target_data=unique_list,
    train_size=0.7,
    test_size=0.15,
    validation_size=0.15,
    shuffle=True,
    seed=42,
)

# Validate splits
overlaps = validate_splits(
    train_data=train_data,
    test_data=test_data,
    validation_data=validation_data,
    leak_ok=False
)


Overlap train-test: 0
Overlap train-validation: 0
Overlap validation-test: 0


In [37]:
len(train_data), len(validation_data), len(test_data)

(547232, 117263, 117265)

## Model preprocessing

Creating the **Tokenizer** and **Text-dataset** classes. \
These will be used to create PyTorch friendly datasets.

In [38]:
### Create a general purpose tokenizer
class Tokenizer():
    def __init__(self, token_size):
        """
        Creates an instance of a tokenizer.
        You can use this object to convert raw data into tokens.

        Args:
            token_size: size of each token
        """
        self.token_size = token_size
        self._vocab = None
        self._stoi = None
        self._itos = None

    def _preprocess(self,
                    text: list,
                    strip_punctuation = False) -> list:
        """
        Preprocesses a string to handle punctuation and normalize tokens.

        Args:
            text: a string to preprocess
            strip_punctuation: whether to strip punctuation during preprocessing
        Returns:
            Preprocessed string
        """
        import re
        import string

        # Strip punctuation if required
        if strip_punctuation:
            text = re.sub(r'[{}]'.format(re.escape(string.punctuation)), '', text)
        else:
            # Add spaces around punctuation
            text = re.sub(r"([.,!?;:])", r" \1 ", text)
            text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
        return text

    def _token_split(self,
                     text: str,
                     strip_punctuation) -> list:
        """
        Tokenize a single input string into chunks of token_size words.

        Args:
            text: a string to tokenize
            strip_punctuation: whether to strip punctuation during preprocessing
        Returns:
            A list, where each entry has 'token_size' number of words
        """
        tokens = []
        # Preprocess the string
        text = self._preprocess(text=text, strip_punctuation=strip_punctuation)
        words = text.split()
        for i in range(0, len(words), self.token_size):
            tokens.append(" ".join(words[i:i + self.token_size]))

        return tokens

    def build_vocab(self,
                   data: list,
                   strip_punctuation: bool = False):
        """
        Produces a vocab list, on the given tokens.

        Args:
            data: list of sentences to build the vocab on
            strip_puntuation: where to srip punctuation during preprocessing
        Returns:
             vocab: list of unique tokens
             stoi: dict mapping strings to indices
             itos: dict mapping indices to strings
        """
        tokens = []
        for sentence in data:
            tokens.extend(self._token_split(sentence, strip_punctuation=strip_punctuation))
        self._vocab = sorted(set(tokens))
        self._stoi = {token: idx for idx, token in enumerate(self._vocab)}
        self._itos = {idx: token for token, idx in self._stoi.items()}

        return self._vocab, self._stoi, self._itos

    def tokenize_string(self,
                 text: str,
                 strip_punctuation = False) -> list:
        """
        Tokenizes the given string into tokens based on the prebuilt vocab.

        Args:
            text: string to tokenize
            strip_puntuation: where to srip punctuation during preprocessing
        Returns:
            List of tokens converted to indices
        """
        assert self._stoi is not None, f"Vocab is not built yet. Call .build_vocab first."
        tokens = self._token_split(text=text, strip_punctuation=strip_punctuation)
        token_indices = [self._stoi[token] for token in tokens if token in self._stoi]

        return token_indices


In [39]:
### Create a "torch" friendly dataset
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    def __init__(self,
                 data,
                 tokenizer,
                 strip_punctuation = False,
                 max_len = None):
        """
        Custom dataset for tokenized text.
        Builds a tokenized dataset on provided data, using the specified tokenizer.

        Args:
            data: list of sentences
            tokenizer: instance of the Tokenizer class
            strip_punctuation: whether to strip punctuation during preprocessing.
            max_len: optional, maximum length of tokenized sequences (for padding).
        """
        self.data = data
        self.tokenizer = tokenizer
        self.strip_punctuation = strip_punctuation
        self.max_len = max_len
        self.tokenized_data = [self.tokenizer.tokenize_string(line, strip_punctuation) for line in self.data]
        if self.max_len:
            self.tokenized_data = [tokens[:self.max_len] for tokens in self.tokenized_data] # Truncate

    def __len__(self):
        """
        Number of samples in "data"
        """
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """
        Returns a tokenized sample from "data" at "idx".

        Args:
            idx: index of sample to retrieve
        Returns:
            Sample at "idx", tokenized.
        """
        tokens = self.tokenized_data[idx]
        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        return tokens_tensor


In [40]:
### Tokenizer hyperparams
TOKEN_SIZE = 1
STRIP_PUNCTUATION = False

# Tokenizer instance (and setup)
tokenizer = Tokenizer(TOKEN_SIZE)
vocab, stoi, itos = tokenizer.build_vocab(
    data=train_data,
    strip_punctuation=STRIP_PUNCTUATION,
)


In [42]:
# Sample text data/strings
sample_data = [
    "This is the first sentence.",
    "Another example, with punctuation!",
    "Let's see if this works well."
]

"""
# Tokenize sample_data
sample_data_tokenized = tokenizer.tokenize_string(text=sample_data[0])
print(sample_data_tokenized)
for i in sample_data_tokenized:
    print(itos[i])
"""

dataset = TextDataset(data=sample_data, tokenizer=tokenizer, strip_punctuation=False, max_len=None)

In [43]:
dataset.__len__()

3

In [55]:
# Print the 'tokenized' form of a sentence, and convert it back into its 'string' form
print(f"Tokens: {dataset.__getitem__(1)}")
print(f"String: {' '.join(itos[i.item()] for i in dataset.__getitem__(1))}")

Tokens: tensor([158247, 509836, 119183, 605606, 566985,      0])
String: Another example , with punctuation !


In [57]:
len(itos)

619100