# Exploratory Data Analysis
---

## Import libraries

In [1]:
import pandas as pd
from pathlib import Path

import torch
from torchtext.vocab import GloVe
from torch.utils.data.dataset import ConcatDataset
from torchtext.data import Field, TabularDataset, BucketIterator
from torch.utils.data.sampler import RandomSampler

## Global Constants

In [2]:
GLOVE_DIR = Path("../data")
DATA_DIR = Path("../data/mtl-dataset")
DATASETS = ['apparel', 'baby', 'books', 'camera_photo',  'electronics', 
      'health_personal_care', 'imdb', 'kitchen_housewares', 'magazines', 
      'music', 'software', 'sports_outdoors', 'toys_games', 'video']
BSIZE = 64

# Load Data

In [3]:
dataset_name = DATASETS[0]+".task.train"
dataset_name

'apparel.task.train'

In [4]:
df = pd.read_csv(DATA_DIR / dataset_name, sep="\t", header=None, names=['label', 'text'])
df.head(3)

Unnamed: 0,label,text
0,0,i ordered a pair for my husband and was very d...
1,1,love hush puppies . they are comfortable and i...
2,0,"i was so excited to find these , they seemed t..."


## Prepare data

In [5]:
def prepare_data(dataset_list, data_dir, devset_fraction=0.2):
    col_names=['label', 'text']
    for idx, dataset in enumerate(dataset_list):
        train_file = dataset+".task.train"
        dev_file = dataset+".task.dev"
        test_file = dataset+".task.test" 
        
        df = pd.read_csv(data_dir / train_file, sep="\t", header=None, names=col_names)
        train_df = df.sample(frac=1-devset_fraction).reset_index(drop=True)
        dev_df = df.sample(frac=devset_fraction).reset_index(drop=True)
        test_df = pd.read_csv(data_dir / test_file, sep="\t", header=None, names=col_names)
        
        train_df.to_csv(DATA_DIR / f"{train_file}.csv", index=False)
        dev_df.to_csv(DATA_DIR / f"{dev_file}.csv", index=False)
        test_df.to_csv(DATA_DIR / f"{test_file}.csv", index=False)

In [17]:
prepare_data(dataset_list=DATASETS, data_dir=DATA_DIR)

## Dataset

In [18]:
text_field = Field(sequential=True, tokenize=lambda x: x.split(), lower=True, batch_first=True, include_lengths=True)
label_field = Field(sequential=False, use_vocab=False)

In [19]:
data_fields = [("label", label_field), ("text", text_field)]

In [29]:
train_sets, dev_sets, test_sets = [], [], []
for dataset in DATASETS:
    train_file = dataset+".task.train.csv"
    dev_file = dataset+".task.dev.csv"
    test_file = dataset+".task.test.csv"
    train_set, dev_set, test_set = TabularDataset.splits(path=DATA_DIR, root=DATA_DIR,
                                                         train=train_file, validation=dev_file, test=test_file,
                                                         fields=data_fields, skip_header=True, format="csv")
    train_sets.append(train_set)
    dev_sets.append(dev_set)
    test_sets.append(test_set)
    
text_field.build_vocab(*train_sets, vectors=GloVe(cache=GLOVE_DIR))
train_sets = ConcatDataset(train_sets)
dev_sets = ConcatDataset(dev_sets)
test_sets = ConcatDataset(test_sets)

In [30]:
text_field.vocab.freqs.most_common(10)

[('the', 111264),
 ('.', 106635),
 (',', 85742),
 ('and', 58624),
 ('i', 54447),
 ('to', 53719),
 ('a', 52662),
 ('it', 44688),
 ('of', 41316),
 ('is', 36742)]

In [31]:
example = train_sets[0]
example.__dict__.keys()

dict_keys(['label', 'text'])

In [32]:
print(" ".join(example.text), example.label)

i have purchased these beloved sport pants for years . the cut of these pants has been changed with this order . the quality of the cut and fabric is much lower now . i returned the item 0


## Batchsampler for Multi-Tasks

In [35]:
class BatchSchedulerSampler(torch.utils.data.sampler.Sampler):
    """
    iterate over tasks and provide a random batch per task in each mini-batch
    """
    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size
        self.number_of_datasets = len(dataset.datasets)
        
    def __len__(self):
        return len(self.dataset) * self.number_of_datasets
    
    def __iter__(self):
        samplers_list = []
        sampler_iterators = []
        datasets_length = []
        for dataset_idx in range(self.number_of_datasets):
            cur_dataset = self.dataset.datasets[dataset_idx]
            sampler = RandomSampler(cur_dataset)
            samplers_list.append(sampler)
            cur_sampler_iterator = sampler.__iter__()
            sampler_iterators.append(cur_sampler_iterator)
            datasets_length.append(len(cur_dataset))
        push_index_val = [0] + self.dataset.cumulative_sizes[:-1]
        step = self.batch_size * self.number_of_datasets
        samples_to_grab = self.batch_size
        largest_dataset_index = torch.argmax(torch.as_tensor(datasets_length)).item()
        # for this case we want to get all samples in dataset, this force us to resample from the smaller datasets
        epoch_samples = datasets_length[largest_dataset_index] * self.number_of_datasets
        final_samples_list = []  # this is a list of indexes from the combined dataset
        for _ in range(0, epoch_samples, step):
            for i in range(self.number_of_datasets):
                cur_batch_sampler = sampler_iterators[i]
                cur_samples = []
                for _ in range(samples_to_grab):
                    try:
                        cur_sample_org = cur_batch_sampler.__next__()
                        cur_sample = cur_sample_org + push_index_val[i]
                        cur_samples.append(cur_sample)
                    except StopIteration:
                        if i == largest_dataset_index:
                            # largest dataset iterator is done we can break
                            samples_to_grab = len(cur_samples)  # adjusting the samples_to_grab
                            # got to the end of iterator - extend final list and continue to next task if possible
                            break
                        else:
                            # restart the iterator - we want more samples until finishing with the largest dataset
                            sampler_iterators[i] = samplers_list[i].__iter__()
                            cur_batch_sampler = sampler_iterators[i]
                            cur_sample_org = cur_batch_sampler.__next__()
                            cur_sample = cur_sample_org + push_index_val[i]
                            cur_samples.append(cur_sample)
                final_samples_list.extend(cur_samples)

        return iter(final_samples_list)

## Dataloaders

In [36]:
train_iter, dev_iter, test_iter = BucketIterator.splits((train_sets, dev_sets, test_sets),
                                                        batch_sizes=(BSIZE, BSIZE*2, BSIZE*2),
                                                        sort_within_batch=False,
                                                        sort_key=lambda x: len(x.text))

In [37]:
batch = next(iter(train_iter))

AttributeError: 'ConcatDataset' object has no attribute 'fields'

In [119]:
batch


[torchtext.data.batch.Batch of size 64]
	[.label]:[torch.LongTensor of size 64]
	[.text]:('[torch.LongTensor of size 64x890]', '[torch.LongTensor of size 64]')
	[.task]:[torch.LongTensor of size 64]

In [120]:
batch.text[0].shape, batch.task.shape, batch.label.shape

(torch.Size([64, 890]), torch.Size([64]), torch.Size([64]))

# Visualize

## Distribution of Tasks

In [127]:
train_df.task_id.value_counts()

0     1321
9     1319
12    1318
5     1286
2     1283
3     1276
7     1271
6     1271
8     1269
13    1258
4     1258
11    1242
1     1198
10    1174
Name: task_id, dtype: int64

## Distribution of Labels

In [129]:
train_df.label.value_counts()

1    8981
0    8763
Name: label, dtype: int64