# Exploratory Data Analysis
---

## Import libraries

In [105]:
from pathlib import Path
import pandas as pd
from torchtext.data import Field, TabularDataset, BucketIterator

## Global Constants

In [106]:
# Folder that holds the per-domain "<name>.task.train" / "<name>.task.test" files.
DATA_DIR = Path("..") / "data" / "mtl-dataset"

# Review domains; a domain's position in this list is what gets used as its task id.
DATASETS = [
    'apparel',
    'baby',
    'books',
    'camera_photo',
    'electronics',
    'health_personal_care',
    'imdb',
    'kitchen_housewares',
    'magazines',
    'music',
    'software',
    'sports_outdoors',
    'toys_games',
    'video',
]

# Batch size for the training iterator (dev/test iterators use twice this value).
BSIZE = 64

# Load Data

In [45]:
# File name of the training split for the first domain in DATASETS.
dataset_name = "{}.task.train".format(DATASETS[0])
dataset_name

'apparel.task.train'

In [51]:
# The raw file has no header row and is tab-separated, so the two column
# names are supplied explicitly.
df = pd.read_csv(
    DATA_DIR / dataset_name,
    sep="\t",
    header=None,
    names=['label', 'text'],
)
df.head(3)

Unnamed: 0,label,text
0,0,i ordered a pair for my husband and was very d...
1,1,love hush puppies . they are comfortable and i...
2,0,"i was so excited to find these , they seemed t..."


## Prepare data

In [84]:
def prepare_data(dataset_list, data_dir, devset_fraction=0., seed=None):
    """Merge the per-task files into shuffled train/dev/test frames and save them.

    For every name in ``dataset_list`` the headerless, tab-separated files
    ``<name>.task.train`` and ``<name>.task.test`` are read from ``data_dir``.
    Each row is tagged with ``task_id`` (the name's position in
    ``dataset_list``) and ``task_name``. The combined training rows are
    shuffled once and then cut into a dev part and a train part, so the two
    splits are disjoint and together cover every training row. Drawing two
    independent samples instead would let dev rows reappear in train.

    The resulting frames are written to ``train.csv``, ``dev.csv`` and
    ``test.csv`` inside ``data_dir``.

    Args:
        dataset_list: sequence of dataset (task) names.
        data_dir: directory containing the raw files; also receives the CSVs.
        devset_fraction: share of the combined training rows held out as the
            dev set, in ``[0, 1]``. ``0`` yields an empty dev frame.
        seed: optional ``random_state`` for the shuffles; pass an int to get
            reproducible splits. ``None`` keeps the shuffles random.

    Returns:
        Tuple ``(train_df, dev_df, test_df)`` of DataFrames with columns
        ``label``, ``text``, ``task_id``, ``task_name`` and a fresh 0..n-1 index.

    Raises:
        ValueError: if ``dataset_list`` is empty or ``devset_fraction`` is
            outside ``[0, 1]``.
    """
    if not 0. <= devset_fraction <= 1.:
        raise ValueError("devset_fraction must be within [0, 1], got {!r}".format(devset_fraction))
    dataset_list = list(dataset_list)
    if not dataset_list:
        raise ValueError("dataset_list must contain at least one dataset name")

    data_dir = Path(data_dir)
    col_names = ['label', 'text']
    train_dfs = []
    test_dfs = []
    for idx, dataset in enumerate(dataset_list):
        for split, collected in (("train", train_dfs), ("test", test_dfs)):
            split_df = pd.read_csv(data_dir / (dataset + ".task." + split),
                                   sep="\t", header=None, names=col_names)
            split_df['task_id'] = idx
            split_df['task_name'] = dataset
            collected.append(split_df)

    # Shuffle the pooled training rows once, then slice: the first n_dev rows
    # become the dev set and the remainder the train set (no shared rows).
    shuffled = pd.concat(train_dfs).sample(frac=1, random_state=seed).reset_index(drop=True)
    n_dev = int(round(len(shuffled) * devset_fraction))
    dev_df = shuffled.iloc[:n_dev].reset_index(drop=True)
    train_df = shuffled.iloc[n_dev:].reset_index(drop=True)
    test_df = pd.concat(test_dfs).sample(frac=1, random_state=seed).reset_index(drop=True)

    # Write next to the raw files of the directory that was actually passed in.
    train_df.to_csv(data_dir / "train.csv", index=False)
    dev_df.to_csv(data_dir / "dev.csv", index=False)
    test_df.to_csv(data_dir / "test.csv", index=False)
    return train_df, dev_df, test_df

In [87]:
# Build the combined splits for all domains, requesting a 20% dev fraction.
# prepare_data also writes train.csv / dev.csv / test.csv to disk.
train_df, dev_df, test_df = prepare_data(dataset_list=DATASETS, data_dir=DATA_DIR, devset_fraction=0.2)

((17744, 4), (8763, 4), (8981, 4))

In [89]:
# Peek at the first rows of the shuffled, task-tagged training frame.
train_df.head(5)

Unnamed: 0,label,text,task_id,task_name
0,1,when i first got a wireless network at home i ...,4,electronics
1,1,i 'm a pc mechanic . i have been using ad-awar...,10,software
2,0,i just wasted $ 16.35 on this piece of junk . ...,4,electronics
3,0,a director and his crew head out to the isolat...,6,imdb
4,0,"this pillow is soft and sqwishy , but it is to...",5,health_personal_care


In [90]:
# Shape of the whole training frame, then of its label == 0 and label == 1 subsets.
(
    train_df.shape,
    train_df.loc[train_df['label'] == 0].shape,
    train_df.loc[train_df['label'] == 1].shape,
)

((17744, 4), (8763, 4), (8981, 4))

In [88]:
# (rows, columns) of the dev and test frames.
dev_df.shape, test_df.shape

((4436, 4), (5600, 4))

## Dataset

In [92]:
# Text: whitespace-tokenised, lower-cased, batch-first tensors; include_lengths
# makes each batch carry the sequence lengths alongside the token ids.
text_field = Field(
    sequential=True,
    tokenize=lambda raw: raw.split(),
    lower=True,
    batch_first=True,
    include_lengths=True,
)
# Label and task id are used as-is, without a vocabulary.
label_field = Field(sequential=False, use_vocab=False)
task_field = Field(sequential=False, use_vocab=False)

In [93]:
# One (name, field) pair per CSV column, in column order
# (label, text, task_id, task_name). A None field means the column is not loaded.
data_fields = [
    ("label", label_field),
    ("text", text_field),
    ("task", task_field),
    ("task_name", None),
]

In [94]:
# Load the three CSVs written by prepare_data; skip_header drops the column-name row.
train_set, dev_set, test_set = TabularDataset.splits(
    path=DATA_DIR,
    root=DATA_DIR,
    train="train.csv",
    validation="dev.csv",
    test="test.csv",
    fields=data_fields,
    skip_header=True,
    format="csv",
)

In [95]:
# Build the vocabulary from the training split only.
text_field.build_vocab(train_set)

In [97]:
# Ten most frequent tokens in the vocabulary's frequency counter.
text_field.vocab.freqs.most_common(10)

[('the', 111346),
 ('.', 107062),
 (',', 86076),
 ('and', 58853),
 ('i', 54616),
 ('to', 54061),
 ('a', 52772),
 ('it', 44669),
 ('of', 41774),
 ('is', 36726)]

In [102]:
# Take the first training example and list the attributes it carries.
example = train_set[0]
vars(example).keys()

dict_keys(['label', 'text', 'task'])

In [104]:
# Show the example's tokens re-joined with spaces, followed by its label and task id.
print(" ".join(example.text), example.label, example.task)

when i first got a wireless network at home i purchased a kensington wifi finder . it worked fine . but as time went on i saw these smaller versions come out from chrysalis development . i heard they were more powerful and more versatile . so i bought one . i was not disappointed . since it was about the same size as my keyless entry remote for my car , it fit perfectly on my keychain . also it has a much longer range than the others . but best of all , it is directional . meaning i can simply turn my body while holding down the button and the leds change instantly as the signal strength increases or decreases . what a great product . i love it ! 1 4


## Dataloaders

In [108]:
# One iterator per split; dev and test use a batch twice as large as train.
# Examples are keyed by token count for bucketing.
train_iter, dev_iter, test_iter = BucketIterator.splits(
    (train_set, dev_set, test_set),
    batch_sizes=(BSIZE, BSIZE*2, BSIZE*2),
    sort_within_batch=False,
    sort_key=lambda ex: len(ex.text),
)

In [118]:
# Pull a single batch from the training iterator for inspection.
batch = next(iter(train_iter))

In [119]:
# Display the batch summary (one entry per loaded field).
batch


[torchtext.data.batch.Batch of size 64]
	[.label]:[torch.LongTensor of size 64]
	[.text]:('[torch.LongTensor of size 64x890]', '[torch.LongTensor of size 64]')
	[.task]:[torch.LongTensor of size 64]

In [120]:
# batch.text is a (token ids, lengths) pair because text_field uses
# include_lengths=True; index 0 selects the token-id tensor.
batch.text[0].shape, batch.task.shape, batch.label.shape

(torch.Size([64, 890]), torch.Size([64]), torch.Size([64]))

# Visualize

## Distribution of Tasks

In [127]:
# Number of training rows per task id.
train_df.task_id.value_counts()

0     1321
9     1319
12    1318
5     1286
2     1283
3     1276
7     1271
6     1271
8     1269
13    1258
4     1258
11    1242
1     1198
10    1174
Name: task_id, dtype: int64

## Distribution of Labels

In [129]:
# Number of training rows per label value.
train_df.label.value_counts()

1    8981
0    8763
Name: label, dtype: int64