In [5]:
import torch
import torch.nn as nn
import torchtext.data as ttd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [7]:
# Toy corpus, written row by row as (label, sentence) pairs and then pivoted
# into a column-oriented dict with the keys "label" and "data".
data = {
    column: list(values)
    for column, values in zip(
        ("label", "data"),
        zip(
            (0, "I like eggs and ham."),
            (1, "Eggs I like!"),
            (1, "Ham and eggs or just ham?"),
        ),
    )
}

In [8]:
# Turn the column-oriented dict into a DataFrame; the dict keys become the column names.
df = pd.DataFrame.from_dict(data)

In [9]:
# Display the frame to confirm it has the two columns (label, data) and three rows.
df

Unnamed: 0,label,data
0,0,I like eggs and ham.
1,1,Eggs I like!
2,1,Ham and eggs or just ham?


In [10]:
# Persist the frame as CSV without the index column, so the file holds only label and data.
df.to_csv(path_or_buf="thedata.csv", index=False)

In [14]:
# Field describing the free-text column.
TEXT = ttd.Field(
    lower=True,        # fold case, so 'Eggs' and 'eggs' become the same token
    sequential=True,   # the column holds a sequence of words, not a single value
    batch_first=True,  # batches come out as N x T (batch dimension first)
    pad_first=True,    # shorter sequences get their padding on the left
    # tokenize='spacy',  # left disabled: string.split() is used when no tokenizer is given
)

# Field describing the target column.
LABEL = ttd.Field(
    is_target=True,    # this column is the target, not part of the input
    use_vocab=False,   # values are already integers, so no vocabulary is needed
    sequential=False,  # one value per row; doesn't contain sequential data
)

__Note__: If we don't specify `use_vocab=False`, PyTorch (torchtext) will complain later, when you try to iterate over the dataset, that the attribute `vocab` doesn't exist.
<br>
<br>
__Note__: If we don't specify `is_target=True`, PyTorch (torchtext) will assume the column is part of the input, so when you iterate over the dataset you would have to unpack each batch like:<br>
`for (input, targets), _ in iterator:`<br>
where the 2nd element (\_) is where the target should have been.

In [15]:
# Parse the CSV file into torchtext examples. The header row is skipped and the
# (name, field) pairs are given in the same order as the CSV columns:
# label first, then the text.
dataset = ttd.TabularDataset(
    fields=[
        ('label', LABEL),
        ('data', TEXT),
    ],
    skip_header=True,
    format='csv',
    path='thedata.csv',
)

In [16]:
# Grab the first parsed example to inspect what the fields did to a raw CSV row.
ex = dataset.examples[0]

In [17]:
# The default repr only shows the object's address; look at ex.data / ex.label for the contents.
ex

<torchtext.data.example.Example at 0x1ead8a92808>

In [18]:
# Text after the TEXT field's preprocessing: lower-cased and split on whitespace,
# so punctuation stays attached to the word (note 'ham.').
ex.data

['i', 'like', 'eggs', 'and', 'ham.']

In [19]:
# The label is still the raw CSV string at this point ('0', not 0); the batches
# printed later in the notebook show it as an integer tensor.
ex.label

'0'

In [20]:
# Keep 66% of the examples for training and the rest for testing (the default ratio is 0.7).
# NOTE(review): no seed/random_state is passed, so which rows land in each split
# may differ between runs — confirm and pin it if reproducibility matters.
train_dataset, test_dataset = dataset.split(split_ratio=0.66)

In [24]:
# List the tokenised text of every example that ended up in the training split.
for example in train_dataset:
    print(example.data)

['ham', 'and', 'eggs', 'or', 'just', 'ham?']
['eggs', 'i', 'like!']


In [27]:
# Build the token vocabulary from the training split only; the test split is
# not passed in here.
TEXT.build_vocab(train_dataset)

In [28]:
# Keep a short handle on the vocabulary that build_vocab attached to TEXT.
vocab = TEXT.vocab

In [29]:
# Check which class the vocabulary object is.
type(vocab)

torchtext.vocab.Vocab

In [31]:
# String-to-index mapping. Indices 0 and 1 are taken by the special '<unk>' and
# '<pad>' tokens; it is a defaultdict whose default comes from the vocab's unk index.
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001EAD8A3B308>>,
            {'<unk>': 0,
             '<pad>': 1,
             'eggs': 2,
             'and': 3,
             'ham': 4,
             'ham?': 5,
             'i': 6,
             'just': 7,
             'like!': 8,
             'or': 9})

In [32]:
# Index-to-string list, the inverse of vocab.stoi: position i holds the token with index i.
vocab.itos

['<unk>', '<pad>', 'eggs', 'and', 'ham', 'ham?', 'i', 'just', 'like!', 'or']

In [33]:
# Prefer the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [42]:
# One batch iterator per split, each with a batch size of two and with the
# batch tensors placed on `device`.
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset),
    device=device,
    batch_sizes=(2, 2),  # one entry per dataset: (train, test)
    sort_key=lambda example: len(example.data),  # number of tokens in the text
)

In [43]:
# Peek at the first training batch only: print each tensor together with its shape.
for inputs, targets in train_iter:
    for tag, tensor in (("Inputs:", inputs), ("Targets:", targets)):
        print(tag, tensor, "shape:", tensor.shape)
    break

Inputs: tensor([[1, 1, 1, 2, 6, 8],
        [4, 3, 2, 9, 7, 5]], device='cuda:0') shape: torch.Size([2, 6])
Targets: tensor([1, 1], device='cuda:0') shape: torch.Size([2])


In [45]:
# Peek at the first test batch only: print each tensor together with its shape.
for inputs, targets in test_iter:
    for tag, tensor in (("Inputs:", inputs), ("Targets:", targets)):
        print(tag, tensor, "shape:", tensor.shape)
    break

Inputs: tensor([[6, 0, 2, 3, 0]], device='cuda:0') shape: torch.Size([1, 5])
Targets: tensor([0], device='cuda:0') shape: torch.Size([1])


In [54]:
# Decode row 1 of the first training batch back into tokens via the vocabulary.
for inputs, targets in train_iter:
    # tolist() yields plain Python ints, which index the itos list directly.
    for token_id in inputs[1].tolist():
        print(vocab.itos[token_id])
    break

<pad>
<pad>
<pad>
eggs
i
like!
