In [3]:
import random
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext.data as ttd

In [5]:
# Build a tiny three-row toy corpus (binary label + one sentence per row) and
# persist it as test.csv so it can be loaded back through torchtext below.
labels = [0, 1, 1]
sentences = [
    "I like eggs and ham.",
    "Eggs I like!",
    "Ham and egg or just ham?",
]
data = {"label": labels, "data": sentences}

df = pd.DataFrame(data)
# index=False keeps the file to exactly two columns: label, data.
df.to_csv("test.csv", index=False)
df

Unnamed: 0,label,data
0,0,I like eggs and ham.
1,1,Eggs I like!
2,1,Ham and egg or just ham?


In [8]:
# Text field: lower-cased, spaCy-tokenized sequences, returned batch-first with
# padding placed at the start of each sequence.
# NOTE: tokenize='spacy' requires the spacy package to be installed.
TEXT = ttd.Field(
    tokenize='spacy',
    lower=True,
    sequential=True,
    batch_first=True,
    pad_first=True,
)

In [9]:
# Label field: a single, already-numeric target per example, so it is neither
# tokenized as a sequence nor mapped through a vocabulary.
LABEL = ttd.Field(
    is_target=True,
    sequential=False,
    use_vocab=False,
)

In [17]:
# Load test.csv into a torchtext dataset.
# The (name, field) pairs must be listed in the same order as the CSV columns.
# Each name also becomes the attribute used to read that value from an example,
# e.g. example.label and example.data.
column_fields = [('label', LABEL), ('data', TEXT)]
dataset = ttd.TabularDataset(
    path='test.csv',
    format='csv',
    fields=column_fields,
    skip_header=True,
)

In [20]:
# Peek at the first parsed example: its tokenized text and its label.
ex = dataset.examples[0]
tokens, label = ex.data, ex.label
print("Data", tokens)
print("Index", label)

Data ['i', 'like', 'eggs', 'and', 'ham', '.']
Index 0


In [21]:
# Split the examples roughly 66% / 34% into train and test.
# The split shuffles the examples first; without a fixed random_state the
# partition (and therefore the vocabulary built from the training part and every
# output below) changes on each kernel restart. random.Random(seed).getstate()
# produces the state tuple expected by `random_state` without reseeding the
# global RNG.
SPLIT_SEED = 42
train_dataset, test_dataset = dataset.split(
    split_ratio=0.66,
    random_state=random.Random(SPLIT_SEED).getstate(),
)

In [22]:
# Build the token vocabulary from the training split only, then keep a handle
# on it for the lookups in the following cells.
TEXT.build_vocab(train_dataset)
vocab = TEXT.vocab

In [25]:
# String-to-index mapping: token -> integer id.
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001D38BE81880>>,
            {'<unk>': 0,
             '<pad>': 1,
             'eggs': 2,
             'i': 3,
             'like': 4,
             '!': 5,
             '.': 6,
             'and': 7,
             'ham': 8})

In [26]:
# Index-to-string list: position i holds the token whose id is i.
vocab.itos

['<unk>', '<pad>', 'eggs', 'i', 'like', '!', '.', 'and', 'ham']

In [29]:
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [30]:
def _example_length(example):
    """Return the number of tokens in an example's `data` field."""
    return len(example.data)


# Wrap both splits in batch iterators (batch size 2 each) that place their
# tensors on `device`; examples are keyed by token count for sorting.
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset),
    batch_sizes=(2, 2),
    sort_key=_example_length,
    device=device,
)

In [38]:
# Show every training batch: the padded token-id matrix and the label vector.
for batch in train_iter:
    inputs, targets = batch
    print("inputs:", inputs, "shape:", inputs.shape)
    print("targets:", targets, "shape:", targets.shape)

inputs: tensor([[1, 1, 2, 3, 4, 5],
        [3, 4, 2, 7, 8, 6]]) shape: torch.Size([2, 6])
targets: tensor([1, 0]) shape: torch.Size([2])


In [32]:
# Show every test batch: the padded token-id matrix and the label vector.
for batch in test_iter:
    inputs, targets = batch
    print("inputs:", inputs, "shape:", inputs.shape)
    print("targets:", targets, "shape:", targets.shape)

inputs: tensor([[8, 7, 0, 0, 0, 8, 0]]) shape: torch.Size([1, 7])
targets: tensor([1]) shape: torch.Size([1])


In [40]:
# Decode one padded training sequence back into its tokens.
# Fetch a batch from train_iter explicitly instead of reusing whatever `inputs`
# the most recently executed loop left behind: on a top-to-bottom run that
# leftover is the single-row test batch, and indexing row 1 raises IndexError.
train_inputs = next(iter(train_iter)).data
# [-1] selects the last row of the batch, so this also works for a one-row batch.
for token_id in train_inputs[-1]:
    # .item() turns the 0-dim tensor into a plain int before indexing the list.
    print(vocab.itos[token_id.item()])

i
like
eggs
and
ham
.
