In [19]:
import random
import numpy as np
import torch
from torchtext import datasets
from torchtext.data import TabularDataset
from torchtext import data
from torchtext.data import Field

In [7]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [20]:
TEXT = data.Field(tokenize = None, batch_first = True)
LABEL = data.LabelField(dtype = torch.float)



In [9]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.3MB/s]


In [15]:
train_data[0].__dict__.keys()

dict_keys(['text', 'label'])

In [18]:
train_data[0].label

'pos'

In [14]:
for data in train_data:
    print(data)
    break

<torchtext.data.example.Example object at 0x7f53d2d92d90>


In [None]:
tv_datafields = [("review_id", None), # we won't be needing the id, so we pass in None as the field
                 ("lst_mots", TEXT), 
                 ("note", LABEL)]

trn, vld = TabularDataset.splits(
               path="data", # the root directory where the data lies
               train='train.csv', validation="valid.csv",
               format='csv',
               skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=tv_datafields)

In [24]:
import io
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.utils import unicode_csv_reader

In [30]:
def csv_iterator(data_path, ngrams):
    #tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            #yield ngrams_iterator(tokenizer(tokens), ngrams)
            yield ngrams_iterator(tokens, ngrams)

In [40]:
data_path = '../data/csv/train_1000.csv'
ngrams = 3
vocab = build_vocab_from_iterator(csv_iterator(data_path, ngrams))

1001lines [00:00, 4068.46lines/s]


In [41]:
print(vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f53d5465f40>>, {'<unk>': 0, '<pad>': 1, "'": 2, ' ': 3, "  '": 4, "' ,": 5, "' ,  ": 6, ',': 7, ',  ': 8, ",   '": 9, 'e': 10, 'a': 11, 'r': 12, 'n': 13, 'i': 14, 's': 15, 't': 16, 'o': 17, "e '": 18, "e ' ,": 19, 'u': 20, 'l': 21, 'm': 22, 'c': 23, 'p': 24, "s '": 25, "s ' ,": 26, 'é': 27, 'e n': 28, 'd': 29, 'n t': 30, 'g': 31, "' p": 32, "  ' p": 33, "t '": 34, 'a n': 35, "t ' ,": 36, 'r e': 37, 'e s': 38, 'b': 39, 'v': 40, 'f': 41, 'o n': 42, "' a": 43, 'e r': 44, 'h': 45, "  ' a": 46, 't e': 47, 'l e': 48, "n t '": 49, "n '": 50, "e s '": 51, "' s": 52, "n ' ,": 53, "' c": 54, "  ' s": 55, 'r a': 56, "  ' c": 57, 'e n t': 58, "r '": 59, 'i n': 60, "r ' ,": 61, 'm e': 62, 't i': 63, "' r": 64, "  ' r": 65, 'i s': 66, "' d": 67, "  ' d": 68, 'p a': 69, 'o u': 70, 'i e': 71, "' b": 72, "  ' b": 73, "' m": 74, "' f": 75, "  ' m": 76, 'a i': 77, "  ' f": 78, 'r i': 79, 's e': 80, 'u r': 81, 'a r':

In [35]:
print(vocab.freqs.most_common(50))

[("'", 89270), (' ', 44636), (',', 43635), ("' ,", 43635), (',  ', 43635), ("  '", 43635), ('e', 37137), ('a', 23980), ('r', 23652), ('n', 23301), ('i', 23267), ('s', 22180), ('t', 19749), ('o', 16294), ("e '", 12804), ('u', 12288), ('l', 11051), ('m', 10296), ('c', 9976), ('p', 9775), ("s '", 9524), ('é', 8375), ('e n', 6241), ('d', 6231), ('n t', 5886), ('g', 5524), ("' p", 5419), ("t '", 5194), ('a n', 5160), ('r e', 4938), ('e s', 4767), ('b', 4679), ('v', 4501), ('f', 4433), ('o n', 4283), ("' a", 4012), ('e r', 4011), ('h', 3989), ('t e', 3755), ('l e', 3489), ("n '", 3404), ("' s", 3345), ("' c", 3321), ('r a', 3255), ("r '", 3210), ('i n', 3167), ('m e', 3052), ('t i', 3007), ("' r", 2993), ('i s', 2909)]
