In [1]:
import torch
from torch import nn
import torch.nn.functional as fun
import pandas as pd
import numpy as np

### Hypothesis:
A good starting point is to categorize art based on the different continents which latin america lies on. Since the dataset contains 2x the amount of mexican art than ALL OTHER latin american art combined, continent is a good feature to split and train on due to geographical difference and logistic/technical restrictions. Since latin america is only distributed across two continents, it is a good binary target due to lack of data in other features.

## Creating the Data Pipeline

In [25]:
target = pd.read_csv('../data_samples/continents.csv')

In [26]:
feature = pd.read_csv('../data_samples/results/en_titles.csv', index_col=0)

In [29]:
from TitleDataset import TitleDataset
dataset = TitleDataset()
batch_size = 16
validation_split = .2
random_seed= 42
size = len(dataset)
indices = list(range(size))
split = int(np.floor(validation_split * size))
shuffle = True
if shuffle:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices, test_indices = indices[split:], indices[:int(np.floor(split/2))], indices[int(np.floor(split/2)):split]

In [30]:
from torch.utils.data import SubsetRandomSampler, DataLoader

In [31]:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
test_sampler = SubsetRandomSampler(test_indices)

token_loader = DataLoader(dataset, batch_size=1,
                          sampler=train_sampler)

### Tokenizer for Train/Test Data

In [32]:
## Text Classification Model
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [33]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(DEVICE), text_list.to(DEVICE), offsets.to(DEVICE)

train_loader = DataLoader(dataset, batch_size=batch_size,
                          sampler=train_sampler, collate_fn=collate_batch)
validation_loader = DataLoader(dataset, batch_size=batch_size,
                               sampler=valid_sampler,collate_fn=collate_batch)
test_loader = DataLoader(dataset, batch_size=batch_size,
                         sampler=valid_sampler, collate_fn=collate_batch)

In [34]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
token_iter = token_loader
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(_[0])
vocab = build_vocab_from_iterator(yield_tokens(token_iter))

263lines [00:00, 7143.01lines/s]


### Training the Model

In [35]:
text_pipeline = lambda x: [vocab.__getitem__(tokenizer(y)[0]) for y in x.split(' ')]
label_pipeline = lambda x: 0 if x == 'North America' else 1

In [36]:
text_pipeline('Mexico Serafina ?')

[12, 335, 0]

In [37]:
from TorchTextModel import TorchTextModel

In [38]:
num_class = 2
vocab_size = len(vocab)
emsize=128
model = TorchTextModel(vocab_size, emsize, num_class).to(device=DEVICE)

In [39]:
import time
from torch import optim
EPOCHS = 10
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
total_accu = None

In [40]:

def train(dataloader):
    model.train()
    total_acc, total_count = 0,0
    log_interval = 1
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        prediction = model(text, offsets)
        loss = criterion(prediction, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (prediction.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [41]:
best_val = 0
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_loader)
    accu_val = evaluate(validation_loader)
    if accu_val > best_val:
        print("Current Model ________")
        best_val = accu_val
        model = model.to("cpu")
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |     1/   17 batches | accuracy    0.656
| epoch   1 |     2/   17 batches | accuracy    0.625
| epoch   1 |     3/   17 batches | accuracy    0.875
| epoch   1 |     4/   17 batches | accuracy    0.750
| epoch   1 |     5/   17 batches | accuracy    0.812
| epoch   1 |     6/   17 batches | accuracy    0.562
| epoch   1 |     7/   17 batches | accuracy    0.625
| epoch   1 |     8/   17 batches | accuracy    0.438
| epoch   1 |     9/   17 batches | accuracy    0.562
| epoch   1 |    10/   17 batches | accuracy    0.812
| epoch   1 |    11/   17 batches | accuracy    0.688
| epoch   1 |    12/   17 batches | accuracy    0.688
| epoch   1 |    13/   17 batches | accuracy    0.750
| epoch   1 |    14/   17 batches | accuracy    0.625
| epoch   1 |    15/   17 batches | accuracy    0.688
| epoch   1 |    16/   17 batches | accuracy    0.429
Current Model ________
-----------------------------------------------------------
| end of epoch   1 | time:  0.06s | valid accuracy   

In [42]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_loader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.750


In [43]:
continent = {0: "North America", 1: "South America"}

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item()

In [68]:
ex_text_str = "Fontamara"

print("This is a %s artwork title"%continent[predict(ex_text_str, text_pipeline)])

This is a South America artwork title


In [69]:
torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': accu_val, 'embedding': model.embedding}, f='title_to_continent_torchtext')

In [70]:
torch.save(model, f='continent_predictor')