In [175]:
import copy
import time
from collections import namedtuple

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torchtext
from torch import optim
from torch.optim import lr_scheduler
from torchtext.data import TabularDataset, Field, Iterator
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from tqdm import tqdm, tqdm_notebook

In [None]:
# This notebook follows these tutorials:
# https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/
# https://medium.com/@sonicboom8/sentiment-analysis-with-variable-length-sequences-in-pytorch-6241635ae130

In [70]:
# Lookup table from integer class id (0-15) to its topic name.
int2Label = dict(enumerate([
    'Miscellaneous',
    'Video games',
    'Language and literature',
    'Music',
    'Social sciences and society',
    'Sports and recreation',
    'Natural sciences',
    'Art and architecture',
    'History',
    'Warfare',
    'Engineering and technology',
    'Philosophy and religion',
    'Agriculture, food and drink',
    'Geography and places',
    'Mathematics',
    'Media and drama',
]))

In [94]:
# Reproducibility: seed every RNG used directly in this notebook. NumPy alone
# is not enough because the model's weights are initialised from torch's RNG.
SEED = 11747
np.random.seed(SEED)
torch.manual_seed(SEED)

# Text preprocessing
LOWER = False  # forwarded to the TEXT field's `lower` option

# Optimisation
LEARNING_RATE = 3e-4
STEP_SIZE = 5  # StepLR: epochs between learning-rate decays
GAMMA = 1.0    # StepLR: decay factor; 1.0 leaves the learning rate unchanged
NUMBER_EPOCHS = 20

# Problem size
N_CLASS = len(int2Label)

# Log destinations (summary log and per-batch verbose log)
LOG_FILE = "ConvNetClassificationTesting.txt"
VERBOSE_LOG_FILE = "VerboseConvNetClassificationTesting.txt"

In [97]:
# Open the summary log (f) and the verbose log (v) in "w+" mode, which
# truncates any file left over from a previous run.
f, v = (open(log_path, "w+") for log_path in (LOG_FILE, VERBOSE_LOG_FILE))

In [107]:
# torchtext's built-in "basic_english" tokenizer splits raw text into tokens.
tokenizer = get_tokenizer("basic_english")

# Text column: a token sequence, batched with the batch dimension first.
TEXT = Field(
    sequential=True,
    tokenize=tokenizer,
    lower=LOWER,
    batch_first=True,
)

# Label column: a single value per example used as-is (no vocabulary lookup),
# so the CSV is assumed to already contain integer class ids.
LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)

In [108]:
# Read the three CSV splits from the current directory. Each row is parsed as
# (label, text), in that column order.
train, val, test = TabularDataset.splits(
    path=".",
    train="topicclass_train.csv",
    validation="topicclass_valid_fixed.csv",
    test="topicclass_test.csv",
    format="csv",
    fields=[('label', LABEL), ('text', TEXT)],
)

In [109]:
# Build the token vocabulary from all three splits and attach the pretrained
# "fasttext.en.300d" vectors to it.
TEXT.build_vocab(
    train,
    val,
    test,
    vectors="fasttext.en.300d",
)

In [52]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Announce the selected device on stdout and record it in both log files.
print(f"device is: {device}")
f.write(f"device is: {device}\n")
v.write(f"device is: {device}\n")

In [110]:
# Training and validation iterators (batch sizes 32 and 64). `sort_key` tells
# torchtext how to order examples by token count wherever it sorts them.
train_loader, val_loader = Iterator.splits(
    (train, val),
    batch_sizes=(32, 64),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    device=device,
)

# Evaluation-only iterator. Iterator defaults to train=True, which reshuffles
# the data on every pass; disable shuffling and sorting so the predictions come
# out in the same order as the rows of the test CSV.
test_loader = Iterator(
    test,
    batch_size=64,
    device=device,
    train=False,
    shuffle=False,
    sort=False,
)

In [111]:
# Phase name -> batch iterator, as looked up by the training loop.
dataloaders = {'train': train_loader, 'val': val_loader}

# Number of *examples* per phase. len(loader) is the number of batches, which
# is the wrong denominator for per-example loss and accuracy averages.
dataset_sizes = {'train': len(train), 'val': len(val)}

# One record per epoch: training loss plus training / validation error rates.
Metric = namedtuple('Metric', ['loss', 'train_error', 'val_error'])

In [162]:
class ConvClassifier(nn.Module):
    """Two-layer 1-D convolutional text classifier over pretrained embeddings.

    Each token id is embedded as a pretrained vector (looked up without
    gradient tracking) plus a trainable per-token offset. The embedded
    sequence goes through two Conv1d -> BatchNorm1d -> ReLU stages, is
    max-pooled over the sequence axis and projected to class logits.

    Args:
        vocab: object exposing ``vectors``, a 2-D tensor of shape
            (vocab_size, embedding_size) with the pretrained embeddings.
        n_class: number of output classes.
        channels_first: output channels of the first convolution.
        channels_second: output channels of the second convolution.
        kernel_first: kernel width of the first convolution.
        kernel_second: kernel width of the second convolution.
    """

    def __init__(self, vocab, n_class, channels_first = 64, channels_second = 64, kernel_first = 2, kernel_second = 2):
        super().__init__()
        self.vocab_size, self.embedding_size = vocab.vectors.shape

        # Pretrained table, initialised from the vocabulary's vectors.
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, sparse = False)
        self.embedding.weight.data.copy_(vocab.vectors)

        # Trainable offset added on top of the pretrained vectors.
        # NOTE(review): nn.Embedding initialises its weights from N(0, 1), so
        # the offsets do not start at zero - confirm this is intended.
        self.embedding_delta = nn.Embedding(self.vocab_size, self.embedding_size, sparse = False)

        # padding = kernel - 1 pads both ends, so each convolution lengthens
        # the sequence by (kernel - 1) positions.
        self.conv1 = nn.Conv1d(in_channels=self.embedding_size, out_channels=channels_first, dilation = 1, kernel_size = kernel_first, padding = kernel_first - 1)
        self.bn1 = nn.BatchNorm1d(channels_first)

        self.conv2 = nn.Conv1d(in_channels=channels_first, out_channels=channels_second, dilation = 1, kernel_size = kernel_second, padding = kernel_second - 1)
        self.bn2 = nn.BatchNorm1d(channels_second)

        self.relu = nn.ReLU()

        self.fc = nn.Linear(channels_second, n_class)

    def forward(self, texts):
        """Compute class logits.

        Args:
            texts: integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, n_class) with unnormalised class scores.
        """
        # No gradient flows into the pretrained table; only the delta table
        # receives updates.
        with torch.no_grad():
            static_embeddings = self.embedding(texts)
        embeddings = static_embeddings + self.embedding_delta(texts)

        # Conv1d expects (batch, channels, length), hence the transpose.
        conv1_out = self.relu(self.bn1(self.conv1(embeddings.transpose(1, 2))))
        conv2_out = self.relu(self.bn2(self.conv2(conv1_out)))

        # Residual connection, only possible when both stages produce the same
        # shape (e.g. kernel_second == 1 with equal channel counts).
        if conv1_out.shape == conv2_out.shape:
            # Must be out-of-place: `+=` would overwrite the ReLU output that
            # autograd keeps for the backward pass and make backward() raise.
            conv2_out = conv2_out + conv1_out

        # Global max over the sequence axis -> (batch, channels_second).
        pooled_out = conv2_out.max(dim=2)[0]

        return self.fc(pooled_out)
        
        
        
        
        
        

In [176]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a training pass followed by a validation pass over the
    module-level ``dataloaders``. Progress is printed and written to the
    module-level log files ``f`` (summary) and ``v`` (verbose), and one
    ``Metric`` per epoch is appended to the module-level ``metrics`` list.
    Whenever validation accuracy improves, the weights are remembered and the
    whole model is saved to ``MobileNetModel_<n>.pt``.

    Args:
        model: network to train; called as ``model(batch.text)``.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch, after the
            training pass.
        num_epochs: number of epochs to run.

    Returns:
        ``model`` with the state dict of the best validation epoch loaded (or
        its initial weights if validation accuracy never rose above 0).
    """
    since = time.time()

    best_acc = 0.0
    # Snapshot the starting weights so the final load_state_dict always has
    # something to load, even if no epoch ever beats best_acc.
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(num_epochs):
        f.write('Epoch {}/{}\n'.format(epoch+1, num_epochs))
        v.write('Epoch {}/{}\n'.format(epoch+1, num_epochs))
        f.write('-' * 10)
        v.write('-' * 10)

        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            n_examples = 0  # examples actually seen in this phase

            f.write("\nstarting epoch {} for {} phase\n".format(epoch+1, phase))
            v.write("\nstarting epoch {} for {} phase\n".format(epoch+1, phase))
            print("starting epoch {} for {} phase".format(epoch+1, phase))

            for i, data in enumerate(tqdm_notebook(dataloaders[phase])):
                inputs = data.text.to(device)
                labels = data.label.to(device)
                batch_size = inputs.size(0)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in the training phase
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics, kept as plain Python numbers so later arithmetic
                # is never integer-tensor division
                batch_corrects = torch.sum(preds == labels).item()
                running_loss += loss.item() * batch_size
                running_corrects += batch_corrects
                n_examples += batch_size

                if (i % 300) == 0:
                    v.write("inputs size: {}\n".format(batch_size))
                    v.write("epoch {}, batch {},  loss : {}\n".format(epoch+1, i, loss.item()))
                    v.write("percent correct: {}\n".format(batch_corrects / batch_size))

            # Average over the examples seen; the loss was weighted by batch
            # size, so dividing by a batch count would inflate both numbers.
            # max(..., 1) keeps an empty loader from dividing by zero.
            denominator = max(n_examples, 1)
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects / denominator

            if phase == 'train':
                train_loss = epoch_loss
                train_error = 1 - epoch_acc
                scheduler.step()
            elif phase == 'val':
                val_error = 1 - epoch_acc
                metrics.append(Metric(loss=train_loss, train_error=train_error, val_error=val_error))

            f.write('{} Loss: {:.4f} Acc: {:.4f}\n'.format(phase, epoch_loss, epoch_acc))
            v.write('{} Loss: {:.4f} Acc: {:.4f}\n'.format(phase, epoch_loss, epoch_acc))
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # keep a copy of the best weights and checkpoint the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                # NOTE(review): the suffix is epoch + 2 while the logs number
                # epochs as epoch + 1 - confirm which numbering is intended.
                PATH = 'MobileNetModel_' + str(epoch+2) + '.pt'
                torch.save(model, PATH)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    f.write('Training complete in {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))
    v.write('Training complete in {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

    print('Best val Acc: {:.4f}'.format(best_acc))

    f.write('Best val Acc: {:.4f}'.format(best_acc))
    v.write('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [177]:
# Build the classifier on the selected device, then set up the cross-entropy
# loss, the Adam optimizer and the step learning-rate schedule.
model = ConvClassifier(
    vocab=TEXT.vocab,
    n_class=N_CLASS,
    channels_first=64,
    channels_second=64,
    kernel_first=2,
    kernel_second=2,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
exp_lr_scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=STEP_SIZE,
    gamma=GAMMA,
)


In [178]:
def training_plot(metrics):
    """Plot per-epoch validation and training error and save the figure.

    Args:
        metrics: sequence of records exposing ``val_error`` and
            ``train_error`` (one record per epoch).

    Side effects:
        Draws on matplotlib figure 1 and writes ``Mobile_v2_NetPlot.png``.
    """
    # float() unwraps 0-d tensors as well as plain numbers, so the values are
    # always something matplotlib can plot (a CUDA tensor is not).
    val_errors = [float(m.val_error) for m in metrics]
    train_errors = [float(m.train_error) for m in metrics]

    plt.figure(1)
    plt.plot(val_errors, 'b')
    plt.plot(train_errors, 'r')
    plt.title('Val Error (blue) Train Error (red)')
    plt.savefig('Mobile_v2_NetPlot.png')


# Per-epoch history; train_model appends one Metric per epoch to this list.
metrics = []
try:
    model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=NUMBER_EPOCHS)
finally:
    # Close the logs even when training is interrupted or raises, so buffered
    # output is flushed and the file handles are not leaked.
    f.close()
    v.close()

# Save the model returned by train_model (the original referenced an
# undefined name here).
torch.save(model, "bestMobile_v2_NetModel.pt")

training_plot(metrics)


Epoch 1/20
----------
starting epoch 1 for train phase


HBox(children=(IntProgress(value=0, max=7935), HTML(value='')))

KeyboardInterrupt: 