# Program Config

In [1]:

# Mount Google Drive inside the Colab runtime at /content/drive.
# The DATA_FOLDER / MODELS_FOLDER paths configured below point into this mount.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Datasets can be downloaded from the shared folder below (a Ryerson account is required):
# https://drive.google.com/drive/folders/1NCLqicCgxQ8zx4mlXi9r-fl2Sd6k1uVQ?usp=sharing

# Name of the dataset to train on; used as the default of --dataset and as a sub-folder name.
DATASET='yelp_review_full'#['ag_news','yelp_review_full','yelp_review_polarity']
# Text preprocessing strategy; used as the default of --preproces_type.
PREPROCES_TYPE='add_pos'#['lower','denoiser','add_pos','add_hashtag','add_NOT']
# Root folder that holds one sub-folder per dataset.
DATA_FOLDER = 'drive/MyDrive/NLP/datasets'
# Root folder where model snapshots are written (one sub-folder per dataset).
MODELS_FOLDER = 'drive/MyDrive/NLP/models/vdcnn'
def get_args():
    """Build the run configuration with argparse.

    Defaults are taken from the module-level DATASET, PREPROCES_TYPE,
    DATA_FOLDER and MODELS_FOLDER constants. Arguments that are not declared
    here are ignored because ``parse_known_args`` is used.

    Returns:
        argparse.Namespace: the parsed options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, default=DATASET, help="'ag_news' or 'yelp_review_full' or 'yelp_review_polarity'")
    parser.add_argument("--preproces_type", type=str, default=PREPROCES_TYPE, help="'lower' or 'denoiser' or 'add_pos' or 'add_hashtag' or 'add_NOT'")
    parser.add_argument("--model_folder", type=str, default=MODELS_FOLDER+"/"+DATASET, help="result directory")
    parser.add_argument("--data_folder", type=str, default=DATA_FOLDER+"/"+DATASET, help="address of datasets directory")
    parser.add_argument("--depth", type=int, choices=[9, 17, 29, 49], default=29, help="Depth of the network tested in the paper (9, 17, 29, 49)")
    parser.add_argument("--maxlen", type=int, default=1024, help="max length of input string")
    parser.add_argument('--shortcut', action='store_true', default=False, help="enable residual shortcut connections")
    parser.add_argument("--batch_size", type=int, default=128, help="number of example read by the gpu")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--solver", type=str, default="sgd", help="'sgd' or 'adam'")
    parser.add_argument("--lr", type=float, default=0.01)
    # The interval is handed to StepLR as its step size; the decay factor is --gamma.
    parser.add_argument("--lr_halve_interval", type=float, default=10, help="Number of epochs between learning rate decays (0 disables the scheduler)")
    parser.add_argument("--momentum", type=float, default=0.9, help="Momentum used by the sgd solver")
    parser.add_argument("--snapshot_interval", type=int, default=5, help="Save a model snapshot every this many epochs")
    parser.add_argument("--gamma", type=float, default=0.9, help="Multiplicative learning rate decay factor")
    parser.add_argument("--nthreads", type=int, default=4, help="Number of data loading workers / LMDB readers")

    # parse_known_args tolerates extra entries in sys.argv instead of exiting.
    args, _ = parser.parse_known_args()
    return args

#import

In [3]:
import os
import re
import lmdb
import pickle
import sys
import csv
import tarfile
import shutil
import hashlib
import itertools
import numpy as np
import pickle as pkl
import tqdm
import argparse
import tarfile
import gzip

from tqdm import tqdm
from urllib.request import urlretrieve
from urllib.error import URLError
from urllib.error import HTTPError
from collections import Counter
from sklearn import utils, metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import os, subprocess
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Multiprocessing workaround: raise the soft limit on open file descriptors to 2048
# while keeping the current hard limit.
# NOTE(review): presumably needed so DataLoader worker processes do not run out of
# file handles — confirm. setrlimit raises if the hard limit is below 2048.
import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))
# get device to calculate on (either CPU or GPU with minimum memory load)
def get_gpu_memory_map():
    """Ask ``nvidia-smi`` how much memory is in use on every GPU.

    Returns:
        dict: position of the GPU in the nvidia-smi listing -> value reported
        by ``--query-gpu=memory.used`` (units stripped), as an int.
    """
    command = [
        'nvidia-smi', '--query-gpu=memory.used',
        '--format=csv,nounits,noheader'
    ]
    raw_output = subprocess.check_output(command, encoding='utf-8')

    # nvidia-smi prints one line per GPU; the line position becomes the key.
    per_gpu_lines = raw_output.strip().split('\n')
    return {index: int(value) for index, value in enumerate(per_gpu_lines)}

def get_device():
    """Choose the torch device string to run on, print it and return it.

    Returns:
        str: "cpu" when CUDA is unavailable, otherwise "cuda:<i>" where <i> is
        the GPU with the smallest used memory according to get_gpu_memory_map().
    """
    if not torch.cuda.is_available():
        device = "cpu"
    else:
        used_by_gpu = get_gpu_memory_map()
        # min over the dict keys, ordered by their used-memory value
        least_loaded = min(used_by_gpu, key=used_by_gpu.get)
        device = "cuda:%d" % least_loaded

    print("Device:", device)
    return device

#text preprocessing

In [4]:
import nltk
import string
# Download the NLTK resources at import time (network access required on first run).
# The Preprocessing class below uses the stop-word list, nltk.tokenize.word_tokenize
# and nltk.pos_tag.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download("averaged_perceptron_tagger")
# NOTE(review): the four corpora below are not referenced by the code visible in
# this notebook — confirm whether they are still needed.
nltk.download('tagsets')
nltk.download('treebank')
nltk.download('brown')
nltk.download('universal_tagset')

class Preprocessing():
    """Sentence-level text preprocessing with a selectable strategy.

    ``Preprocess_Type`` selects what ``transform`` applies to each sentence:
      - 'lower':       lower-case the sentence
      - 'denoiser':    drop tokens by POS tag and stop-word list, then stem
      - 'add_pos':     same filtering, each stem is suffixed with '@<pos tag>'
      - 'add_hashtag': same filtering, capitalised words and words that follow
                       a '#' token are prefixed with '#'
      - 'add_NOT':     words after a negation word are prefixed with 'not_'
                       until the next punctuation token
    """

    def __init__(self,Preprocess_Type='lower'):
        self.Preprocess_Type = Preprocess_Type
        self.stop_list = set(nltk.corpus.stopwords.words('english')+["#","@"])#english stop words
        self.temp_tag  = ['PRP','VBZ','CC','POS','IN','DT','TO','PRP$']#noise words
        self.stemmer   = nltk.stem.PorterStemmer()
        self.word_tokenize  = nltk.tokenize.word_tokenize
        self.pos_tag   = nltk.pos_tag
        self.punctuation = list(string.punctuation)
        self.neg_words = ["n't", "not", "no", "never"]#used in Add_Not preprocessing

    def transform(self, sentences):
        """Apply the configured preprocessing to every sentence.

        sentences: list(str)
        output: list(str)

        Raises:
            ValueError: if ``Preprocess_Type`` is not one of the supported names
                (previously this case silently returned None).
        """
        handlers = {
            'lower': lambda s: s.lower(),
            'denoiser': self.denoiser,
            'add_pos': self.add_pos,
            'add_hashtag': self.add_hashtag,
            'add_NOT': self.add_Not,
        }
        handler = handlers.get(self.Preprocess_Type)
        if handler is None:
            raise ValueError(
                "Unknown Preprocess_Type {!r}; expected one of {}".format(
                    self.Preprocess_Type, sorted(handlers)))
        return [handler(s) for s in sentences]

    def denoiser(self,text):
        """Remove tokens with a tag in ``temp_tag`` and stop words, then stem."""
        tagged = self.pos_tag(self.word_tokenize(text))
        # remove some extra words depending on their pos tag
        words = [word.lower() for word, tag in tagged if tag not in self.temp_tag]
        # remove stop words and stem what is left
        words = [self.stemmer.stem(word) for word in words if word not in self.stop_list]
        return " ".join(words)

    def add_pos(self,text):
        """Like ``denoiser`` but every kept stem becomes '<stem>@<tag>' in lower case."""
        tagged = self.pos_tag(self.word_tokenize(text))
        words = [
            (self.stemmer.stem(word) + '@' + tag).lower()
            for word, tag in tagged
            if (tag not in self.temp_tag) and (word.lower() not in self.stop_list)
        ]
        return " ".join(words)

    def add_hashtag(self,text):
        """Filter and stem tokens, prefixing '#' to capitalised or '#'-preceded words."""
        tagged = self.pos_tag(self.word_tokenize(text))
        kept = []
        hashtag = False
        for word, tag in tagged:
            if word == '#':
                hashtag = True  # the next kept token gets the '#' prefix
            passes_filter = (tag not in self.temp_tag) and (word not in self.stop_list)
            # a pending hashtag forces the next non-'#' token to be kept
            if passes_filter or (word != '#' and hashtag):
                if word[0].isupper():
                    hashtag = True  # capitalised words are hashtagged too
                new_word = self.stemmer.stem(word).lower()
                if hashtag:
                    kept.append('#' + new_word)
                    hashtag = False
                else:
                    kept.append(new_word)
        return " ".join(kept)

    def add_Not(self,text):
        """Prefix 'not_' to words between a negation word and the next punctuation."""
        tagged = self.pos_tag(self.word_tokenize(text))
        # remove some extra words depending on their pos tag, then stem
        words = [word.lower() for word, tag in tagged if tag not in self.temp_tag]
        words = [self.stemmer.stem(word) for word in words]

        negating = False  # True while we are inside a negated span
        not_stem = []
        for word in words:
            if negating:
                if word in self.punctuation:
                    # punctuation is kept unchanged and ends the negated span below
                    not_stem.append(word)
                elif word not in self.neg_words:
                    not_stem.append("not_" + word)
                # a repeated negation word inside the span is dropped
            else:
                not_stem.append(word)

            if word in self.neg_words:
                negating = True
            if word in self.punctuation:
                negating = False

        return " ".join(not_stem)

 # valid alphabet of characters
class CharVectorizer():
    """Turns strings into lists of character indices of a fixed length.

    Index 0 is shared by the padding symbol and the space character, index 1 is
    used for characters outside ``alphabet``, and alphabet characters are
    numbered from 2 in the order they appear.
    """

    def __init__(self, maxlen=1024, padding='pre', truncating='pre', alphabet="""abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’"/|$%ˆ&*˜‘+=<>()[]{}#@"""):

        self.alphabet = alphabet
        self.maxlen = maxlen
        self.padding = padding
        self.truncating = truncating

        self.char_dict = {'_pad_': 0, '_unk_': 1, ' ': 0}
        self.char_dict.update({char: index for index, char in enumerate(self.alphabet, start=2)})

    def transform(self,sentences):
        """
        sentences: list of string
        returns: list of sequences, one per sentence; a sequence is a list of int

        When ``maxlen`` is truthy every sequence is cropped or padded to exactly
        ``maxlen`` entries; otherwise sequences keep their natural length.
        """
        unknown = self.char_dict["_unk_"]
        pad = self.char_dict['_pad_']
        sequences = []

        for sentence in sentences:
            seq = [self.char_dict.get(char, unknown) for char in sentence]

            if self.maxlen:
                length = len(seq)

                if length > self.maxlen:
                    # 'pre' drops characters from the beginning (keeps the tail),
                    # 'post' drops characters from the end (keeps the head)
                    if self.truncating == 'pre':
                        seq = seq[-self.maxlen:]
                    elif self.truncating == 'post':
                        seq = seq[:self.maxlen]
                elif length < self.maxlen:
                    filler = [pad] * (self.maxlen - length)
                    # 'pre' pads in front of the text, 'post' pads after it
                    if self.padding == 'pre':
                        seq = filler + seq
                    elif self.padding == 'post':
                        seq = seq + filler

            sequences.append(seq)

        return sequences

    def get_params(self):
        """Return the instance attribute dictionary (the live ``vars(self)`` mapping)."""
        return vars(self)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


#Datasets

In [1]:
class AgNews(object):
    """Reader for the 'ag_news' CSV files (4 classes).

    credit goes to Xiang Zhang:
    https://scholar.google.com/citations?hl=en&user=n4QjVfoAAAAJ&view_op=list_works&sortby=pubdate
    """
    def __init__(self):
        self.data_name = 'ag_news'
        self.data_folder = "{}/{}".format(DATA_FOLDER, self.data_name)
        self.n_classes = 4
        self.epoch_size = 15

    def _generator(self, file_name):
        """Yield ``(sentence, label)`` for every row of ``file_name`` in the data folder.

        Rows are read as (label, title, description). The sentence is the title
        and description joined by a space; a column missing from a short row is
        skipped instead of being rendered as the text "None". Labels in the file
        are 1-based and are shifted to start at 0.
        """
        data_path = os.path.join(self.data_folder, file_name)
        # newline='' is what the csv module expects so that line breaks inside
        # quoted fields are parsed by csv itself.
        with open(data_path, mode='r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f, fieldnames=['label', 'title', 'description'], quotechar='"')
            for line in reader:
                parts = (line['title'], line['description'])
                sentence = " ".join(part for part in parts if part is not None)
                label = int(line['label']) - 1
                yield sentence, label

    def load_train_data(self):
        """Generator over the rows of train.csv."""
        return self._generator(file_name="train.csv")

    def load_test_data(self):
        """Generator over the rows of test.csv."""
        return self._generator(file_name="test.csv")

class YelpReview(object):
    """Reader for the 'yelp_review_full' CSV files (5 classes).

    credit goes to Xiang Zhang:
    https://scholar.google.com/citations?hl=en&user=n4QjVfoAAAAJ&view_op=list_works&sortby=pubdate
    """
    def __init__(self):
        self.data_name = 'yelp_review_full'

        self.data_folder = "{}/{}".format(DATA_FOLDER, self.data_name)
        self.n_classes = 5
        self.epoch_size = 15

    def _generator(self, file_name):
        """Yield ``(sentence, label)`` for every row of ``file_name`` in the data folder.

        Rows are read with the field names (label, title, description). When a
        row has only two columns (label and text) the missing third column is
        skipped; it used to be appended to the text as the literal "None".
        Labels in the file are 1-based and are shifted to start at 0.
        """
        data_path = os.path.join(self.data_folder, file_name)
        # newline='' is what the csv module expects so that line breaks inside
        # quoted fields are parsed by csv itself.
        with open(data_path, mode='r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f, fieldnames=['label', 'title', 'description'], quotechar='"')
            for line in reader:
                parts = (line['title'], line['description'])
                sentence = " ".join(part for part in parts if part is not None)
                label = int(line['label']) - 1
                yield sentence, label

    def load_train_data(self):
        """Generator over the rows of train.csv."""
        return self._generator("train.csv")

    def load_test_data(self):
        """Generator over the rows of test.csv."""
        return self._generator("test.csv")

class YelpPolarity(object):
    """Reader for the 'yelp_review_polarity' CSV files (2 classes).

    credit goes to Xiang Zhang:
    https://scholar.google.com/citations?hl=en&user=n4QjVfoAAAAJ&view_op=list_works&sortby=pubdate
    """
    def __init__(self):

        self.data_name ='yelp_review_polarity'
        self.data_folder = "{}/{}".format(DATA_FOLDER, self.data_name)
        self.n_classes = 2
        self.epoch_size = 15

    def _generator(self, file_name):
        """Yield ``(sentence, label)`` for every row of ``file_name`` in the data folder.

        Rows are read with the field names (label, title, description). When a
        row has only two columns (label and text) the missing third column is
        skipped; it used to be appended to the text as the literal "None".
        Labels in the file are 1-based and are shifted to start at 0.
        """
        data_path = os.path.join(self.data_folder, file_name)
        # newline='' is what the csv module expects so that line breaks inside
        # quoted fields are parsed by csv itself.
        with open(data_path, mode='r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f, fieldnames=['label', 'title', 'description'], quotechar='"')
            for line in reader:
                parts = (line['title'], line['description'])
                sentence = " ".join(part for part in parts if part is not None)
                label = int(line['label']) - 1
                yield sentence, label

    def load_train_data(self):
        """Generator over the rows of train.csv."""
        return self._generator("train.csv")

    def load_test_data(self):
        """Generator over the rows of test.csv."""
        return self._generator("test.csv")

#load data

In [6]:
def load_datasets(name="ag_news"):
    """Instantiate the dataset reader registered under ``name``.

    Returns an AgNews, YelpReview or YelpPolarity instance for 'ag_news',
    'yelp_review_full' and 'yelp_review_polarity' respectively, and None for
    any other name.
    """
    if name == 'ag_news':
        return AgNews()
    if name == 'yelp_review_full':
        return YelpReview()
    if name == 'yelp_review_polarity':
        return YelpPolarity()
    return None


class TupleLoader(Dataset): #torch.utils.data.Dataset
    """Dataset that reads (text indices, label) pairs from an LMDB environment.

    Entries are looked up under the keys 'txt-%09d' and 'lab-%09d'; the number
    of samples is stored under the key 'nsamples'.
    """

    def __init__(self, path=""):
        self.path = path

        # NOTE: relies on the module-level `opt` namespace (created in the Main
        # cell) for the number of LMDB readers.
        self.env = lmdb.open(path, max_readers=opt.nthreads, readonly=True, lock=False, readahead=False, meminit=False)
        self.txn = self.env.begin(write=False)

    def __len__(self):
        # Convert the NumPy scalar to a plain Python int.
        return int(list_from_bytes(self.txn.get('nsamples'.encode()), int)[0])

    def __getitem__(self, i):
        # `np.int` was removed from NumPy; the builtin `int` it aliased selects
        # the same default integer dtype that np.array() used when writing.
        xtxt = list_from_bytes(self.txn.get(('txt-%09d' % i).encode()), int)
        lab = list_from_bytes(self.txn.get(('lab-%09d' % i).encode()), int)[0]
        return xtxt, lab

def list_to_bytes(l):
    """Serialise ``l`` by converting it to a NumPy array and returning its raw bytes."""
    as_array = np.array(l)
    return as_array.tobytes()


def list_from_bytes(string, dtype=int):
    """Interpret a bytes buffer as a 1-D NumPy array of ``dtype``.

    The default used to be ``np.int``, an alias of the builtin ``int`` that was
    removed from NumPy; the builtin selects the same default integer dtype.

    Args:
        string: bytes-like buffer, e.g. produced by ``list_to_bytes``.
        dtype: NumPy dtype (or Python type) of the stored elements.

    Returns:
        numpy.ndarray: array view over the buffer.
    """
    return np.frombuffer(string, dtype=dtype)

# VDCNN

In [7]:
class BasicConvResBlock(nn.Module):
    """Conv1d -> BatchNorm1d -> ReLU -> Conv1d -> BatchNorm1d, with an optional residual.

    When ``shortcut`` is True the block input (passed through ``downsample``
    if one is given) is added to the second batch-norm output before the final
    ReLU.
    """

    def __init__(self, input_dim=128, n_filters=256, kernel_size=3, padding=1, stride=1, shortcut=False, downsample=None):
        super().__init__()

        self.downsample = downsample
        self.shortcut = shortcut

        # both convolutions share the same kernel geometry
        conv_kwargs = dict(kernel_size=kernel_size, padding=padding, stride=stride)
        self.conv1 = nn.Conv1d(input_dim, n_filters, **conv_kwargs)
        self.bn1 = nn.BatchNorm1d(n_filters)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(n_filters, n_filters, **conv_kwargs)
        self.bn2 = nn.BatchNorm1d(n_filters)

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        if self.shortcut:
            # project the input first when a downsample module was supplied
            residual = x if self.downsample is None else self.downsample(x)
            out += residual

        return self.relu(out)


class VDCNN(nn.Module):
    """VDCNN classifier: embedding, four stages of conv blocks, three linear layers.

    Args:
        n_classes: size of the output layer.
        num_embedding: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        depth: one of 9, 17, 29, 49; selects how many conv blocks each stage has.
        n_fc_neurons: width of the two hidden fully connected layers.
        shortcut: forwarded to every BasicConvResBlock to enable residual links.

    Raises:
        ValueError: if ``depth`` is not a supported value.
    """

    # depth -> number of conv blocks in the 64, 128, 256 and 512 filter stages
    _BLOCKS_PER_DEPTH = {
        9: (1, 1, 1, 1),
        17: (2, 2, 2, 2),
        29: (5, 5, 2, 2),
        49: (8, 8, 5, 3),
    }

    def __init__(self, n_classes=2, num_embedding=141, embedding_dim=16, depth=9, n_fc_neurons=2048, shortcut=False):
        if depth not in self._BLOCKS_PER_DEPTH:
            # used to surface as an UnboundLocalError further down
            raise ValueError("depth must be one of {}, got {!r}".format(sorted(self._BLOCKS_PER_DEPTH), depth))

        super(VDCNN, self).__init__()

        self.embed = nn.Embedding(num_embedding,embedding_dim, padding_idx=0, max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False)

        layers = [nn.Conv1d(embedding_dim, 64, kernel_size=3, padding=1)]

        stages = list(zip((64, 128, 256, 512), self._BLOCKS_PER_DEPTH[depth]))
        in_channels = 64
        for stage_index, (n_filters, n_blocks) in enumerate(stages):
            # A stage that widens the channels needs a 1x1 projection for the shortcut.
            downsample = None
            if in_channels != n_filters:
                downsample = nn.Sequential(nn.Conv1d(in_channels, n_filters, kernel_size=1, stride=1, bias=False), nn.BatchNorm1d(n_filters))
            layers.append(BasicConvResBlock(input_dim=in_channels, n_filters=n_filters, kernel_size=3, padding=1, shortcut=shortcut, downsample=downsample))
            for _ in range(n_blocks - 1):
                layers.append(BasicConvResBlock(input_dim=n_filters, n_filters=n_filters, kernel_size=3, padding=1, shortcut=shortcut))
            # Every stage except the last one is followed by a stride-2 max pool.
            if stage_index < len(stages) - 1:
                layers.append(nn.MaxPool1d(kernel_size=3, stride=2, padding=1))
            in_channels = n_filters

        # Fixed-size output (8 positions x 512 channels) regardless of input length.
        layers.append(nn.AdaptiveMaxPool1d(8))

        fc_layers = [
            nn.Linear(8*512, n_fc_neurons), nn.ReLU(),
            nn.Linear(n_fc_neurons, n_fc_neurons), nn.ReLU(),
            nn.Linear(n_fc_neurons, n_classes),
        ]

        self.layers = nn.Sequential(*layers)
        self.fc_layers = nn.Sequential(*fc_layers)

        self.__init_weights()

    def __init_weights(self):
        """Kaiming-normal initialise every Conv1d weight and zero its bias."""
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, mode='fan_in')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Map a batch of index sequences to class scores (logits).

        ``x`` is indexed by the embedding layer, so it holds integer indices;
        the code below treats it as (batch, sequence).
        """
        out = self.embed(x)               # (batch, sequence, embedding_dim)
        out = out.transpose(1, 2)         # (batch, embedding_dim, sequence) for Conv1d

        out = self.layers(out)            # convolutional layers (feature extraction)

        out = out.view(out.size(0), -1)   # flatten to (batch, 8*512)

        out = self.fc_layers(out)         # fully connected layers (prediction)

        return out


In [8]:
####################################################################################################################################
def train(epoch,net,dataset,device,msg="val/test",optimize=False,optimizer=None,scheduler=None,criterion=None):
    """Run one pass over ``dataset``: a training epoch if ``optimize`` else an evaluation.

    Running accuracy-style metrics (from the module-level ``list_metrics``) and
    the mean loss are shown on a tqdm progress bar.

    Args:
        epoch: epoch number, only used in the progress bar description.
        net: the model; its last parameter's length is used as the class count.
        dataset: iterable of (inputs, labels) tensor batches that supports len().
        device: device the batches are moved to.
        msg: label shown in the progress bar.
        optimize: when True, back-propagate and step ``optimizer``.
        optimizer: required when ``optimize`` is True.
        scheduler: optional LR scheduler, stepped once at the end of the pass.
        criterion: loss function taking (logits, labels).

    Raises:
        ValueError: if ``criterion`` is missing, or ``optimize`` is requested
            without an ``optimizer``.
    """
    if criterion is None:
        raise ValueError("train() requires a criterion")
    if optimize and optimizer is None:
        raise ValueError("train() requires an optimizer when optimize=True")

    net.train() if optimize else net.eval()

    epoch_loss = 0
    # the last parameter is the bias of the output layer: one entry per class
    nclasses = len(list(net.parameters())[-1])
    cm = np.zeros((nclasses,nclasses), dtype=int)

    with tqdm(total=len(dataset),desc="Epoch {} - {}".format(epoch, msg)) as pbar:
        for iteration, (tx, ty) in enumerate(dataset):

            inputs = tx.to(device)
            targets = ty.to(device)

            if optimize:
                optimizer.zero_grad()

            # Build the autograd graph only when we are going to back-propagate;
            # evaluation passes used to track gradients for nothing.
            with torch.set_grad_enabled(optimize):
                out = net(inputs)
                loss = criterion(out, targets)

            ty_prob = F.softmax(out.detach(), 1) # probabilities

            #metrics
            y_true = targets.detach().cpu().numpy()
            y_pred = ty_prob.max(1)[1].cpu().numpy()

            cm += metrics.confusion_matrix(y_true, y_pred, labels=range(nclasses))
            dic_metrics = get_metrics(cm, list_metrics)

            epoch_loss += loss.item()
            dic_metrics['logloss'] = epoch_loss/(iteration+1)

            if optimize:
                loss.backward()
                optimizer.step()
                dic_metrics['lr'] = optimizer.state_dict()['param_groups'][0]['lr']

            pbar.update(1)
            pbar.set_postfix(dic_metrics)

    if scheduler:
        scheduler.step()
####################################################################################################################
def predict(net,dataset,device,msg="prediction"):
    """Compute class probabilities for every batch of ``dataset``.

    Args:
        net: the model, switched to eval mode.
        dataset: iterable of (inputs, labels) tensor batches that supports len().
        device: device the batches are moved to.
        msg: label shown in the progress bar.

    Returns:
        tuple: (probabilities concatenated along the batch axis,
        true labels reshaped to a column of shape (-1, 1)), both NumPy arrays.
    """
    net.eval()

    y_probs, y_trues = [], []

    # Inference only: no autograd graph is needed, which saves memory.
    with torch.no_grad():
        for iteration, (tx, ty) in tqdm(enumerate(dataset), total=len(dataset), desc="{}".format(msg)):

            inputs = tx.to(device)
            targets = ty.to(device)
            out = net(inputs)
            ty_prob = F.softmax(out, 1) # probabilities
            y_probs.append(ty_prob.detach().cpu().numpy())
            y_trues.append(targets.detach().cpu().numpy())

    return np.concatenate(y_probs, 0), np.concatenate(y_trues, 0).reshape(-1, 1)
###############################################################################################################
def save(net, path):
    """
    Write the model's ``state_dict()`` to ``path`` with ``torch.save``.
    """
    torch.save(net.state_dict(), path)
#################################################################################################################
def get_metrics(cm, list_metrics):
    """Compute metrics from a confusion matrix (cm)

    cm: sklearn confusion matrix (rows are true classes, columns are predictions)
    list_metrics: names to compute, among 'accuracy', 'pres_0', 'pres_1',
        'recall_0' and 'recall_1'; other names are ignored.
    returns:
    dict: {metric_name: score}; a metric whose denominator is zero is reported
        as 0 (this now includes accuracy on an empty matrix).
    """
    dic_metrics = {}
    total = np.sum(cm)

    if 'accuracy' in list_metrics:
        correct = np.sum(np.diag(cm))
        dic_metrics['accuracy'] = correct/total if total > 0 else 0

    # precision divides by the column sum (everything predicted as the class),
    # recall by the row sum (everything that truly is the class)
    denominators = (
        ('pres', lambda cls: cm[:, cls].sum()),
        ('recall', lambda cls: cm[cls, :].sum()),
    )
    for prefix, denominator in denominators:
        for cls in (0, 1):
            name = '{}_{}'.format(prefix, cls)
            if name in list_metrics:
                num = cm[cls, cls]
                den = denominator(cls)
                dic_metrics[name] = num/den if den > 0 else 0

    return dic_metrics


# Main

In [None]:
# Parse the run options; `opt` is also read as a global by TupleLoader.
opt = get_args()
print("parameters: {}".format(vars(opt)))
    
# Make sure the output and data directories exist.
os.makedirs(opt.model_folder, exist_ok=True)
os.makedirs(opt.data_folder, exist_ok=True)

# load_datasets returns None for an unknown name, in which case the attribute
# access below fails.
dataset = load_datasets(opt.dataset)
dataset_name = dataset.data_name
n_classes = dataset.n_classes
print("dataset: {}, n_classes: {}".format(dataset_name, n_classes))

# Locations of the vectorised train/test sets (LMDB environments).
tr_path =  "{}/train.lmdb".format(opt.data_folder)
te_path = "{}/test.lmdb".format(opt.data_folder)
    
# check if datasets exist
all_exist = True if (os.path.exists(tr_path) and os.path.exists(te_path)) else False
# NOTE(review): this override discards the existence check above, so the LMDB
# files are rebuilt on every run. Presumably intentional because the cache path
# does not depend on the preprocessing type — confirm before removing.
all_exist=False
preprocessor = Preprocessing(opt.preproces_type)
vectorizer = CharVectorizer(maxlen=opt.maxlen, padding='post', truncating='pre')
# Number of keys in the character dictionary; used to size the embedding table.
n_tokens = len(vectorizer.char_dict)

if not all_exist:
    print("Creating datasets")
    # First pass over both files only to count the samples (for the progress bars).
    # The sentences are materialised in lists and deleted right after counting.
    tr_sentences = [txt for txt,lab in tqdm(dataset.load_train_data(), desc="counting train samples")]
    te_sentences = [txt for txt,lab in tqdm(dataset.load_test_data(), desc="counting test samples")]
            
    n_tr_samples = len(tr_sentences)
    n_te_samples = len(te_sentences)
    del tr_sentences
    del te_sentences

    print("[{}/{}] train/test samples".format(n_tr_samples, n_te_samples))

    ###################
    # transform train #
    ###################
    # map_size is the maximum size of the LMDB environment (1 TiB here).
    with lmdb.open(tr_path, map_size=1099511627776) as env:
        # All samples are written inside a single write transaction.
        with env.begin(write=True) as txn:
            for i, (sentence, label) in enumerate(tqdm(dataset.load_train_data(), desc="transform train...", total= n_tr_samples)):

                # preprocess the text, then map it to a fixed-length list of character indices
                xtxt = vectorizer.transform(preprocessor.transform([sentence]))[0]
                lab = label

                # zero-padded keys; TupleLoader reads entries back with the same format
                txt_key = 'txt-%09d' % i
                lab_key = 'lab-%09d' % i
                    
                txn.put(lab_key.encode(), list_to_bytes([lab]))
                txn.put(txt_key.encode(), list_to_bytes(xtxt))

            # i is the last index written, so i+1 is the sample count
            # (i is undefined if the file yielded no rows)
            txn.put('nsamples'.encode(), list_to_bytes([i+1]))

    ##################
    # transform test #
    ##################
    # Same procedure as for the training set.
    with lmdb.open(te_path, map_size=1099511627776) as env:  #
        with env.begin(write=True) as txn:
            for i, (sentence, label) in enumerate(tqdm(dataset.load_test_data(), desc="transform test...", total= n_te_samples)):

                xtxt = vectorizer.transform(preprocessor.transform([sentence]))[0]
                lab = label

                txt_key = 'txt-%09d' % i
                lab_key = 'lab-%09d' % i
                    
                txn.put(lab_key.encode(), list_to_bytes([lab]))
                txn.put(txt_key.encode(), list_to_bytes(xtxt))

            txn.put('nsamples'.encode(), list_to_bytes([i+1]))

                
# Batch loaders over the LMDB-backed datasets; only the training set is shuffled.
tr_loader = DataLoader(TupleLoader(tr_path), batch_size=opt.batch_size, shuffle=True, num_workers=opt.nthreads, pin_memory=True)
te_loader = DataLoader(TupleLoader(te_path), batch_size=opt.batch_size, shuffle=False, num_workers=opt.nthreads, pin_memory=False) #num_workers=opt.nthreads

# select cpu or gpu
device = get_device()
# Metrics reported on the progress bar; read as a global inside train().
list_metrics = ['accuracy']


print("Creating model...")
# The embedding table is sized one larger than the character dictionary.
net = VDCNN(n_classes=n_classes, num_embedding=int(n_tokens + 1), embedding_dim=16, depth=opt.depth, n_fc_neurons=2048, shortcut=opt.shortcut)
criterion = torch.nn.CrossEntropyLoss()
net.to(device)

assert opt.solver in ['sgd', 'adam']
if opt.solver == 'sgd':
    print(" - optimizer: sgd")
    optimizer = torch.optim.SGD(net.parameters(), lr = opt.lr, momentum=opt.momentum)
elif opt.solver == 'adam':
    print(" - optimizer: adam")
    optimizer = torch.optim.Adam(net.parameters(), lr = opt.lr)    
        
# Optional step scheduler: the learning rate is multiplied by opt.gamma every
# opt.lr_halve_interval scheduler steps (train() steps it once per call).
scheduler = None
if opt.lr_halve_interval and  opt.lr_halve_interval > 0:
    print(" - lr scheduler: {}".format(opt.lr_halve_interval))
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, opt.lr_halve_interval, gamma=opt.gamma, last_epoch=-1)
        
for epoch in range(1, opt.epochs + 1):
    # one optimisation pass over the training set, then an evaluation pass over the test set
    train(epoch,net, tr_loader, device, msg="training", optimize=True, optimizer=optimizer, scheduler=scheduler, criterion=criterion)
    train(epoch,net, te_loader, device, msg="testing ", criterion=criterion)

    # periodic snapshot of the model weights
    if (epoch % opt.snapshot_interval == 0) and (epoch > 0):
        path = "{}/model_epoch_{}".format(opt.model_folder,epoch)
        print("snapshot of model saved as {}".format(path))
        save(net, path=path)


# Final snapshot after the last epoch. When opt.epochs is a multiple of
# opt.snapshot_interval this writes the same file the loop already saved.
if opt.epochs > 0:
    path = "{}/model_epoch_{}".format(opt.model_folder,opt.epochs)
    print("snapshot of model saved as {}".format(path))
    save(net, path=path)

