In [88]:
from models.english.LSTM import model
from utilities.settings import Params, Paths
import torch.nn as nn

In [48]:
# Project-wide settings holders; both classes are imported from utilities.settings.
paths = Paths()
params = Params()

In [49]:
import os

# Root of the local LePetitPrince checkout. Every other location is derived from
# it, so relocating the project only requires editing this single line.
paths.path2root = '/Users/alexpsq/Code/NeuroSpin/LePetitPrince'
paths.path2data = os.path.join(paths.path2root, 'data')
paths.path2derivatives = os.path.join(paths.path2root, 'derivatives')
paths.path2code = os.path.join(paths.path2root, 'code')
paths.path2paradigm = os.path.join(paths.path2root, 'paradigm')
paths.path2oldstuff = os.path.join(paths.path2root, 'oldstuff')
# The test folder lives inside the "oldstuff" folder.
paths.path2test = os.path.join(paths.path2oldstuff, 'test')

In [50]:
# Display every attribute currently set on `paths` as a dict (sanity check of the configuration).
vars(paths)

{'path2root': '/Users/alexpsq/Code/NeuroSpin/LePetitPrince',
 'path2data': '/Users/alexpsq/Code/NeuroSpin/LePetitPrince/data',
 'path2derivatives': '/Users/alexpsq/Code/NeuroSpin/LePetitPrince/derivatives',
 'path2code': '/Users/alexpsq/Code/NeuroSpin/LePetitPrince/code',
 'path2paradigm': '/Users/alexpsq/Code/NeuroSpin/LePetitPrince/paradigm',
 'path2oldstuff': '/Users/alexpsq/Code/NeuroSpin/LePetitPrince/oldstuff',
 'path2test': '/Users/alexpsq/Code/NeuroSpin/LePetitPrince/oldstuff/test'}

## Dictionary and Corpus

In [51]:
import os
import torch
from collections import defaultdict
import logging
from tqdm import tqdm

class Dictionary(object):
    """Word <-> index vocabulary, cached on disk as ``<path>/vocab.txt``.

    If ``vocab.txt`` exists, its whitespace-separated words are loaded in
    order (the position of a word is its index). Otherwise the vocabulary is
    built from ``<path>/train.txt`` and written to ``vocab.txt``, one word
    per line.

    Attributes:
        word2idx: dict mapping a word to its index.
        idx2word: list mapping an index to its word.
        language: language identifier forwarded to ``tokenize``.
        word2freq: number of times each word went through ``add_word``
            (stays empty when the vocabulary is loaded from ``vocab.txt``).
        vocab_file_exists: True when the vocabulary was loaded from an
            existing ``vocab.txt``, False when it had to be created.
    """

    def __init__(self, path, language):
        """Load the vocabulary stored under ``path`` or build it from ``train.txt``.

        Args:
            path: folder containing ``vocab.txt`` and/or ``train.txt``.
            language: language identifier forwarded to ``tokenize``.
        """
        self.word2idx = {}
        self.idx2word = []
        self.language = language
        self.word2freq = defaultdict(int)
        # Defined on both code paths so the flag can always be read safely.
        self.vocab_file_exists = False

        vocab_path = os.path.join(path, 'vocab.txt')
        try:
            with open(vocab_path, encoding="utf8") as vocab_file:
                words = vocab_file.read().split()
        except FileNotFoundError:
            logging.info("Vocab file not found, creating new vocab file.")
            self.create_vocab(os.path.join(path, 'train.txt'))
            # Written as UTF-8 because that is the encoding used to read it back.
            with open(vocab_path, "w", encoding="utf8") as vocab_file:
                vocab_file.write("\n".join(self.idx2word))
        else:
            self.word2idx = {w: i for i, w in enumerate(words)}
            self.idx2word = list(words)
            self.vocab_file_exists = True


    def add_word(self, word):
        """Count one occurrence of ``word`` and give it the next index if it is new."""
        self.word2freq[word] += 1
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1


    def __len__(self):
        """Return the number of distinct words in the vocabulary."""
        return len(self.idx2word)


    def create_vocab(self, path):
        """Add every token of the text file at ``path``, then the ``'<unk>'`` token.

        The file is tokenized with ``train=True``, i.e. without preprocessing.
        """
        iterator = tokenize(path, self.language, train=True)
        for item in tqdm(iterator):
            self.add_word(item)
        self.add_word('<unk>')



class Corpus(object):
    """Train / valid / test splits of a corpus, encoded as tensors of word indices.

    The splits are exposed as ``self.train``, ``self.valid`` and ``self.test``.
    They are read from ``<path>/<split>.pkl`` when all three caches can be
    opened; otherwise every split is recomputed from ``<path>/<split>.txt``
    and the three caches are (re)written.
    """

    def __init__(self, path, language):
        """Build the dictionary for ``path`` and load or compute the three splits.

        Args:
            path: folder holding the ``.txt`` sources and the ``.pkl`` caches.
            language: language identifier forwarded to the tokenizer.
        """
        print('Building dictionary...')
        self.dictionary = Dictionary(path, language)
        print('Dictionary built.')

        splits = ('train', 'valid', 'test')
        text_files = {name: os.path.join(path, name + '.txt') for name in splits}
        cache_files = {name: os.path.join(path, name + '.pkl') for name in splits}

        try:
            for name in splits:
                with open(cache_files[name], 'rb') as handle:
                    setattr(self, name, torch.load(handle))

        except FileNotFoundError:
            # A single missing cache triggers the recomputation of all splits.
            logging.info("Tensor files not found, creating new tensor files.")
            for name in splits:
                print('Computing {} tensor...'.format(name))
                tokens = tokenize(text_files[name], language, self.dictionary, train=True)
                setattr(self, name, create_tokenized_tensor(tokens, self.dictionary))
                print('{} tensor computed.'.format(name.capitalize()))

            for name in splits:
                with open(cache_files[name], 'wb') as handle:
                    torch.save(getattr(self, name), handle)
        



def create_tokenized_tensor(iterator, dictionary):
    """Encode words as a 1-D int64 tensor of vocabulary indices.

    Args:
        iterator: iterable of words. Lists are used directly; iterables
            without a length (e.g. generators) are materialised first.
        dictionary: object exposing a ``word2idx`` mapping from word to index.

    Returns:
        A ``torch.long`` tensor with one index per input word. Words missing
        from ``dictionary.word2idx`` are encoded with the index of ``'<unk>'``.

    Raises:
        KeyError: an unknown word is met and ``'<unk>'`` is not in the mapping.
    """
    if not hasattr(iterator, '__len__'):
        # The output tensor is pre-allocated, so the number of words must be known.
        iterator = list(iterator)
    word2idx = dictionary.word2idx
    tensor = torch.empty(len(iterator), dtype=torch.long)
    for position, item in enumerate(tqdm(iterator)):
        index = word2idx.get(item)
        if index is None:
            # Looked up lazily: '<unk>' is only required when actually needed.
            index = word2idx['<unk>']
        tensor[position] = index
    return tensor

## Tokenizer

In [52]:
from nltk.tokenize import sent_tokenize 
from nltk.tokenize import word_tokenize
import os
import re
import inflect
from tqdm import tqdm


# Per-language literal substitutions, keyed by language name.
# preprocess() applies them with str.replace, in insertion order, before
# punctuation is split off (so hyphenated/apostrophe forms stay a single token).
special_words = {
    'english': {
        'grown-ups': 'grownups',
        'grown-up': 'grownup',
        'hasn\'t': 'hasnt',
        # Same word written with a typographic quote (U+2018) instead of an apostrophe.
        'hasn‘t': 'hasnt'
    },
    # No substitution defined for French.
    'french': {

    }
}


def tokenize(path, language, vocab=None, path_like=True, train=False):
    """Split a text into whitespace-separated tokens, mapping unknown words to '<unk>'.

    Args:
        path: path of a UTF-8 text file when ``path_like`` is True, otherwise
            the text itself.
        language: key of ``special_words`` used by ``preprocess``.
        vocab: optional vocabulary forwarded to ``unk_transform``; when falsy,
            tokens are kept as they are (except the literal ``'unk'``).
        path_like: whether ``path`` designates a file to read.
        train: when True the text is used as is; when False it first goes
            through ``preprocess``.

    Returns:
        The list of tokens (case is preserved).

    Raises:
        AssertionError: ``path_like`` is True and the file does not exist.
    """
    print('Tokenizing...')
    if path_like:
        print(path)
        assert os.path.exists(path)
        # Context manager so the file handle is released as soon as it is read.
        with open(path, 'r', encoding='utf8') as source:
            raw_text = source.read()
    else:
        raw_text = path

    if not train:
        print('Preprocessing...')
        text = preprocess(raw_text, special_words, language)
        print('Preprocessed.')
    else:
        text = raw_text
    iterator = [unk_transform(item, vocab) for item in tqdm(text.split())] # vocab words not lowered
    print('Tokenized.')
    return iterator


def unk_transform(word, vocab=None):
    """Return ``word`` if it is known to ``vocab``, else the ``'<unk>'`` token.

    Args:
        word: token to check.
        vocab: optional vocabulary exposing ``word2idx`` (dict) and/or
            ``idx2word`` (list). When falsy, no filtering is applied.

    Returns:
        ``'<unk>'`` for the literal ``'unk'`` (the form ``preprocess`` gives
        to ``'<unk>'``) and for out-of-vocabulary words; ``word`` otherwise.
    """
    if word == 'unk':
        return '<unk>'
    if not vocab:
        return word
    # Prefer the dict: hash lookup instead of scanning the whole word list
    # for every token. Fall back to the list for objects without a dict.
    known_words = getattr(vocab, 'word2idx', None)
    if known_words is None:
        known_words = vocab.idx2word
    return word if word in known_words else '<unk>'


def preprocess(text, special_words, language):
    """Normalise raw text so that splitting on whitespace yields the tokens.

    Steps, in order: remove newlines, rewrite ``'<unk>'`` as ``'unk'``, apply
    the ``special_words[language]`` substitutions, spell out every run of
    digits, surround punctuation marks with spaces, and restore ``'...'``.

    Args:
        text: raw text.
        special_words: dict mapping a language to ``{pattern: replacement}``.
        language: key selecting the substitutions to apply.

    Returns:
        The normalised text (still a single string).
    """
    # NOTE(review): newlines are removed, not replaced by a space, so words on
    # either side of a line break are glued together unless a space is present.
    text = text.replace('\n', '')
    text = text.replace('<unk>', 'unk')
    for word, replacement in special_words[language].items():
        text = text.replace(word, replacement)
    transf = inflect.engine()
    # One regex pass converts each maximal run of digits exactly once. Chained
    # str.replace calls corrupted longer numbers ('1' then '12' -> 'one2').
    text = re.sub(r'\d+', lambda match: transf.number_to_words(match.group()), text)
    punctuation = ['.', '\'', ',', ';', ':', '!', '?', '/', '-', '"', '‘', '’', '(', ')', '{', '}', '[', ']', '`', '“', '”', '—']
    # Punctuation is kept, as stand-alone tokens.
    for item in punctuation:
        text = text.replace(item, ' '+ item + ' ')
    # The loop above splits an ellipsis into three spaced dots; join it back.
    text = text.replace('.  .  .', '...')
    return text

## Model analysis - Generation

In [53]:
def generate(model, input, hidden, temperature, corpus, log_interval):
    """Sample the next word from the model and feed it back into ``input``.

    Args:
        model: callable returning ``(output, hidden)`` for ``(input, hidden)``.
        input: tensor holding the current word index; overwritten in place
            with the sampled index.
        hidden: recurrent state handed to the model.
        temperature: divisor applied to the model output before exponentiation.
        corpus: object whose ``dictionary.idx2word`` maps indices to words.
        log_interval: unused here.

    Returns:
        ``(word, input)``: the sampled word and the updated input tensor.

    NOTE(review): the hidden state returned by the model is not handed back
    to the caller, so successive calls reuse whatever ``hidden`` they pass in.
    """
    logits, _next_hidden = model(input, hidden)
    sampling_weights = logits.squeeze().div(temperature).exp().cpu()
    drawn = torch.multinomial(sampling_weights, 1)
    next_idx = drawn[0]
    input.fill_(next_idx)
    return corpus.dictionary.idx2word[next_idx], input


In [54]:
def check_lstm(model, data, save_path, language, words2generate=1000, temperature=1.0, log_interval=100, saving=False,cuda=False, seed=1111): 
    """Generate text with a trained language model, as a qualitative check.

    Args:
        model: trained model exposing ``to``, ``eval`` and ``init_hidden``.
        data: folder handed to ``Corpus`` (vocabulary and tensor caches).
        save_path: file receiving the generated words when ``saving`` is True.
        language: language identifier handed to ``Corpus``.
        words2generate: number of words to generate.
        temperature: sampling temperature forwarded to ``generate``.
        log_interval: a progress line is printed every ``log_interval`` words.
        saving: when True, words are written to ``save_path`` (20 per line).
            When False the file is not opened at all.
        cuda: run on the GPU if one is available.
        seed: seed given to ``torch.manual_seed``.

    Returns:
        The generated words as one string, each word preceded by a space.
    """
    torch.manual_seed(seed)
    cuda_available = torch.cuda.is_available()
    if cuda_available and not cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

    device = torch.device("cuda" if (cuda and cuda_available) else "cpu")

    # The model must live on the same device as the input built below.
    model = model.to(device)
    model.eval()

    corpus = Corpus(data, language)
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(1)
    input = torch.randint(ntokens, (1,1), dtype=torch.long).to(device)

    words = []
    # Only touch the output file when the caller asked for it.
    outf = open(save_path, 'w', encoding='utf8') if saving else None
    try:
        with torch.no_grad(): # no tracking history
            # First draw is discarded: it only replaces the random seed word.
            word, input = generate(model, input, hidden, temperature, corpus, log_interval)
            for i in range(words2generate):
                word, input = generate(model, input, hidden, temperature, corpus, log_interval)
                words.append(str(word))
                if outf is not None:
                    outf.write(word + ('\n' if i % 20 == 19 else ' '))
                if i % log_interval == 0:
                    print('| Generated {}/{} words'.format(i, words2generate))
    finally:
        if outf is not None:
            outf.close()

    result = ''.join(' ' + w for w in words)
    print(result)
    return result

In [55]:
# Dataset name and language: both are passed to load(), which uses them to build the checkpoint file name.
data_name = 'wiki_kristina'
language = 'english'

In [56]:
# Architecture descriptor handed to load(), which calls its __name__() to build the checkpoint file name.
# NOTE(review): the second argument (5) looks like a placeholder vocabulary size - the restored
# model reports ntoken=50001; confirm the value plays no role in the generated name.
base_model = model.RNNModel('LSTM', 5,200,150,2,dropout=0.1)   

In [79]:
def load(model, data_name, language):
    """Restore a trained model from ``<derivatives>/fMRI/models/<language>/``.

    The file name is ``<model name without dots>_<data_name>_<language>.pt``,
    where the model name comes from ``model.__name__()``. The resolved path
    is printed before loading.

    Args:
        model: object whose ``__name__()`` describes the architecture.
        data_name: name of the training dataset.
        language: language of the model (also the name of the sub-folder).

    Returns:
        The object stored in the checkpoint, mapped to the CPU.

    Raises:
        AssertionError: the checkpoint file does not exist.

    NOTE(review): ``torch.load`` unpickles the file; only use it on
    checkpoints coming from a trusted source.
    """
    model_tag = model.__name__().replace('.', '')
    filename = '_'.join([model_tag, data_name, language]) + '.pt'
    path = os.path.join(paths.path2derivatives, 'fMRI/models', language, filename)
    print(path)
    assert os.path.exists(path)
    with open(path, 'rb') as checkpoint:
        restored = torch.load(checkpoint, map_location='cpu')
    return restored
    
# Restore the trained checkpoint described by base_model; load() prints the resolved file path.
model2load2layer = load(base_model, data_name, language)

/Users/alexpsq/Code/NeuroSpin/LePetitPrince/derivatives/fMRI/models/english/LSTM_embedding-size_200_nhid_150_nlayers_2_dropout_01_wiki_kristina_english.pt




In [61]:
# Keyword arguments forwarded to check_lstm(**param_dict); the 'data' and 'language'
# entries are also read by test().
# 'cuda': True only takes effect when a CUDA device is available (check_lstm falls back to the CPU).
# NOTE(review): 'data' embeds a machine-specific absolute location.
param_dict = {'model':model2load2layer, 
              'cuda':True, 
              'save_path':os.path.join(paths.path2derivatives, 'fMRI/models/english/LSTM_embedding-size_200_nhid_150_nlayers_2_dropout_01_wiki_kristina_english__generated.text'), 
              'data': '/Users/alexpsq/Code/NeuroSpin/LePetitPrince/data/text/english/lstm_training', 
              'language':'english'}                                  


In [62]:
# Generate a sample with the remaining check_lstm defaults (1000 words, temperature 1.0, saving=False).
result = check_lstm(**param_dict)

Building dictionary...
Dictionary built.
| Generated 0/1000 words
| Generated 100/1000 words
| Generated 200/1000 words
| Generated 300/1000 words
| Generated 400/1000 words
| Generated 500/1000 words
| Generated 600/1000 words
| Generated 700/1000 words
| Generated 800/1000 words
| Generated 900/1000 words
 a rate of <unk> — still of telekinesis . <eos> of outfit Cumberland province strong personal corpse systems in this disappears the ability to write the state The Wealth date of <unk> <unk> poor composition . <eos> at the <unk> of the symbol of Annapolis , as they created an associate charitable treatment . <eos> pleased their equipment . <eos> five festivals in his dream brings personal fit at the " sled screamed with General Financial Card , <unk> de Valera , and is held strong , the Second was a <unk> , the king , who may manage to the track would not the separation , following his maternal aunts , Bill <unk> has other series , companion , " <unk> , and Royal House of a general d

In [84]:
# Show the generated sample without the '<eos>' markers.
print(result.replace('<eos>', ''))

 a rate of <unk> — still of telekinesis .  of outfit Cumberland province strong personal corpse systems in this disappears the ability to write the state The Wealth date of <unk> <unk> poor composition .  at the <unk> of the symbol of Annapolis , as they created an associate charitable treatment .  pleased their equipment .  five festivals in his dream brings personal fit at the " sled screamed with General Financial Card , <unk> de Valera , and is held strong , the Second was a <unk> , the king , who may manage to the track would not the separation , following his maternal aunts , Bill <unk> has other series , companion , " <unk> , and Royal House of a general description of Glory condensation .  the family use in July 6 series .  <unk> <unk> <unk> , the state trade shows and a trigger complement ( along with theory of economics , but we first , Reagan <unk> described , in a more likely to give the various existing venture into <unk> , if the rifle attempting to be prescribed <unk> Da

## Model evaluation

In [80]:
def get_batch(source, i):
    """Cut an (input, target) pair out of ``source`` starting at row ``i``.

    The window holds at most ``params.pref.bptt`` rows and is shortened near
    the end of ``source`` so that the targets never run past the last row.

    Args:
        source: tensor indexed along its first dimension.
        i: index of the first input row.

    Returns:
        ``(data, target)``: the input rows, and the rows shifted by one
        position flattened to 1-D.
    """
    window = min(params.pref.bptt, len(source) - 1 - i)
    stop = i + window
    inputs = source[i:stop]
    # Targets are the inputs shifted one step ahead.
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.view(-1)


def batchify(data, bsz, device):
    """Arrange a 1-D sequence into ``bsz`` parallel columns on ``device``.

    Trailing elements that do not fill a complete row of ``bsz`` values are
    dropped.

    Args:
        data: tensor to split along its first dimension.
        bsz: number of columns (batch size).
        device: device the result is moved to.

    Returns:
        A contiguous tensor with ``bsz`` columns; column ``k`` holds the
        ``k``-th consecutive chunk of ``data``.
    """
    rows_per_column = data.size(0) // bsz
    # Keep only the leading part that divides evenly into bsz chunks.
    usable = data.narrow(0, 0, rows_per_column * bsz)
    # One chunk per row, then transpose so that each chunk becomes a column.
    columns = usable.view(bsz, -1).t().contiguous()
    return columns.to(device)


def save(model, data_name, language):
    """Serialise ``model`` under ``<derivatives>/fMRI/models/<language>/``.

    The file name is ``<model name without dots>_<data_name>_<language>.pt``,
    i.e. exactly the name ``load`` looks for, so that a saved model can be
    restored. The target folder is created when missing.

    Args:
        model: model to save; its ``__name__()`` describes the architecture.
        data_name: name of the training dataset.
        language: language of the model (also the name of the sub-folder).
    """
    # Dots are stripped from the model name, as load() does when it rebuilds the file name.
    model_tag = model.__name__().replace('.', '')
    filename = '_'.join([model_tag, data_name, language]) + '.pt'
    directory = os.path.join(paths.path2derivatives, 'fMRI/models', language)
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, filename), 'wb') as f:
        torch.save(model, f)

    
def repackage_hidden(h):
    """Return ``h`` cut off from its autograd history.

    A tensor is detached directly; any other value is treated as an iterable
    of hidden states and processed recursively, yielding a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [89]:
def evaluate(model, criterion, ntokens, data_source, eval_batch_size):
    """Compute the average loss of ``model`` over ``data_source``.

    The data is walked in windows of ``params.pref.bptt`` rows; each window's
    loss is weighted by its number of rows and the hidden state is carried
    (detached) from one window to the next.

    Args:
        model: model exposing ``eval``, ``init_hidden`` and a forward call
            returning ``(output, hidden)``.
        criterion: loss called as ``criterion(output_flat, targets)``.
        ntokens: size of the last dimension of the flattened output.
        data_source: batchified tensor to evaluate on.
        eval_batch_size: batch size used to initialise the hidden state.

    Returns:
        The summed weighted loss divided by ``len(data_source) - 1``.
    """
    # Evaluation mode (disables dropout).
    model.eval()
    loss_sum = 0.
    hidden = model.init_hidden(eval_batch_size)
    last_start = data_source.size(0) - 1
    with torch.no_grad():
        for start in tqdm(range(0, last_start, params.pref.bptt)):
            batch, targets = get_batch(data_source, start)
            logits, hidden = model(batch, hidden)
            batch_loss = criterion(logits.view(-1, ntokens), targets).item()
            loss_sum += len(batch) * batch_loss
            hidden = repackage_hidden(hidden)
    return loss_sum / (len(data_source) - 1)

In [90]:
import math

def test(model):
    """Reload the trained weights matching ``model`` and report test loss and perplexity.

    The corpus location and language come from ``param_dict``; the checkpoint
    is resolved by ``load`` from ``model``, ``data_name`` and ``language``.

    Args:
        model: model describing the architecture to evaluate; it is printed,
            then replaced by the checkpoint restored with ``load``.
    """
    torch.manual_seed(params.pref.seed) # setting seed for reproducibility
    cuda = bool(params.cuda) and torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")
    corpus = Corpus(param_dict['data'], param_dict['language'])
    # Batchified with the batch size evaluate() uses to initialise the hidden
    # state; a different size would make the shapes disagree.
    test_data = batchify(corpus.test, params.pref.eval_batch_size, device)

    ntokens = len(corpus.dictionary)

    model = model.to(device)
    print(model)

    criterion = nn.CrossEntropyLoss()

    # load() maps the checkpoint to the CPU, so it has to be moved to the
    # device holding the data before evaluation.
    model = load(model, data_name, language).to(device)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

    # Run on test data.
    print('evaluation...')
    test_loss = evaluate(model, criterion, ntokens, test_data, params.pref.eval_batch_size)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)

In [91]:
# (embedding size, hidden size, number of layers) of each trained checkpoint to evaluate.
for ninp, nhid, nlayers in [(200, 100, 3), (200, 300, 1), (300, 300, 2), (300, 300, 3)]:
    test(load(model.RNNModel('LSTM', 5, ninp, nhid, nlayers, dropout=0.1), data_name, language))

/Users/alexpsq/Code/NeuroSpin/LePetitPrince/derivatives/fMRI/models/english/LSTM_embedding-size_200_nhid_100_nlayers_3_dropout_01_wiki_kristina_english.pt
Building dictionary...
Dictionary built.


  0%|          | 0/14810 [00:00<?, ?it/s]

RNNModel(
  (drop): Dropout(p=0.1)
  (encoder): Embedding(50001, 200)
  (rnn): LSTM(200, 100, num_layers=3, dropout=0.1)
  (decoder): Linear(in_features=100, out_features=50001, bias=True)
)
/Users/alexpsq/Code/NeuroSpin/LePetitPrince/derivatives/fMRI/models/english/LSTM_embedding-size_200_nhid_100_nlayers_3_dropout_01_wiki_kristina_english.pt
evaluation...


  0%|          | 41/14810 [00:11<1:11:35,  3.44it/s]

KeyboardInterrupt: 

In [75]:
# Inspect the attributes stored on the restored model (sub-modules, constructor parameters, ...).
vars(model2load2layer)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x10bdcd828>,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('drop', Dropout(p=0.1)),
              ('encoder', Embedding(50001, 200)),
              ('rnn', LSTM(200, 150, num_layers=2, dropout=0.1)),
              ('decoder',
               Linear(in_features=150, out_features=50001, bias=True))]),
 'training': False,
 'backup': <bound method RNNBase.forward of LSTM(200, 150, num_layers=2, dropout=0.1)>,
 'vocab': <LSTM.data.Dictionary at 0x1a270ca978>,
 'param': {'rnn_type': 'LSTM',
  'ntoken': 50001,
  'ninp': 200,
  'nhid': 150,
  'nlayers': 2,
  'dropout': 0.1,
  'tie_weights': False},
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict()}